[CI][XPU] enable unit test for XPU device #2814
`.github/scripts/ci_test_xpu.sh` (new file, +9 lines):

```bash
#!/bin/bash

python3 -m pip install torch torchvision torchaudio pytorch-triton-xpu --index-url https://download.pytorch.org/whl/nightly/xpu --force-reinstall --no-cache-dir
python3 setup.py install

pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0'

cd test/quantization
pytest -v -s *.py
```
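A note on the last line: the shell expands `*.py` before pytest starts, so pytest receives an explicit, sorted list of every Python file in `test/quantization`. A minimal sketch of that expansion using Python's `glob` (the file names here are hypothetical, not from the repo):

```python
import glob
import os
import tempfile

# Throwaway directory with a few hypothetical test files.
tmp = tempfile.mkdtemp()
for name in ("test_a.py", "test_b.py", "notes.txt"):
    open(os.path.join(tmp, name), "w").close()

# Equivalent of the shell glob *.py: only Python files match, and shells
# sort pathname-expansion results, so we sort here too.
matched = sorted(os.path.basename(p) for p in glob.glob(os.path.join(tmp, "*.py")))
print(matched)  # ['test_a.py', 'test_b.py']
```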
The new `xpu-test` workflow (+168 lines):

```yaml
# TODO: this looks sort of similar to _linux-test, but there are like a dozen
# places where you would have to insert an if statement. Probably it's better to
# just use a different workflow altogether

name: xpu-test

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - ciflow/xpu/*

concurrency:
  group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true
```
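The concurrency group cancels in-flight runs that share a ref, except on `main`, where `github.run_number` makes every group unique so pushes to main are never cancelled. A sketch of the same selection logic in Python (the function and sample values are illustrative, not part of the workflow):

```python
def concurrency_group(workflow: str, ref: str, run_number: int) -> str:
    # Mirrors: github.ref == 'refs/heads/main' && github.run_number || github.ref
    # (a && b || c evaluates to b when a is truthy, else c)
    suffix = run_number if ref == "refs/heads/main" else ref
    return f"xpu_ci_test-{workflow}-{suffix}"

# On main, each run gets its own group, so nothing is cancelled:
print(concurrency_group("xpu-test", "refs/heads/main", 42))
# xpu_ci_test-xpu-test-42

# On a PR ref, reruns share a group and earlier runs are cancelled:
print(concurrency_group("xpu-test", "refs/pull/2814/merge", 43))
# xpu_ci_test-xpu-test-refs/pull/2814/merge
```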
```yaml
jobs:
  test:
    # Don't run on forked repos or empty test matrix
    # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
    timeout-minutes: 60
    runs-on: linux.idc.xpu
    env:
      DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-1-py3
      TEST_COMMAND: .github/scripts/ci_test_xpu.sh
      PYTORCH_RETRY_TEST_CASES: 1
      PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
      XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
    steps:
      # [see note: pytorch repo ref]
      - name: Checkout Torchao
        uses: actions/checkout@v4

      - name: Setup XPU
        uses: ./.github/workflows/xpu-action.yml

      - name: configure aws credentials
        id: aws_creds
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
          aws-region: us-east-1

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1

      - name: Clean all stopped docker containers
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If another runner is already pruning on this node, skip.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi
```
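The prune guard relies on a subtlety of `ps -ef | grep -c`: the pipeline's own `grep` appears in the `ps` listing with the pattern on its command line, so a count of exactly 1 means no *other* `docker container prune` is running on the node. A sketch of that counting logic over hypothetical `ps -ef` lines:

```python
# Hypothetical `ps -ef` lines; the grep in the pipeline always sees itself.
ps_lines_idle = [
    "root   101    1  0 10:00 ?  sshd",
    "user   202    5  0 10:01 ?  grep -c docker container prune",  # pipeline's own grep
]
ps_lines_busy = ps_lines_idle + [
    "user   303    7  0 10:01 ?  docker container prune -f",  # another runner pruning
]

def should_prune(ps_lines):
    # Equivalent of: nprune=$(ps -ef | grep -c "docker container prune")
    #                [[ $nprune -eq 1 ]]
    nprune = sum(1 for line in ps_lines if "docker container prune" in line)
    return nprune == 1

print(should_prune(ps_lines_idle))  # True: only our own grep matched, safe to prune
print(should_prune(ps_lines_busy))  # False: someone else is pruning, so skip
```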
```yaml
      - name: Runner health check GPU count
        if: always()
        shell: bash
        run: |
          ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true)
          msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
          if [[ $ngpu -eq 0 ]]; then
            echo "Error: Failed to detect any GPUs on the runner"
            echo "$msg"
            exit 1
          fi
```
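The health check counts the lines of `clinfo -l` output that contain `Device`, which on XPU runners corresponds to one line per OpenCL device. A small sketch of the same counting against hypothetical `clinfo -l` output (the device names below are made up for illustration):

```python
# Hypothetical `clinfo -l` output on a two-GPU runner.
clinfo_output = """\
Platform #0: Intel(R) OpenCL Graphics
 +-- Device #0: Intel(R) Data Center GPU
 `-- Device #1: Intel(R) Data Center GPU
"""

# Equivalent of: clinfo -l | grep -c -E 'Device'
ngpu = sum(1 for line in clinfo_output.splitlines() if "Device" in line)
print(ngpu)  # 2
assert ngpu > 0, "Failed to detect any GPUs on the runner"
```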
```yaml
      - name: Use following to pull public copy of the image
        id: print-ghcr-mirror
        shell: bash
        run: |
          echo "docker pull ${DOCKER_IMAGE}"
          docker pull ${DOCKER_IMAGE}

      - name: Test
        id: test
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        timeout-minutes: 60
        run: |
          set -x

          # detached container should get cleaned up by teardown_ec2_linux
          # Used for GPU_FLAG since that doesn't play nice
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e GITHUB_REPOSITORY \
            -e GITHUB_WORKFLOW \
            -e GITHUB_JOB \
            -e GITHUB_RUN_ID \
            -e GITHUB_RUN_NUMBER \
            -e GITHUB_RUN_ATTEMPT \
            -e JOB_ID \
            -e BRANCH \
            -e SHA1 \
            --user $(id -u):$(id -g) \
            --ulimit stack=10485760:83886080 \
            --ulimit core=0 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="8g" \
            --tty \
            --detach \
            --name="${container_name}" \
            --user jenkins \
            --privileged \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          # save container name for later steps
          echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
          # the jenkins user lacks write permission to the mounted workspace;
          # work around this by copying within the container to the jenkins home
          docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}"
```
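`$GITHUB_ENV` is a file, not a variable: appending `KEY=VALUE` lines to it makes those values visible as environment variables in later steps of the same job, which is how `CONTAINER_NAME` reaches the cleanup steps below. A sketch of the write-then-read round trip (the temporary file is a stand-in for the real `$GITHUB_ENV` path the runner provides):

```python
import os
import tempfile

# Stand-in for the $GITHUB_ENV file GitHub Actions provides to each step.
github_env = tempfile.NamedTemporaryFile(mode="w", suffix=".env", delete=False)

# Step 1 appends a KEY=VALUE line, as in:
#   echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
container_name = "a1b2c3d4e5f6"  # hypothetical container ID from docker run
github_env.write(f"CONTAINER_NAME={container_name}\n")
github_env.close()

# Between steps, the runner parses the file into the environment of later steps.
env = {}
with open(github_env.name) as f:
    for line in f:
        key, _, value = line.strip().partition("=")
        env[key] = value
os.unlink(github_env.name)

print(env["CONTAINER_NAME"])  # a1b2c3d4e5f6
```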
```yaml
      - name: Change permissions
        if: ${{ always() && steps.test.conclusion }}
        run: |
          docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"

      - name: Collect backtraces from coredumps (if any)
        if: always()
        run: |
          # shellcheck disable=SC2156
          find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

      - name: Stop container before exit
        if: always()
        run: |
          # Workaround for multiple runners on the same IDC node
          docker stop "${{ env.CONTAINER_NAME }}"

      - name: Store Core dumps on GitHub
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        if: failure()
        with:
          name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*
```
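Both the `find -iname` invocation and the artifact `path` use the pattern `core.[1-9]*`, which matches core files whose suffix (typically the crashing PID) starts with a nonzero digit. A sketch with Python's `fnmatch`, which uses the same shell-glob dialect, applied to hypothetical file names; lowercasing first mimics the case-insensitive `-iname`:

```python
from fnmatch import fnmatch

# Hypothetical file names that might appear in the workspace.
names = ["core.12345", "core.1", "core.0", "core", "score.12345", "Core.987"]

# Case-insensitive match, as with: find -iname "core.[1-9]*"
matched = [n for n in names if fnmatch(n.lower(), "core.[1-9]*")]
print(matched)  # ['core.12345', 'core.1', 'Core.987']
```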
```yaml
      - name: Teardown XPU
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If another runner is already pruning on this node, skip.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi
```

Review comments on the `Teardown XPU` step:

> Can reuse the action in pytorch directly

> yes, this is literally ported from pytorch
Review comment:

> remove pull-request after review