From a736c41187363d39cf72d1b4a3bda5b62554f3a1 Mon Sep 17 00:00:00 2001 From: "Sun, Diwei" Date: Tue, 19 Aug 2025 08:28:02 +0000 Subject: [PATCH 01/25] enable xpu ci test --- .github/workflows/pr-test-xpu.yml | 156 ++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 .github/workflows/pr-test-xpu.yml diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml new file mode 100644 index 0000000000..79621a06d1 --- /dev/null +++ b/.github/workflows/pr-test-xpu.yml @@ -0,0 +1,156 @@ +# TODO: this looks sort of similar to _linux-test, but there are like a dozen +# places where you would have to insert an if statement. Probably it's better to +# just use a different workflow altogether + +name: xpu-test + +on: + push: + branches: + - main + - 'gh/**' + pull_request: + branches: + - main + - 'gh/**' + +concurrency: + group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + cancel-in-progress: true + +jobs: + test: + # Don't run on forked repos or empty test matrix + # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' + timeout-minutes: 60 + runs-on: ao-pvc + env: + DOCKER_IMAGE: ghcr.io/pytorch/ci-image:pytorch-linux-jammy-xpu-2025.1-py3-b388c12018df5d6ce2f94b7fb337fa3729978ab3 + TEST_COMMAND: .github/scripts/ci_test_xpu.sh + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + steps: + # [see note: pytorch repo ref] + - name: Checkout Torchao + uses: actions/checkout@v4 + + - name: Clean all stopped docker containers + if: always() + shell: bash + run: | + # Prune all stopped containers. + # If other runner is pruning on this node, will skip. + nprune=$(ps -ef | grep -c "docker container prune") + if [[ $nprune -eq 1 ]]; then + docker container prune -f + fi + + - name: Runner health check GPU count + if: always() + shell: bash + run: | + ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true) + msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified" + if [[ $ngpu -eq 0 ]]; then + echo "Error: Failed to detect any GPUs on the runner" + echo "$msg" + exit 1 + fi + + - name: Use following to pull public copy of the image + id: print-ghcr-mirror + shell: bash + run: | + echo "docker pull ${DOCKER_IMAGE}" + docker pull ${DOCKER_IMAGE} + + - name: Test + id: test + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_WORKFLOW: ${{ github.workflow }} + GITHUB_JOB: ${{ github.job }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_RUN_NUMBER: ${{ github.run_number }} + GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + timeout-minutes: 60 + run: | + set -x + + # detached container should get cleaned up by teardown_ec2_linux + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e GITHUB_ACTIONS \ + -e GITHUB_REPOSITORY \ + -e GITHUB_WORKFLOW \ + -e GITHUB_JOB \ + -e GITHUB_RUN_ID \ + -e GITHUB_RUN_NUMBER \ + -e GITHUB_RUN_ATTEMPT \ + -e JOB_ID \ + -e BRANCH \ + -e SHA1 \ + --user $(id -u):$(id -g) \ + --ulimit stack=10485760:83886080 \ + --ulimit core=0 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --shm-size="8g" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + --privileged \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + # save container name for later step + echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV" + # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home + docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}" + + - name: Change permissions + if: ${{ always() && steps.test.conclusion }} + run: | + docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" + + - name: Collect backtraces from coredumps (if any) + if: always() + run: | + # shellcheck disable=SC2156 + find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; + + - name: Stop container before exit + if: always() + run: | + # Workaround for multiple runners on same IDC node + docker stop "${{ env.CONTAINER_NAME }}" + + - name: Store Core dumps on GitHub + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + if: failure() + with: + name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} + retention-days: 14 + if-no-files-found: ignore + path: ./**/core.[1-9]* + + - name: Teardown XPU + if: always() + shell: bash + run: | + # Prune all stopped containers. + # If other runner is pruning on this node, will skip. + nprune=$(ps -ef | grep -c "docker container prune") + if [[ $nprune -eq 1 ]]; then + docker container prune -f + fi From 7c96ad46d16039d37a685fa9ed28010d1a3ab0aa Mon Sep 17 00:00:00 2001 From: "Sun, Diwei" Date: Wed, 20 Aug 2025 01:52:54 +0000 Subject: [PATCH 02/25] Revert "enable xpu ci test" This reverts commit a736c41187363d39cf72d1b4a3bda5b62554f3a1. --- .github/workflows/pr-test-xpu.yml | 156 ------------------------------ 1 file changed, 156 deletions(-) delete mode 100644 .github/workflows/pr-test-xpu.yml diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml deleted file mode 100644 index 79621a06d1..0000000000 --- a/.github/workflows/pr-test-xpu.yml +++ /dev/null @@ -1,156 +0,0 @@ -# TODO: this looks sort of similar to _linux-test, but there are like a dozen -# places where you would have to insert an if statement. Probably it's better to -# just use a different workflow altogether - -name: xpu-test - -on: - push: - branches: - - main - - 'gh/**' - pull_request: - branches: - - main - - 'gh/**' - -concurrency: - group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} - cancel-in-progress: true - -jobs: - test: - # Don't run on forked repos or empty test matrix - # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' - timeout-minutes: 60 - runs-on: ao-pvc - env: - DOCKER_IMAGE: ghcr.io/pytorch/ci-image:pytorch-linux-jammy-xpu-2025.1-py3-b388c12018df5d6ce2f94b7fb337fa3729978ab3 - TEST_COMMAND: .github/scripts/ci_test_xpu.sh - PYTORCH_RETRY_TEST_CASES: 1 - PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - steps: - # [see note: pytorch repo ref] - - name: Checkout Torchao - uses: actions/checkout@v4 - - - name: Clean all stopped docker containers - if: always() - shell: bash - run: | - # Prune all stopped containers. - # If other runner is pruning on this node, will skip. - nprune=$(ps -ef | grep -c "docker container prune") - if [[ $nprune -eq 1 ]]; then - docker container prune -f - fi - - - name: Runner health check GPU count - if: always() - shell: bash - run: | - ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true) - msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified" - if [[ $ngpu -eq 0 ]]; then - echo "Error: Failed to detect any GPUs on the runner" - echo "$msg" - exit 1 - fi - - - name: Use following to pull public copy of the image - id: print-ghcr-mirror - shell: bash - run: | - echo "docker pull ${DOCKER_IMAGE}" - docker pull ${DOCKER_IMAGE} - - - name: Test - id: test - env: - BUILD_ENVIRONMENT: ${{ inputs.build-environment }} - PR_NUMBER: ${{ github.event.pull_request.number }} - GITHUB_REPOSITORY: ${{ github.repository }} - GITHUB_WORKFLOW: ${{ github.workflow }} - GITHUB_JOB: ${{ github.job }} - GITHUB_RUN_ID: ${{ github.run_id }} - GITHUB_RUN_NUMBER: ${{ github.run_number }} - GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - timeout-minutes: 60 - run: | - set -x - - # detached container should get cleaned up by teardown_ec2_linux - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e GITHUB_ACTIONS \ - -e GITHUB_REPOSITORY \ - -e GITHUB_WORKFLOW \ - -e GITHUB_JOB \ - -e GITHUB_RUN_ID \ - -e GITHUB_RUN_NUMBER \ - -e GITHUB_RUN_ATTEMPT \ - -e JOB_ID \ - -e BRANCH \ - -e SHA1 \ - --user $(id -u):$(id -g) \ - --ulimit stack=10485760:83886080 \ - --ulimit core=0 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --shm-size="8g" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - --privileged \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - # save container name for later step - echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV" - # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home - docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}" - - - name: Change permissions - if: ${{ always() && steps.test.conclusion }} - run: | - docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" - - - name: Collect backtraces from coredumps (if any) - if: always() - run: | - # shellcheck disable=SC2156 - find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; - - - name: Stop container before exit - if: always() - run: | - # Workaround for multiple runners on same IDC node - docker stop "${{ env.CONTAINER_NAME }}" - - - name: Store Core dumps on GitHub - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 - if: failure() - with: - name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} - retention-days: 14 - if-no-files-found: ignore - path: ./**/core.[1-9]* - - - name: Teardown XPU - if: always() - shell: bash - run: | - # Prune all stopped containers. - # If other runner is pruning on this node, will skip. - nprune=$(ps -ef | grep -c "docker container prune") - if [[ $nprune -eq 1 ]]; then - docker container prune -f - fi From d1122fcd9c6ab74aaa6cf2c4deb183c08e5360a1 Mon Sep 17 00:00:00 2001 From: "Sun, Diwei" Date: Wed, 20 Aug 2025 01:55:27 +0000 Subject: [PATCH 03/25] enable ci test for xpu --- .github/workflows/pr-test-xpu.yml | 156 ++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 .github/workflows/pr-test-xpu.yml diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml new file mode 100644 index 0000000000..79621a06d1 --- /dev/null +++ b/.github/workflows/pr-test-xpu.yml @@ -0,0 +1,156 @@ +# TODO: this looks sort of similar to _linux-test, but there are like a dozen +# places where you would have to insert an if statement. Probably it's better to +# just use a different workflow altogether + +name: xpu-test + +on: + push: + branches: + - main + - 'gh/**' + pull_request: + branches: + - main + - 'gh/**' + +concurrency: + group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + cancel-in-progress: true + +jobs: + test: + # Don't run on forked repos or empty test matrix + # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' + timeout-minutes: 60 + runs-on: ao-pvc + env: + DOCKER_IMAGE: ghcr.io/pytorch/ci-image:pytorch-linux-jammy-xpu-2025.1-py3-b388c12018df5d6ce2f94b7fb337fa3729978ab3 + TEST_COMMAND: .github/scripts/ci_test_xpu.sh + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + steps: + # [see note: pytorch repo ref] + - name: Checkout Torchao + uses: actions/checkout@v4 + + - name: Clean all stopped docker containers + if: always() + shell: bash + run: | + # Prune all stopped containers. + # If other runner is pruning on this node, will skip. + nprune=$(ps -ef | grep -c "docker container prune") + if [[ $nprune -eq 1 ]]; then + docker container prune -f + fi + + - name: Runner health check GPU count + if: always() + shell: bash + run: | + ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true) + msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified" + if [[ $ngpu -eq 0 ]]; then + echo "Error: Failed to detect any GPUs on the runner" + echo "$msg" + exit 1 + fi + + - name: Use following to pull public copy of the image + id: print-ghcr-mirror + shell: bash + run: | + echo "docker pull ${DOCKER_IMAGE}" + docker pull ${DOCKER_IMAGE} + + - name: Test + id: test + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_WORKFLOW: ${{ github.workflow }} + GITHUB_JOB: ${{ github.job }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_RUN_NUMBER: ${{ github.run_number }} + GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + timeout-minutes: 60 + run: | + set -x + + # detached container should get cleaned up by teardown_ec2_linux + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e GITHUB_ACTIONS \ + -e GITHUB_REPOSITORY \ + -e GITHUB_WORKFLOW \ + -e GITHUB_JOB \ + -e GITHUB_RUN_ID \ + -e GITHUB_RUN_NUMBER \ + -e GITHUB_RUN_ATTEMPT \ + -e JOB_ID \ + -e BRANCH \ + -e SHA1 \ + --user $(id -u):$(id -g) \ + --ulimit stack=10485760:83886080 \ + --ulimit core=0 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --shm-size="8g" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + --privileged \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + # save container name for later step + echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV" + # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home + docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}" + + - name: Change permissions + if: ${{ always() && steps.test.conclusion }} + run: | + docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" + + - name: Collect backtraces from coredumps (if any) + if: always() + run: | + # shellcheck disable=SC2156 + find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; + + - name: Stop container before exit + if: always() + run: | + # Workaround for multiple runners on same IDC node + docker stop "${{ env.CONTAINER_NAME }}" + + - name: Store Core dumps on GitHub + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + if: failure() + with: + name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} + retention-days: 14 + if-no-files-found: ignore + path: ./**/core.[1-9]* + + - name: Teardown XPU + if: always() + shell: bash + run: | + # Prune all stopped containers. + # If other runner is pruning on this node, will skip. + nprune=$(ps -ef | grep -c "docker container prune") + if [[ $nprune -eq 1 ]]; then + docker container prune -f + fi From 4593d9534bd1ed9e1d3c590b4b8caa63cbf397b5 Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Wed, 20 Aug 2025 14:50:59 +0800 Subject: [PATCH 04/25] Create ci_test_xpu.sh --- .github/scripts/ci_test_xpu.sh | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .github/scripts/ci_test_xpu.sh diff --git a/.github/scripts/ci_test_xpu.sh b/.github/scripts/ci_test_xpu.sh new file mode 100644 index 0000000000..ccff1b848f --- /dev/null +++ b/.github/scripts/ci_test_xpu.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +python3 -m pip install torch torchvision torchaudio pytorch-triton-xpu --index-url https://download.pytorch.org/whl/nightly/xpu --force-reinstall --no-cache-dir +python3 setup.py install + +pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0' + +cd test/quantization +pytest -v -s *.py From 7d90b8c923747e4f3897c8f42517ea5c49ebdafd Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Fri, 22 Aug 2025 14:30:44 +0800 Subject: [PATCH 05/25] Update .github/workflows/pr-test-xpu.yml Co-authored-by: Wang, Chuanqi --- .github/workflows/pr-test-xpu.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index 79621a06d1..72c33bdef5 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -11,8 +11,7 @@ on: - 'gh/**' pull_request: branches: - - main - - 'gh/**' + - ciflow/xpu/* concurrency: group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} From e6bc407a053ad939cb14bf92baa32f0303b60f70 Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Fri, 22 Aug 2025 14:30:54 +0800 Subject: [PATCH 06/25] Update .github/workflows/pr-test-xpu.yml Co-authored-by: Wang, Chuanqi --- .github/workflows/pr-test-xpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index 72c33bdef5..f18975e2eb 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -22,7 +22,7 @@ jobs: # Don't run on forked repos or empty test matrix # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' timeout-minutes: 60 - runs-on: ao-pvc + runs-on: linux.idc.xpu env: DOCKER_IMAGE: ghcr.io/pytorch/ci-image:pytorch-linux-jammy-xpu-2025.1-py3-b388c12018df5d6ce2f94b7fb337fa3729978ab3 TEST_COMMAND: .github/scripts/ci_test_xpu.sh From c34601f553c1c810ea34d852e187feadc0ee0efc Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Tue, 2 Sep 2025 10:00:53 +0800 Subject: [PATCH 07/25] fix for trigger scenarios --- .github/workflows/pr-test-xpu.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index f18975e2eb..9670d1891d 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -8,7 +8,6 @@ on: push: branches: - main - - 'gh/**' pull_request: branches: - ciflow/xpu/* From 3085c2b27a29ce461faaf33260f77c34ce399882 Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Tue, 2 Sep 2025 10:53:50 +0800 Subject: [PATCH 08/25] port from pytorch repo --- .github/workflows/action.yml | 67 ++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 .github/workflows/action.yml diff --git a/.github/workflows/action.yml b/.github/workflows/action.yml new file mode 100644 index 0000000000..740492475d --- /dev/null +++ b/.github/workflows/action.yml @@ -0,0 +1,67 @@ +name: Setup XPU host + +description: Set up XPU host for CI + +runs: + using: composite + steps: + - name: Clean all stopped docker containers + if: always() + shell: bash + run: | + # Prune all stopped containers. + # If other runner is pruning on this node, will skip. + nprune=$(ps -ef | grep -c "docker container prune") + if [[ $nprune -eq 1 ]]; then + docker container prune -f + fi + + - name: Runner health check system info + if: always() + shell: bash + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/oneAPI.list || true + cat /etc/apt/sources.list.d/intel-gpu-jammy.list || true + whoami + + - name: Runner health check xpu-smi + if: always() + shell: bash + run: | + timeout 30 xpu-smi discovery || true + + - name: Runner health check GPU count + if: always() + shell: bash + run: | + ngpu=$(timeout 30 xpu-smi discovery | grep -c -E 'Device Name' || true) + msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified" + if [[ $ngpu -eq 0 ]]; then + echo "Error: Failed to detect any GPUs on the runner" + echo "$msg" + exit 1 + fi + + - name: Runner diskspace health check + uses: ./.github/actions/diskspace-cleanup + if: always() + + - name: Runner health check disconnect on failure + if: ${{ failure() }} + shell: bash + run: | + killall runsvc.sh + + - name: Preserve github env variables for use in docker + shell: bash + run: | + env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" + env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" + + - name: XPU set GPU_FLAG + shell: bash + run: | + # Add render group for container creation. + render_gid=`cat /etc/group | grep render | cut -d: -f3` + echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}" From d9ab09e8a9ad936745a402628d305ef1bad9b854 Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Tue, 2 Sep 2025 10:55:00 +0800 Subject: [PATCH 09/25] Rename action.yml to xpu-action.yml --- .github/workflows/{action.yml => xpu-action.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{action.yml => xpu-action.yml} (100%) diff --git a/.github/workflows/action.yml b/.github/workflows/xpu-action.yml similarity index 100% rename from .github/workflows/action.yml rename to .github/workflows/xpu-action.yml From f87892abcebacc44e71a837eb5ee4a21edf6e6a9 Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Tue, 2 Sep 2025 10:59:07 +0800 Subject: [PATCH 10/25] update to align with pytorch --- .github/workflows/pr-test-xpu.yml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index 9670d1891d..75991289e1 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -23,7 +23,7 @@ jobs: timeout-minutes: 60 runs-on: linux.idc.xpu env: - DOCKER_IMAGE: ghcr.io/pytorch/ci-image:pytorch-linux-jammy-xpu-2025.1-py3-b388c12018df5d6ce2f94b7fb337fa3729978ab3 + DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-1-py3 TEST_COMMAND: .github/scripts/ci_test_xpu.sh PYTORCH_RETRY_TEST_CASES: 1 PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1 @@ -33,6 +33,20 @@ jobs: - name: Checkout Torchao uses: actions/checkout@v4 + - name: Setup XPU + uses: ./.github/workflows/xpu-action.yml + + - name: configure aws credentials + id: aws_creds + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 + - name: Clean all stopped docker containers if: always() shell: bash From c6f07b5aa1c2d9dd40151b365b56a6e89678792c Mon Sep 17 00:00:00 2001 From: diwei sun Date: Wed, 3 Sep 2025 19:40:04 -0700 Subject: [PATCH 11/25] Revert "Rename action.yml to xpu-action.yml" This reverts commit d9ab09e8a9ad936745a402628d305ef1bad9b854. --- .github/workflows/{xpu-action.yml => action.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{xpu-action.yml => action.yml} (100%) diff --git a/.github/workflows/xpu-action.yml b/.github/workflows/action.yml similarity index 100% rename from .github/workflows/xpu-action.yml rename to .github/workflows/action.yml From 544593a5a1a7f54d9c7792d6ae62c9dca6f891bd Mon Sep 17 00:00:00 2001 From: diwei sun Date: Wed, 3 Sep 2025 19:40:13 -0700 Subject: [PATCH 12/25] Revert "port from pytorch repo" This reverts commit 3085c2b27a29ce461faaf33260f77c34ce399882. --- .github/workflows/action.yml | 67 ------------------------------------ 1 file changed, 67 deletions(-) delete mode 100644 .github/workflows/action.yml diff --git a/.github/workflows/action.yml b/.github/workflows/action.yml deleted file mode 100644 index 740492475d..0000000000 --- a/.github/workflows/action.yml +++ /dev/null @@ -1,67 +0,0 @@ -name: Setup XPU host - -description: Set up XPU host for CI - -runs: - using: composite - steps: - - name: Clean all stopped docker containers - if: always() - shell: bash - run: | - # Prune all stopped containers. - # If other runner is pruning on this node, will skip. - nprune=$(ps -ef | grep -c "docker container prune") - if [[ $nprune -eq 1 ]]; then - docker container prune -f - fi - - - name: Runner health check system info - if: always() - shell: bash - run: | - cat /etc/os-release || true - cat /etc/apt/sources.list.d/oneAPI.list || true - cat /etc/apt/sources.list.d/intel-gpu-jammy.list || true - whoami - - - name: Runner health check xpu-smi - if: always() - shell: bash - run: | - timeout 30 xpu-smi discovery || true - - - name: Runner health check GPU count - if: always() - shell: bash - run: | - ngpu=$(timeout 30 xpu-smi discovery | grep -c -E 'Device Name' || true) - msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified" - if [[ $ngpu -eq 0 ]]; then - echo "Error: Failed to detect any GPUs on the runner" - echo "$msg" - exit 1 - fi - - - name: Runner diskspace health check - uses: ./.github/actions/diskspace-cleanup - if: always() - - - name: Runner health check disconnect on failure - if: ${{ failure() }} - shell: bash - run: | - killall runsvc.sh - - - name: Preserve github env variables for use in docker - shell: bash - run: | - env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" - env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" - - - name: XPU set GPU_FLAG - shell: bash - run: | - # Add render group for container creation. - render_gid=`cat /etc/group | grep render | cut -d: -f3` - echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}" From 2e1dc508c12ddcdbc0bec7649984d2306db356be Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Thu, 4 Sep 2025 14:43:22 +0800 Subject: [PATCH 13/25] Update .github/workflows/pr-test-xpu.yml Co-authored-by: Wang, Chuanqi --- .github/workflows/pr-test-xpu.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index 75991289e1..f4e27832ad 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -6,11 +6,11 @@ name: xpu-test on: push: - branches: - - main + tags: + - ciflow/xpu/* pull_request: branches: - - ciflow/xpu/* + - main concurrency: group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} From 188a0f8bf0508efd3a24909718b078d4ca974c3c Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Fri, 5 Sep 2025 09:27:40 +0800 Subject: [PATCH 14/25] debug for runner --- .github/workflows/pr-test-xpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index f4e27832ad..25531166a6 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -21,7 +21,7 @@ jobs: # Don't run on forked repos or empty test matrix # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' timeout-minutes: 60 - runs-on: linux.idc.xpu + runs-on: ubuntu-latest env: DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-1-py3 TEST_COMMAND: .github/scripts/ci_test_xpu.sh From 421d02cfdac942e66c3ce1c72593cbcbd06ac1d2 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Mon, 8 Sep 2025 00:58:02 -0700 Subject: [PATCH 15/25] lint format fix --- .github/workflows/pr-test-xpu.yml | 41 ++++++++++++++++--------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index 25531166a6..6c36b32233 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -23,7 +23,7 @@ jobs: timeout-minutes: 60 runs-on: ubuntu-latest env: - DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-1-py3 + DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-py3 TEST_COMMAND: .github/scripts/ci_test_xpu.sh PYTORCH_RETRY_TEST_CASES: 1 PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1 @@ -34,7 +34,7 @@ jobs: uses: actions/checkout@v4 - name: Setup XPU - uses: ./.github/workflows/xpu-action.yml + uses: pytorch/pytorch/.github/actions/setup-xpu@main - name: configure aws credentials id: aws_creds @@ -46,17 +46,26 @@ jobs: - name: Login to Amazon ECR id: login-ecr uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 - - - name: Clean all stopped docker containers - if: always() + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: ${{ DOCKER_IMAGE }} + + - name: Use following to pull public copy of the image + id: print-ghcr-mirror + env: + ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} shell: bash run: | - # Prune all stopped containers. - # If other runner is pruning on this node, will skip. - nprune=$(ps -ef | grep -c "docker container prune") - if [[ $nprune -eq 1 ]]; then - docker container prune -f - fi + tag=${ECR_DOCKER_IMAGE##*:} + echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Runner health check GPU count if: always() @@ -157,12 +166,4 @@ jobs: path: ./**/core.[1-9]* - name: Teardown XPU - if: always() - shell: bash - run: | - # Prune all stopped containers. - # If other runner is pruning on this node, will skip. - nprune=$(ps -ef | grep -c "docker container prune") - if [[ $nprune -eq 1 ]]; then - docker container prune -f - fi + uses: pytorch/pytorch/.github/actions/teardown-xpu From 6f6cd17dec5e3288993fc2c0b04217bf5eff6968 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Mon, 8 Sep 2025 01:13:38 -0700 Subject: [PATCH 16/25] format fix --- .github/workflows/pr-test-xpu.yml | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index 6c36b32233..d4ab4e43d5 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -24,7 +24,6 @@ jobs: runs-on: ubuntu-latest env: DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-py3 - TEST_COMMAND: .github/scripts/ci_test_xpu.sh PYTORCH_RETRY_TEST_CASES: 1 PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1 XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla @@ -51,7 +50,7 @@ jobs: id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main with: - docker-image-name: ${{ DOCKER_IMAGE }} + docker-image-name: ${{ env.DOCKER_IMAGE }} - name: Use following to pull public copy of the image id: print-ghcr-mirror @@ -79,17 +78,11 @@ jobs: exit 1 fi - - name: Use following to pull public copy of the image - id: print-ghcr-mirror - shell: bash - run: | - echo "docker pull ${DOCKER_IMAGE}" - docker pull ${DOCKER_IMAGE} - - name: Test id: test env: - BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + TEST_COMMAND: .github/scripts/ci_test_xpu.sh + DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-py3 PR_NUMBER: ${{ github.event.pull_request.number }} GITHUB_REPOSITORY: ${{ github.repository }} GITHUB_WORKFLOW: ${{ github.workflow }} @@ -107,7 +100,6 @@ jobs: # shellcheck disable=SC2086,SC2090 container_name=$(docker run \ ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ -e PR_NUMBER \ -e GITHUB_ACTIONS \ -e GITHUB_REPOSITORY \ @@ -137,7 +129,7 @@ jobs: # save container name for later step echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV" # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home - docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}" + docker exec -t "${container_name}" sh -c "bash ${env.TEST_COMMAND}" - name: Change permissions if: ${{ always() && steps.test.conclusion }} From bae7000dafec4e8204c023560a3c7045d1ace50d Mon Sep 17 00:00:00 2001 From: diwei sun Date: Mon, 8 Sep 2025 01:16:56 -0700 Subject: [PATCH 17/25] format fix --- .github/workflows/pr-test-xpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index d4ab4e43d5..0e3dd47a6b 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -158,4 +158,4 @@ jobs: path: ./**/core.[1-9]* - name: Teardown XPU - uses: pytorch/pytorch/.github/actions/teardown-xpu + uses: pytorch/pytorch/.github/actions/teardown-xpu@main From 7bd3d291d5416e1e976f99c35c76f90268b80db2 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Mon, 8 Sep 2025 23:16:06 -0700 Subject: [PATCH 18/25] format fix --- .github/workflows/pr-test-xpu.yml | 65 ++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index 0e3dd47a6b..4b46f14218 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -29,11 +29,72 @@ jobs: XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla steps: # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + - name: Checkout Torchao uses: actions/checkout@v4 - - name: Setup XPU - uses: pytorch/pytorch/.github/actions/setup-xpu@main + - name: Clean all stopped docker containers + if: always() + shell: bash + run: | + # Prune all stopped containers. + # If other runner is pruning on this node, will skip. + nprune=$(ps -ef | grep -c "docker container prune") + if [[ $nprune -eq 1 ]]; then + docker container prune -f + fi + + - name: Runner health check system info + if: always() + shell: bash + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/oneAPI.list || true + cat /etc/apt/sources.list.d/intel-gpu-jammy.list || true + whoami + + - name: Runner health check xpu-smi + if: always() + shell: bash + run: | + timeout 30 xpu-smi discovery || true + + - name: Runner health check GPU count + if: always() + shell: bash + run: | + ngpu=$(timeout 30 xpu-smi discovery | grep -c -E 'Device Name' || true) + msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified" + if [[ $ngpu -eq 0 ]]; then + echo "Error: Failed to detect any GPUs on the runner" + echo "$msg" + exit 1 + fi + + - name: Runner diskspace health check + uses: pytorch/pytorch/github/actions/diskspace-cleanup + if: always() + + - name: Runner health check disconnect on failure + if: ${{ failure() }} + shell: bash + run: | + killall runsvc.sh + + - name: Preserve github env variables for use in docker + shell: bash + run: | + env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" + env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" + + - name: XPU set GPU_FLAG + shell: bash + run: | + # Add render group for container creation. + render_gid=`cat /etc/group | grep render | cut -d: -f3` + echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}" - name: configure aws credentials id: aws_creds From 4fd2909602b09a4d6dbcc83815cbe9b76d03f8a1 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Mon, 8 Sep 2025 23:18:29 -0700 Subject: [PATCH 19/25] format fix --- .github/workflows/pr-test-xpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index 4b46f14218..44826d258c 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -74,7 +74,7 @@ jobs: fi - name: Runner diskspace health check - uses: pytorch/pytorch/github/actions/diskspace-cleanup + uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main if: always() - name: Runner health check disconnect on failure From 4a7d9af91d38a6911699ea9b0ac654c191e6c758 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Tue, 9 Sep 2025 00:44:07 -0700 Subject: [PATCH 20/25] format fix --- .github/workflows/pr-test-xpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index 44826d258c..9d610cbd76 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -195,7 +195,7 @@ jobs: - name: Change permissions if: ${{ always() && steps.test.conclusion }} run: | - docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" + docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R jenkins:jenkins test" - name: Collect backtraces from coredumps (if any) if: always() From c3f43848c7a3259a8759645b8356d498477391e8 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Tue, 9 Sep 2025 00:47:32 -0700 Subject: [PATCH 21/25] format fix --- .github/workflows/pr-test-xpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index 9d610cbd76..12b0726717 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -21,7 +21,7 @@ jobs: # Don't run on forked repos or empty test matrix # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' timeout-minutes: 60 - runs-on: ubuntu-latest + runs-on: linux.idc.xpu env: DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-py3 PYTORCH_RETRY_TEST_CASES: 1 From e8936cb6f7c73b69bdf8136423313d4bcf814035 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Tue, 9 Sep 2025 22:42:45 -0700 Subject: [PATCH 22/25] format fix --- .github/workflows/pr-test-xpu.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index 12b0726717..07ef696be5 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -12,6 +12,10 @@ on: branches: - main +permissions: + id-token: write + contents: read + concurrency: group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} cancel-in-progress: true From 50e56ec0f8ae28ab130f5761206439469148e30b Mon Sep 17 00:00:00 2001 From: diwei sun Date: Sun, 14 Sep 2025 23:19:14 -0700 Subject: [PATCH 23/25] trigger by tag only --- .github/workflows/pr-test-xpu.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index 07ef696be5..6581349206 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -8,9 +8,6 @@ on: push: tags: - ciflow/xpu/* - pull_request: - branches: - - main permissions: id-token: write From 5a4634112bdce70752211b40d2435d1114d520b5 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Tue, 16 Sep 2025 01:31:17 -0700 Subject: [PATCH 24/25] add xpu label for xpuci --- .github/pytorch-probot.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index f0230a8ecd..85ecfdbed2 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -4,3 +4,4 @@ ciflow_push_tags: - ciflow/tutorials - ciflow/rocm - ciflow/4xh100 +- ciflow/xpu From 030121f128a5a0007b79ffb8998520548873bb67 Mon Sep 17 00:00:00 2001 From: diwei sun Date: Tue, 16 Sep 2025 19:29:47 -0700 Subject: [PATCH 25/25] fix docker path --- .github/workflows/pr-test-xpu.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml index 6581349206..71b28ed76d 100644 --- a/.github/workflows/pr-test-xpu.yml +++ b/.github/workflows/pr-test-xpu.yml @@ -113,6 +113,7 @@ jobs: uses: pytorch/test-infra/.github/actions/calculate-docker-image@main with: docker-image-name: ${{ env.DOCKER_IMAGE }} + docker-build-dir: pytorch/pytorch/.ci/docker - name: Use following to pull public copy of the image id: print-ghcr-mirror