diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml
index f0230a8ecd..85ecfdbed2 100644
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@@ -4,3 +4,4 @@ ciflow_push_tags:
 - ciflow/tutorials
 - ciflow/rocm
 - ciflow/4xh100
+- ciflow/xpu
diff --git a/.github/scripts/ci_test_xpu.sh b/.github/scripts/ci_test_xpu.sh
new file mode 100644
index 0000000000..ccff1b848f
--- /dev/null
+++ b/.github/scripts/ci_test_xpu.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+set -ex
+
+# Install nightly XPU wheels for torch, torchvision, torchaudio and triton, then build torchao from source
+python3 -m pip install torch torchvision torchaudio pytorch-triton-xpu --index-url https://download.pytorch.org/whl/nightly/xpu --force-reinstall --no-cache-dir
+python3 setup.py install
+
+# Test-only dependencies
+pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0'
+
+# Run the quantization test suite
+cd test/quantization
+pytest -v -s *.py
diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml
new file mode 100644
index 0000000000..71b28ed76d
--- /dev/null
+++ b/.github/workflows/pr-test-xpu.yml
@@ -0,0 +1,222 @@
+# TODO: this looks sort of similar to _linux-test, but there are like a dozen
+# places where you would have to insert an if statement. Probably it's better to
+# just use a different workflow altogether
+
+name: xpu-test
+
+on:
+  push:
+    tags:
+      - ciflow/xpu/*
+
+permissions:
+  id-token: write
+  contents: read
+
+concurrency:
+  group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test:
+    # Don't run on forked repos or empty test matrix
+    # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
+    timeout-minutes: 60
+    runs-on: linux.idc.xpu
+    env:
+      DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-py3
+      PYTORCH_RETRY_TEST_CASES: 1
+      PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
+      XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
+    steps:
+      # [see note: pytorch repo ref]
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+
+      - name: Checkout Torchao
+        uses: actions/checkout@v4
+
+      - name: Clean all stopped docker containers
+        if: always()
+        shell: bash
+        run: |
+          # Prune all stopped containers.
+          # If another runner is already pruning on this node, skip.
+          nprune=$(ps -ef | grep -c "docker container prune")
+          if [[ $nprune -eq 1 ]]; then
+            docker container prune -f
+          fi
+
+      - name: Runner health check system info
+        if: always()
+        shell: bash
+        run: |
+          cat /etc/os-release || true
+          cat /etc/apt/sources.list.d/oneAPI.list || true
+          cat /etc/apt/sources.list.d/intel-gpu-jammy.list || true
+          whoami
+
+      - name: Runner health check xpu-smi
+        if: always()
+        shell: bash
+        run: |
+          timeout 30 xpu-smi discovery || true
+
+      - name: Runner health check GPU count
+        if: always()
+        shell: bash
+        run: |
+          ngpu=$(timeout 30 xpu-smi discovery | grep -c -E 'Device Name' || true)
+          msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
+          if [[ $ngpu -eq 0 ]]; then
+            echo "Error: Failed to detect any GPUs on the runner"
+            echo "$msg"
+            exit 1
+          fi
+
+      - name: Runner diskspace health check
+        uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
+        if: always()
+
+      - name: Runner health check disconnect on failure
+        if: ${{ failure() }}
+        shell: bash
+        run: |
+          killall runsvc.sh
+
+      - name: Preserve github env variables for use in docker
+        shell: bash
+        run: |
+          env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
+          env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
+
+      - name: XPU set GPU_FLAG
+        shell: bash
+        run: |
+          # Add the render group so the container can access the GPU devices.
+          render_gid=$(getent group render | cut -d: -f3)
+          echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}"
+
+      - name: configure aws credentials
+        id: aws_creds
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+          aws-region: us-east-1
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
+
+      - name: Calculate docker image
+        id: calculate-docker-image
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: ${{ env.DOCKER_IMAGE }}
+          docker-build-dir: pytorch/pytorch/.ci/docker
+
+      - name: Use following to pull public copy of the image
+        id: print-ghcr-mirror
+        env:
+          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
+        shell: bash
+        run: |
+          tag=${ECR_DOCKER_IMAGE##*:}
+          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
+
+      - name: Pull docker image
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        with:
+          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
+
+      - name: Runner health check GPU count (clinfo)
+        if: always()
+        shell: bash
+        run: |
+          ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true)
+          msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
+          if [[ $ngpu -eq 0 ]]; then
+            echo "Error: Failed to detect any GPUs on the runner"
+            echo "$msg"
+            exit 1
+          fi
+
+      - name: Test
+        id: test
+        env:
+          TEST_COMMAND: .github/scripts/ci_test_xpu.sh
+          DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-py3
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          GITHUB_WORKFLOW: ${{ github.workflow }}
+          GITHUB_JOB: ${{ github.job }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+          GITHUB_RUN_NUMBER: ${{ github.run_number }}
+          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
+          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+        timeout-minutes: 60
+        run: |
+          set -x
+
+          # detached container should get cleaned up by teardown_ec2_linux
+          # Used for GPU_FLAG since that doesn't play nice
+          # shellcheck disable=SC2086,SC2090
+          container_name=$(docker run \
+            ${GPU_FLAG:-} \
+            -e PR_NUMBER \
+            -e GITHUB_ACTIONS \
+            -e GITHUB_REPOSITORY \
+            -e GITHUB_WORKFLOW \
+            -e GITHUB_JOB \
+            -e GITHUB_RUN_ID \
+            -e GITHUB_RUN_NUMBER \
+            -e GITHUB_RUN_ATTEMPT \
+            -e JOB_ID \
+            -e BRANCH \
+            -e SHA1 \
+            --ulimit stack=10485760:83886080 \
+            --ulimit core=0 \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --shm-size="8g" \
+            --tty \
+            --detach \
+            --user jenkins \
+            --privileged \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}"
+          )
+          # save the container id emitted by docker run for later steps
+          echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
+          # run the test script inside the container as the jenkins user
+          docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}"
+
+      - name: Change permissions
+        if: ${{ always() && steps.test.conclusion }}
+        run: |
+          docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R jenkins:jenkins test"
+
+      - name: Collect backtraces from coredumps (if any)
+        if: always()
+        run: |
+          # shellcheck disable=SC2156
+          find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
+
+      - name: Stop container before exit
+        if: always()
+        run: |
+          # Workaround for multiple runners on same IDC node
+          docker stop "${{ env.CONTAINER_NAME }}"
+
+      - name: Store Core dumps on GitHub
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        if: failure()
+        with:
+          name: coredumps
+          retention-days: 14
+          if-no-files-found: ignore
+          path: ./**/core.[1-9]*
+
+      - name: Teardown XPU
+        uses: pytorch/pytorch/.github/actions/teardown-xpu@main
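
How this gets triggered: registering `ciflow/xpu` under `ciflow_push_tags` lets pytorch-probot push a `ciflow/xpu/<PR number>` tag when that label is added to a PR, and the `on: push: tags: ciflow/xpu/*` filter in pr-test-xpu.yml picks the tag up. A minimal manual-trigger sketch, assuming push rights on the repo and a hypothetical PR number 1234:

    git fetch origin pull/1234/head          # 1234 is a placeholder PR number
    git tag ciflow/xpu/1234 FETCH_HEAD       # tag the PR head commit
    git push origin ciflow/xpu/1234          # matches ciflow/xpu/* and starts xpu-test

And a sketch of reproducing what ci_test_xpu.sh runs outside the CI container, assuming a host with an Intel GPU and a working driver stack; the torch.xpu.is_available() guard is an extra check, not part of the CI script:

    # Same nightly XPU wheels the CI script installs
    python3 -m pip install torch torchvision torchaudio pytorch-triton-xpu --index-url https://download.pytorch.org/whl/nightly/xpu
    python3 setup.py install
    pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0'
    # Fail fast if no XPU device is visible before running the suite
    python3 -c "import torch; assert torch.xpu.is_available(), 'no XPU device found'"
    cd test/quantization && pytest -v -s *.py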