Skip to content

Commit 4d38b5e

Browse files
authored
[CI] Fix CICD test issues (#2027)
This commit addresses four CI/CD issues:
1. TorchBench issues caused by dependency installation.
2. PT2E test dataset path and dependency installation.
3. GitHub access permission for the op benchmark workflow.
4. Bisect search enhancement (disable_distributed).
1 parent d8c3eef commit 4d38b5e

File tree

7 files changed

+67
-85
lines changed

7 files changed

+67
-85
lines changed

.github/actions/linux-testenv/action.yml

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ runs:
9999
TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}"
100100
fi
101101
fi
102-
if [ "${{ github.event_name }}" == "pull_request" ];then
102+
if [ "${{ github.event_name }}" == "pull_request" ] && [[ "${{ inputs.pytorch }}" != *"_wheel" ]];then
103103
cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops
104104
cd third_party/torch-xpu-ops
105105
else
@@ -139,7 +139,7 @@ runs:
139139
fi
140140
# for dlrm
141141
pip install pyre-extensions
142-
curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install --no-deps
142+
curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install
143143
# for soft_actor_critic, temp fix
144144
pip install git+https://github.com/nocoding03/gym@fix-np
145145
cd ../pytorch
@@ -152,10 +152,13 @@ runs:
152152
TORCHBENCH_COMMIT_ID="$(git rev-parse --short HEAD)"
153153
sed -i 's/^ *pynvml.*//' requirements.txt
154154
pip install -r requirements.txt
155-
python install.py --continue_on_fail
155+
# python install.py --continue_on_fail
156+
echo "PYTHONPATH=${PWD}:${PYTHONPATH}" >> ${GITHUB_ENV}
157+
pip install dominate
158+
python install.py Super_SloMo
156159
# for dlrm
157160
pip install pyre-extensions
158-
curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install --no-deps
161+
curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install
159162
cd ../pytorch
160163
else
161164
pip install -r ./.ci/docker/requirements-ci.txt
@@ -170,6 +173,11 @@ runs:
170173
else
171174
pip install torchao --pre --index-url https://download.pytorch.org/whl/nightly/xpu
172175
fi
176+
if [ "${{ inputs.suite }}" != "None" ];then
177+
# To install numpy 1.x for benchmarks as CUDA
178+
# yolov requires numpy>=1.23
179+
pip install -U numpy==1.26.4
180+
fi
173181
- name: Torch Config
174182
shell: bash -xe {0}
175183
run: |

.github/actions/pt2e/action.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,15 @@ runs:
2929
shell: bash -xe {0}
3030
run: |
3131
# dataset
32+
dataset_dir="${RUNNER_TEMP}/_datasets/imagenet"
3233
if [ ! -d ${dataset_dir} ];then
3334
rm -rf ${dataset_dir} && mkdir -p ${dataset_dir} && cd ${dataset_dir}
3435
wget -O valprep.sh https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh
3536
wget -q https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar
3637
tar -xf ILSVRC2012_img_val.tar
3738
bash valprep.sh
3839
fi
40+
echo "dataset_dir=${dataset_dir}" >> ${GITHUB_ENV}
3941
- name: PT2E Test (${{ inputs.dt }} ${{ inputs.scenario }})
4042
shell: bash -xe {0}
4143
run: |

.github/workflows/_linux_e2e.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,6 @@ jobs:
8787
cpus_per_xpu: ${{ needs.runner.outputs.cpus_per_xpu }}
8888
MODEL_ONLY_NAME: ${{ inputs.model }}
8989
AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
90-
dataset_dir: ${{ runner.temp }}/../_datasets/imagenet
9190
steps:
9291
- name: Checkout torch-xpu-ops
9392
uses: actions/checkout@v4

.github/workflows/_linux_e2e_summary.yml

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,15 @@ jobs:
3030
steps:
3131
- name: Checkout torch-xpu-ops
3232
uses: actions/checkout@v4
33-
- name: Install gh-cli
34-
run: |
35-
sudo apt-get update
36-
sudo apt-get install gh rsync ca-certificates -y
3733
- name: Setup python-${{ inputs.python }}
3834
uses: actions/setup-python@v5
3935
with:
4036
python-version: ${{ inputs.python }}
37+
- name: Install gh-cli
38+
run: |
39+
sudo apt-get update
40+
sudo apt-get install gh rsync ca-certificates -y
41+
pip install pandas requests
4142
- name: Download Target Artifact
4243
run: |
4344
mkdir target/
@@ -64,25 +65,25 @@ jobs:
6465
- name: Get summary
6566
if: ${{ ! cancelled() }}
6667
run: |
67-
pip install pandas requests
68+
exit_label=0
6869
e2e_summary_csv="$(find ./target/ -name "inductor_*.csv" |head -n 1)"
6970
if [ -f "${e2e_summary_csv}" ];then
7071
bash ./.github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY}
7172
exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt)
7273
if [ ${exit_label} -ne 0 ];then
7374
grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1
7475
echo "There are ${exit_label} cases that need look into!!! Please check them"
75-
exit ${exit_label}
7676
fi
7777
fi
7878
pt2e_summary_csv="$(find ./target/ -name "summary.csv")"
7979
if [ -f "${pt2e_summary_csv}" ];then
8080
cat ${pt2e_summary_csv}
81-
failed_num=$(grep -c ',failed' ${pt2e_summary_csv})
81+
failed_num=$(grep -c ',failed' ${pt2e_summary_csv} || true)
8282
if [ ${failed_num} -ne 0 ];then
8383
echo "[Warning] PT2E has failures!"
8484
fi
8585
fi
86+
exit ${exit_label}
8687
- name: Upload Reference Run ID
8788
if: ${{ endsWith(inputs.test_type, 'ly') }}
8889
run: |

.github/workflows/_linux_op_benchmark.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,6 @@ jobs:
5050
op_benchmark:
5151
needs: runner
5252
runs-on: ${{ needs.runner.outputs.runner_id }}
53-
permissions:
54-
issues: write
5553
timeout-minutes: 900
5654
container:
5755
image: mengfeili/intel-pvc-driver:1146-1136
@@ -93,6 +91,8 @@ jobs:
9391
op_benchmark_test_results_check:
9492
needs: op_benchmark
9593
runs-on: ubuntu-24.04
94+
permissions:
95+
issues: write
9696
steps:
9797
- name: Install gh-cli
9898
run: |

.github/workflows/bisect_search.yml

Lines changed: 41 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -45,42 +45,26 @@ jobs:
4545
get_runner:
4646
runs-on: ${{ inputs.runner }}
4747
outputs:
48-
test_host: ${{ steps.runner-info.outputs.test_host }}
49-
test_user: ${{ steps.runner-info.outputs.test_user }}
50-
test_group: ${{ steps.runner-info.outputs.test_group }}
48+
runner_id: ${{ steps.runner-info.outputs.runner_id }}
49+
user_id: ${{ steps.runner-info.outputs.user_id }}
50+
render_id: ${{ steps.runner-info.outputs.render_id }}
51+
hostname: ${{ steps.runner-info.outputs.hostname }}
5152
steps:
52-
- name: Get runner info
53+
- name: Checkout torch-xpu-ops
54+
uses: actions/checkout@v4
55+
- name: Get runner
5356
id: runner-info
54-
run: |
55-
# get test runner
56-
echo "test_host=${RUNNER_NAME}" |tee -a ${GITHUB_OUTPUT}
57-
echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT}
58-
echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT}
59-
# show host info
60-
cat /etc/os-release
61-
uname -a
62-
source /opt/intel/oneapi/setvars.sh
63-
sycl-ls
64-
dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev'
65-
- name: Cleanup workspace
66-
if: ${{ always() }}
67-
run: |
68-
# clean docker cache
69-
docker stop $(docker ps -aq) || true
70-
docker system prune -af || true
71-
# clean files
72-
ls -al
73-
sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf
57+
uses: ./.github/actions/get-runner
7458

7559
biisect-search:
7660
needs: get_runner
77-
runs-on: ${{ needs.get_runner.outputs.test_host }}
61+
runs-on: ${{ needs.get_runner.outputs.runner_id }}
7862
container:
7963
image: mengfeili/intel-pvc-driver:1146-1136
8064
volumes:
8165
- ${{ github.workspace }}:${{ github.workspace }}
8266
options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g
83-
-u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }}
67+
-u ${{ needs.get_runner.outputs.user_id }} --group-add ${{ needs.get_runner.outputs.render_id }}
8468
env:
8569
AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
8670
SEARCH_COMMITS: ${{ inputs.search_commits }}
@@ -95,33 +79,19 @@ jobs:
9579
HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
9680
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
9781
steps:
98-
- name: Check runner
99-
run: |
100-
ls -al
101-
sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf
102-
sudo rm -rf /tmp/xpu-tool
103-
- name: Setup python-${{ inputs.python }}
104-
uses: actions/setup-python@v5
82+
- name: Checkout torch-xpu-ops
83+
uses: actions/checkout@v4
10584
with:
106-
python-version: ${{ inputs.python }}
107-
- name: Check runner
108-
run: |
109-
hostname && whoami && id
110-
clinfo --list
111-
gcc -v && g++ -v
112-
which python && which pip
113-
python -V
114-
pip install -U pip wheel setuptools
115-
pip list
116-
uname -a
117-
dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev'
118-
pip install cmake ninja pandas psutil scipy requests pybind11
119-
mkdir gs-logs gs-search
120-
echo "Status,Acc,Perf,PyTorch,Torch-xpu-ops" > gs-logs/summary.csv
85+
path: gs-scripts
86+
- name: Prepare test env
87+
uses: ./gs-scripts/.github/actions/linux-testenv
88+
with:
89+
pytorch: nightly_wheel
90+
python: ${{ inputs.python }}
12191
- name: Install oneAPI DLE
12292
if: ${{ inputs.oneapi != 'installed' }}
12393
run: |
124-
rm -rf ~/intel ~/.intel /tmp/intel
94+
rm -rf ~/intel ~/.intel ${HOME}/intel
12595
if [ "${{ inputs.oneapi }}" == "2025.1" ];then
12696
ONEAPI_URL="https://registrationcenter-download.intel.com/akdlm/IRC_NAS/3435dc45-055e-4f7a-86b1-779931772404/intel-deep-learning-essentials-2025.1.3.7_offline.sh"
12797
elif [ "${{ inputs.oneapi }}" == "2025.2" ];then
@@ -130,42 +100,42 @@ jobs:
130100
ONEAPI_URL="${{ inputs.oneapi }}"
131101
fi
132102
wget -q -O oneapi.sh "${ONEAPI_URL}"
133-
bash oneapi.sh -a -s --eula accept --action install --install-dir /tmp/intel/oneapi
134-
echo "XPU_ONEAPI_PATH=/tmp/intel/oneapi" >> ${GITHUB_ENV}
135-
- name: Checkout torch-xpu-ops
136-
uses: actions/checkout@v4
137-
with:
138-
path: gs-scripts
139-
- name: Prepare source code
103+
bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi
104+
echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV}
105+
106+
- name: Summary file
140107
run: |
141-
git clone https://github.com/pytorch/pytorch gs-pytorch
142-
cd gs-pytorch
108+
mkdir -p gs-logs
109+
echo "Status,Acc,Perf,PyTorch,Torch-xpu-ops" > gs-logs/summary.csv
110+
- name: Get latest versions
111+
run: |
112+
cd pytorch
143113
LATEST_PT_COMMIT="$(git rev-parse HEAD)"
144-
cd ..
145-
git clone https://github.com/intel/torch-xpu-ops gs-torch-xpu-ops
146-
cd gs-torch-xpu-ops
114+
cd third_party/torch-xpu-ops
147115
LATEST_XPU_COMMIT="$(git rev-parse HEAD)"
148-
cd ..
149116
echo "LATEST_PT_COMMIT=${LATEST_PT_COMMIT}" >> ${GITHUB_ENV}
150117
echo "LATEST_XPU_COMMIT=${LATEST_XPU_COMMIT}" >> ${GITHUB_ENV}
151118
- name: Prepare test env
152119
run: |
153-
pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu
154120
if [[ "${{ inputs.search_case }}" == *"benchmarks/dynamo/huggingface.py"* ]];then
155-
pip install transformers==4.44.2
121+
pip install transformers
156122
elif [[ "${{ inputs.search_case }}" == *"benchmarks/dynamo/timm_models.py"* ]];then
157-
pip install --no-deps git+https://github.com/huggingface/[email protected]
158-
pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch)
123+
pip install timm
159124
elif [[ "${{ inputs.search_case }}" == *"benchmarks/dynamo/torchbench.py"* ]];then
160125
model_name="$(echo ${{ inputs.search_case }} |sed 's+.*\--only *++;s/ .*//')"
161126
git clone https://github.com/pytorch/benchmark gs-benchmark
162127
cd gs-benchmark
128+
pip install -r requirements.txt
163129
echo "PYTHONPATH=${PWD}:${PYTHONPATH}" >> ${GITHUB_ENV}
164130
python install.py ${model_name}
165-
else
166-
pip install -r gs-pytorch/.ci/docker/requirements-ci.txt
131+
cd ..
167132
fi
168-
pip uninstall -y torch && pip uninstall -y torch
133+
pip uninstall -y torchvision torchaudio
134+
git clone https://github.com/pytorch/vision gs-vision
135+
cd gs-vision
136+
python setup.py install
137+
cd ..
138+
pip uninstall -y torch
169139
- name: Bisect search pytorch
170140
if: ${{ contains(inputs.search_commits, 'pytorch') }}
171141
run: |
@@ -186,7 +156,7 @@ jobs:
186156
> ${{ github.workspace }}/gs-logs/search-${new_commit}-${LATEST_XPU_COMMIT}.log 2>&1 && echo $? || echo $?)"
187157
new_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)"
188158
if [ "${old_status}" != "${new_status}" ];then
189-
cd gs-pytorch
159+
cd pytorch
190160
git reset --hard
191161
rsync -avz --delete ${{ github.workspace }}/gs-scripts/ gs-scripts/
192162
git bisect start ${new_commit} ${old_commit}
@@ -219,7 +189,7 @@ jobs:
219189
> ${{ github.workspace }}/gs-logs/search-${LATEST_PT_COMMIT}-${new_commit}.log && echo $? || echo $?)"
220190
new_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)"
221191
if [ "${old_status}" != "${new_status}" ];then
222-
cd gs-pytorch
192+
cd pytorch
223193
git reset --hard
224194
rsync -avz --delete ${{ github.workspace }}/gs-scripts/ gs-scripts/
225195
git bisect start ${new_commit} ${old_commit}

.github/workflows/pull.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ jobs:
115115
with:
116116
runner: pvc_rolling
117117
pytorch: ${{ needs.conditions-filter.outputs.pytorch }}
118+
torch_xpu_ops: ${{ needs.conditions-filter.outputs.pytorch == 'nightly_wheel' && 'pinned' || 'main' }}
118119
ut: ${{ matrix.ut_name }}
119120

120121
linux-distributed:
@@ -129,6 +130,7 @@ jobs:
129130
with:
130131
runner: pvc_rolling
131132
pytorch: ${{ needs.conditions-filter.outputs.pytorch }}
133+
torch_xpu_ops: ${{ needs.conditions-filter.outputs.pytorch == 'nightly_wheel' && 'pinned' || 'main' }}
132134
ut: ${{ matrix.ut_name }}
133135

134136
linux-e2e:

0 commit comments

Comments
 (0)