Skip to content

Commit a498bbd

Browse files
committed
Add ported distributed cases
1 parent f2bcd8a commit a498bbd

File tree

4 files changed

+57
-5
lines changed

4 files changed

+57
-5
lines changed

.github/scripts/ut_result_check.sh

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -220,9 +220,10 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then
220220
fi
221221
fi
222222
if [[ "${ut_suite}" == 'xpu_distributed' ]]; then
223-
grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
223+
grep -E "^FAILED" xpu_distributed_test.log | awk '{print $3}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
224224
grep -E "have failures" xpu_distributed_test.log | awk '{print $1}' >> ./"${ut_suite}"_xpu_distributed_test_failed.log
225-
grep "PASSED" xpu_distributed_test.log | awk '{print $1}' > ./"${ut_suite}"_xpu_distributed_test_passed.log
225+
sed -i '/^[^.]\+/d' ./"${ut_suite}"_xpu_distributed_test_failed.log
226+
grep "PASSED" xpu_distributed_test.log | awk '{print $3}' > ./"${ut_suite}"_xpu_distributed_test_passed.log
226227
echo -e "========================================================================="
227228
echo -e "Show Failed cases in ${ut_suite} xpu distributed"
228229
echo -e "========================================================================="

.github/workflows/_linux_ut.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -403,7 +403,7 @@ jobs:
403403
distributed_ut_test:
404404
runs-on: pytorch-06
405405
if: ${{ contains(inputs.ut, 'xpu_distributed') && !contains(inputs.disabled_tests, 'disable_distribute') }}
406-
timeout-minutes: 60
406+
timeout-minutes: 120
407407
env:
408408
GH_TOKEN: ${{ github.token }}
409409
NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }}
@@ -510,7 +510,7 @@ jobs:
510510
echo -e "[ERROR] XCCL is not enabled"
511511
exit 1
512512
fi
513-
timeout 1800 python run_distributed.py \
513+
python run_distributed.py \
514514
2>${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \
515515
tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
516516
- name: Reset Ptrace_scope

test/xpu/run_distributed.py

Lines changed: 43 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,41 @@
1111

1212
# libfabric WA to avoid hang issue
1313
os.environ["FI_PROVIDER"] = "tcp"
14-
# os.environ["ZE_AFFINITY_MASK"] = "0,1,2,3"
14+
# Get the xelink group card affinity
15+
ret = os.system("xpu-smi topology -m 2>&1|tee topology.log")
16+
if ret == 0:
17+
gpu_dict = {}
18+
with open("topology.log") as file:
19+
lines = file.readlines()
20+
for line in lines:
21+
if "CPU Affinity" in line:
22+
continue
23+
line = line.strip()
24+
if line.startswith("GPU "):
25+
items = line.split(" ")
26+
items = [x for x in items if x]
27+
gpu_id = items[1]
28+
i = gpu_id.split("/")[0]
29+
affinity = ""
30+
for j, item in enumerate(items):
31+
if "SYS" not in item and ("XL" in item or "S" in item):
32+
if len(affinity) == 0:
33+
affinity = str(j - 2)
34+
else:
35+
affinity = affinity + "," + str(j - 2)
36+
gpu_dict[i] = affinity
37+
38+
max_affinity = ""
39+
for key, value in gpu_dict.items():
40+
if len(value) > len(max_affinity):
41+
max_affinity = value
42+
43+
os.environ["ZE_AFFINITY_MASK"] = str(max_affinity)
44+
print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK")))
45+
46+
else:
47+
print("xpu-smi topology failed")
48+
sys.exit(255)
1549

1650

1751
# run python test
@@ -24,10 +58,18 @@ def run(test_command):
2458
return result.returncode
2559

2660

61+
os.environ["CCL_SEND"] = "direct"
62+
os.environ["CCL_RECV"] = "direct"
2763
test_command = ["python", "distributed/test_c10d_ops_xccl.py"]
2864
res += run(test_command)
65+
del os.environ["CCL_SEND"]
66+
del os.environ["CCL_RECV"]
2967
test_command = ["python", "distributed/test_c10d_xccl.py"]
3068
res += run(test_command)
69+
test_command = ["python", "../../../../test/distributed/pipelining/test_backward.py"]
70+
res += run(test_command)
71+
test_command = ["python", "../../../../test/distributed/pipelining/test_microbatch.py"]
72+
res += run(test_command)
3173

3274
# run pytest with skiplist
3375
for key in skip_dict:

test/xpu/skip_list_dist.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
"test_ddp_parity_xpu",
1111
),
1212
"../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
13+
"../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None,
1314
"../../../../test/distributed/fsdp/test_fsdp_core.py": (
1415
"test_delayed_optim_step_offload_false_no_shard_xpu",
1516
"test_delayed_optim_step_offload_false_none_xpu",
@@ -95,4 +96,12 @@
9596
# "test_root_module_is_not_FSDP_xpu",
9697
# ),
9798
"../../../../test/distributed/fsdp/test_utils.py": None,
99+
"../../../../test/distributed/fsdp/test_fsdp_misc.py": None,
100+
"../../../../test/distributed/test_functional_api.py": (
101+
# depends on https://github.com/pytorch/pytorch/pull/159473
102+
"test_tracing_with_fakepg_xpu",
103+
),
104+
"../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None,
105+
"../../../../test/distributed/_tools/test_mem_tracker.py": None,
106+
"../../../../test/distributed/_tools/test_memory_tracker.py": None,
98107
}

0 commit comments

Comments
 (0)