Skip to content

Commit 90423a8

Browse files
committed
Add ported distributed cases
1 parent 728c3f1 commit 90423a8

File tree

4 files changed

+55
-2
lines changed

4 files changed

+55
-2
lines changed

.github/actions/linux-uttest/action.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,3 +168,4 @@ runs:
168168
python run_distributed.py \
169169
2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \
170170
tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
171+
cp *.xml ${{ github.workspace }}/ut_log

.github/scripts/ut_result_check.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,9 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then
217217
fi
218218
fi
219219
if [[ "${ut_suite}" == 'xpu_distributed' ]]; then
220-
grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
220+
grep -E "^FAILED" xpu_distributed_test.log | awk '{print $3}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
221221
grep -E "have failures" xpu_distributed_test.log | awk '{print $1}' >> ./"${ut_suite}"_xpu_distributed_test_failed.log
222+
sed -i '/^[^.]\+/d' ./"${ut_suite}"_xpu_distributed_test_failed.log
222223
grep "PASSED" xpu_distributed_test.log | awk '{print $1}' > ./"${ut_suite}"_xpu_distributed_test_passed.log
223224
echo -e "========================================================================="
224225
echo -e "Show Failed cases in ${ut_suite} xpu distributed"

test/xpu/run_distributed.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,41 @@
1111

1212
# libfabric workaround (WA) to avoid a hang issue
1313
os.environ["FI_PROVIDER"] = "tcp"
14-
# os.environ["ZE_AFFINITY_MASK"] = "0,1,2,3"
14+
# Derive ZE_AFFINITY_MASK from the largest Xe Link (XL) group reported by `xpu-smi topology -m`
15+
ret = os.system("xpu-smi topology -m 2>&1|tee topology.log")
16+
if ret == 0:
17+
gpu_dict = {}
18+
with open("topology.log") as file:
19+
lines = file.readlines()
20+
for line in lines:
21+
if "CPU Affinity" in line:
22+
continue
23+
line = line.strip()
24+
if line.startswith("GPU "):
25+
items = line.split(" ")
26+
items = [x for x in items if x]
27+
gpu_id = items[1]
28+
i = gpu_id.split("/")[0]
29+
affinity = ""
30+
for j, item in enumerate(items):
31+
if "SYS" not in item and ("XL" in item or "S" in item):
32+
if len(affinity) == 0:
33+
affinity = str(j - 2)
34+
else:
35+
affinity = affinity + "," + str(j - 2)
36+
gpu_dict[i] = affinity
37+
38+
max_affinity = ""
39+
for key, value in gpu_dict.items():
40+
if len(value) > len(max_affinity):
41+
max_affinity = value
42+
43+
os.environ["ZE_AFFINITY_MASK"] = str(max_affinity)
44+
print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK")))
45+
46+
else:
47+
print("xpu-smi topology failed")
48+
sys.exit(255)
1549

1650

1751
# run python test
@@ -24,8 +58,16 @@ def run(test_command):
2458
return result.returncode
2559

2660

61+
os.environ["CCL_SEND"] = "direct"
62+
os.environ["CCL_RECV"] = "direct"
2763
test_command = ["python", "distributed/test_c10d_ops_xccl.py"]
2864
res += run(test_command)
65+
del os.environ["CCL_SEND"]
66+
del os.environ["CCL_RECV"]
67+
test_command = ["python", "../../../../test/distributed/pipelining/test_backward.py"]
68+
res += run(test_command)
69+
test_command = ["python", "../../../../test/distributed/pipelining/test_microbatch.py"]
70+
res += run(test_command)
2971

3072
# run pytest with skiplist
3173
for key in skip_dict:

test/xpu/skip_list_dist.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
"test_ddp_parity_xpu",
1111
),
1212
"../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
13+
"../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None,
1314
"../../../../test/distributed/fsdp/test_fsdp_core.py": (
1415
"test_delayed_optim_step_offload_false_no_shard_xpu",
1516
"test_delayed_optim_step_offload_false_none_xpu",
@@ -102,4 +103,12 @@
102103
# will be brought back after oneCCL is upgraded to 2021.16.1
103104
"test_xccl_barrier",
104105
),
106+
"../../../../test/distributed/fsdp/test_fsdp_misc.py": None,
107+
"../../../../test/distributed/test_functional_api.py": (
108+
# depends on https://github.com/pytorch/pytorch/pull/159473
109+
"test_tracing_with_fakepg_xpu",
110+
),
111+
"../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None,
112+
"../../../../test/distributed/_tools/test_mem_tracker.py": None,
113+
"../../../../test/distributed/_tools/test_memory_tracker.py": None,
105114
}

0 commit comments

Comments
 (0)