Commit bf75fbe (parent: 509032d)

Add ported distributed cases

4 files changed: +61 −61 lines

.github/actions/linux-uttest/action.yml (4 additions, 0 deletions)

```diff
@@ -166,9 +166,13 @@ runs:
           echo -e "[ERROR] XCCL is not enabled"
           exit 1
         fi
+        export CCL_ROOT=$(dirname $(which python))/../
+        export PATH="${CCL_ROOT}/bin/libfabric:${PATH}"
+        export LD_LIBRARY_PATH="${CCL_ROOT}/lib:${LD_LIBRARY_PATH}"
         python run_distributed.py \
           2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \
           tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
+        cp *.xml ${{ github.workspace }}/ut_log

     # Summary
     - name: UT Test Results Summary
```
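The three new exports point oneCCL and libfabric at the runtime bundled inside the active Python environment rather than a system install. Below is a minimal Python sketch of the same lookup, assuming a pip/conda-style layout where oneCCL lives under the interpreter's prefix, as the `CCL_ROOT` derivation above implies:

```python
# Sketch: derive CCL_ROOT from the active interpreter, mirroring
# `export CCL_ROOT=$(dirname $(which python))/../` above. The layout
# (oneCCL bundled under the Python prefix) is an assumption.
import os
import sys

ccl_root = os.path.abspath(os.path.join(os.path.dirname(sys.executable), ".."))
os.environ["PATH"] = (
    os.path.join(ccl_root, "bin", "libfabric") + os.pathsep + os.environ["PATH"]
)
os.environ["LD_LIBRARY_PATH"] = (
    os.path.join(ccl_root, "lib") + os.pathsep + os.environ.get("LD_LIBRARY_PATH", "")
)
print(f"CCL_ROOT={ccl_root}")
```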

.github/scripts/ut_result_check.sh (3 additions, 1 deletion)

```diff
@@ -218,7 +218,9 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then
     fi
 fi
 if [[ "${ut_suite}" == 'xpu_distributed' ]]; then
-    grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
+    grep -E "^FAILED" xpu_distributed_test.log | awk '{print $3}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
+    grep -E "have failures" xpu_distributed_test.log | awk '{print $1}' >> ./"${ut_suite}"_xpu_distributed_test_failed.log
+    sed -i '/^[^.]\+/d' ./"${ut_suite}"_xpu_distributed_test_failed.log
     grep "PASSED" xpu_distributed_test.log | awk '{print $1}' > ./"${ut_suite}"_xpu_distributed_test_passed.log
     echo -e "========================================================================="
     echo -e "Show Failed cases in ${ut_suite} xpu distributed"
```

test/xpu/run_distributed.py (41 additions, 3 deletions)

```diff
@@ -9,9 +9,41 @@
 res2 = 0
 fail_test = []
 
-# libfabric WA to avoid hang issue
-os.environ["FI_PROVIDER"] = "tcp"
-# os.environ["ZE_AFFINITY_MASK"] = "0,1,2,3"
+# Get the xelink group card affinity
+ret = os.system("xpu-smi topology -m 2>&1|tee topology.log")
+if ret == 0:
+    gpu_dict = {}
+    with open("topology.log") as file:
+        lines = file.readlines()
+        for line in lines:
+            if "CPU Affinity" in line:
+                continue
+            line = line.strip()
+            if line.startswith("GPU "):
+                items = line.split(" ")
+                items = [x for x in items if x]
+                gpu_id = items[1]
+                i = gpu_id.split("/")[0]
+                affinity = ""
+                for j, item in enumerate(items):
+                    if "SYS" not in item and ("XL" in item or "S" in item):
+                        if len(affinity) == 0:
+                            affinity = str(j - 2)
+                        else:
+                            affinity = affinity + "," + str(j - 2)
+                gpu_dict[i] = affinity
+
+    max_affinity = ""
+    for key, value in gpu_dict.items():
+        if len(value) > len(max_affinity):
+            max_affinity = value
+
+    os.environ["ZE_AFFINITY_MASK"] = str(max_affinity)
+    print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK")))
+
+else:
+    print("xpu-smi topology failed")
+    sys.exit(255)
 
 
 # run python test
```
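The new preamble replaces the `FI_PROVIDER=tcp` libfabric workaround with an affinity mask computed from `xpu-smi topology -m`: for each `GPU i/...` row it records the column indexes whose cell is an XeLink (`XL`) or self (`S`) link rather than `SYS`, then exports the largest such group as `ZE_AFFINITY_MASK`. A standalone walkthrough of that parsing on a fabricated two-group matrix (the real matrix comes from `xpu-smi`; the sample rows and the absence of a header line here are assumptions for illustration only):

```python
# Same column logic as the committed code, applied to a made-up matrix.
sample = """\
GPU 0/0  S    XL   SYS  SYS
GPU 1/0  XL   S    SYS  SYS
GPU 2/0  SYS  SYS  S    XL
GPU 3/0  SYS  SYS  XL   S
"""

gpu_dict = {}
for line in sample.splitlines():
    if not line.startswith("GPU "):
        continue
    items = [x for x in line.split(" ") if x]
    gpu_id = items[1].split("/")[0]
    # Columns 2.. hold link types; j - 2 recovers the peer card index.
    affinity = ",".join(
        str(j - 2)
        for j, item in enumerate(items)
        if "SYS" not in item and ("XL" in item or "S" in item)
    )
    gpu_dict[gpu_id] = affinity

# Keep the largest XeLink group, as run_distributed.py does.
mask = max(gpu_dict.values(), key=len)
print("ZE_AFFINITY_MASK=" + mask)  # -> "0,1" (first of the equally sized groups)
```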
```diff
@@ -26,6 +58,12 @@ def run(test_command):
 
 test_command = ["python", "distributed/test_c10d_ops_xccl.py"]
 res += run(test_command)
+test_command = ["python", "distributed/test_c10d_xccl.py"]
+res += run(test_command)
+test_command = ["python", "../../../../test/distributed/pipelining/test_backward.py"]
+res += run(test_command)
+test_command = ["python", "../../../../test/distributed/pipelining/test_microbatch.py"]
+res += run(test_command)
 
 # run pytest with skiplist
 for key in skip_dict:
```
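The hunk context references a `run()` helper whose body is not part of this diff. Judging only from the call sites (`res += run(test_command)` with a command list, and the `fail_test` accumulator above), something along these lines is presumably behind it; this is a hedged sketch, not the actual implementation:

```python
# Hedged sketch of the run() helper referenced above; the real body is not
# shown in this diff. Assumed behavior: spawn the test script, inherit its
# stdout/stderr, and fold a nonzero exit status into the running total.
import subprocess


def run(test_command):
    result = subprocess.run(test_command)
    return 0 if result.returncode == 0 else 1
```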

test/xpu/skip_list_dist.py (13 additions, 57 deletions)

```diff
@@ -1,57 +1,12 @@
 skip_dict = {
-    "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": (
-        "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False",
-        "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False",
-        "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False",
-        "test_checkpoint_submodule_use_reentrant_False_xpu",
-    ),
+    "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_apply.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": (
         "test_ddp_parity_xpu",
     ),
     "../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
-    "../../../../test/distributed/fsdp/test_fsdp_core.py": (
-        "test_delayed_optim_step_offload_false_no_shard_xpu",
-        "test_delayed_optim_step_offload_false_none_xpu",
-        "test_delayed_optim_step_offload_false_shard_grad_op_xpu",
-        "test_delayed_optim_step_offload_true_none_xpu",
-        "test_delayed_optim_step_offload_true_shard_grad_op_xpu",
-        "test_delayed_reduce_scatter_offload_false_no_shard_xpu",
-        "test_delayed_reduce_scatter_offload_false_none_xpu",
-        "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu",
-        "test_delayed_reduce_scatter_offload_true_none_xpu",
-        "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu",
-        "test_mixture_of_experts_offload_false_no_shard_xpu",
-        "test_mixture_of_experts_offload_false_none_xpu",
-        "test_mixture_of_experts_offload_false_shard_grad_op_xpu",
-        "test_mixture_of_experts_offload_true_no_shard_xpu",
-        "test_mixture_of_experts_offload_true_none_xpu",
-        "test_mixture_of_experts_offload_true_shard_grad_op_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_true_no_shard_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu",
-        "test_nested_always_wrap_model_offload_false_no_shard_xpu",
-        "test_nested_always_wrap_model_offload_false_none_xpu",
-        "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu",
-        "test_nested_always_wrap_model_offload_true_none_xpu",
-        "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu",
-        "test_nested_wrapped_model_offload_false_no_shard_xpu",
-        "test_nested_wrapped_model_offload_false_none_xpu",
-        "test_nested_wrapped_model_offload_false_shard_grad_op_xpu",
-        "test_nested_wrapped_model_offload_true_none_xpu",
-        "test_nested_wrapped_model_offload_true_shard_grad_op_xpu",
-        "test_transformer_offload_false_no_shard_xpu",
-        "test_transformer_offload_false_none_xpu",
-        "test_transformer_offload_false_shard_grad_op_xpu",
-        "test_transformer_offload_true_none_xpu",
-        "test_transformer_offload_true_shard_grad_op_xpu",
-        # https://github.com/intel/torch-xpu-ops/issues/1475
-        "test_transformer_no_grad_mixed_precision_True_xpu",
-        "test_transformer_no_grad_mixed_precision_False_xpu",
-    ),
+    "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_core.py": None,
     # Will add them back after debugging
     # "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": (
     #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu",
@@ -70,17 +25,13 @@
     #     "test_fsdp_init_with_device_mesh_is_even_sharded_model_True_xpu",
     #     "test_raises_warning_or_errors_xpu",
     # ),
-    "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": (
-        "test_invalid_first_iter_order_sharding_strategy1_xpu",
-        "test_train_eval_sharding_strategy1_xpu",
-    ),
+    "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": (
         "test_parity_with_non_frozen_fsdp_xpu",
         "test_parity_with_ddp_xpu",
     ),
     "../../../../test/distributed/fsdp/test_fsdp_fx.py": None,
-    # will bring back after oneccl upgrade to 2021.16.1
-    # "../../../../test/distributed/fsdp/test_fsdp_input.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_input.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": (
         "test_transformer_no_grad_mixed_precision_True_xpu",
@@ -98,8 +49,13 @@
     #     "test_root_module_is_not_FSDP_xpu",
     # ),
     "../../../../test/distributed/fsdp/test_utils.py": None,
-    "distributed/test_c10d_xccl.py": (
-        # will bring back after oneccl upgrade to 2021.16.1
-        "test_xccl_barrier",
+    "distributed/test_c10d_xccl.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_misc.py": None,
+    "../../../../test/distributed/test_functional_api.py": (
+        # depends on https://github.com/pytorch/pytorch/pull/159473
+        "test_tracing_with_fakepg_xpu",
     ),
+    "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None,
+    "../../../../test/distributed/_tools/test_mem_tracker.py": None,
+    "../../../../test/distributed/_tools/test_memory_tracker.py": None,
 }
```
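`skip_dict` maps each test file to either `None` (run the whole file) or a tuple of case names to skip, and run_distributed.py's "run pytest with skiplist" loop consumes it. A minimal sketch of how such a mapping can be turned into a pytest invocation; the `-k "not ... and not ..."` deselection shown here is an assumption, since the loop body is not part of this diff:

```python
# Hedged sketch: build a pytest command from one skip_dict entry. The real
# loop in run_distributed.py is not shown in this commit.
def build_command(test_file, skip_cases):
    command = ["python", "-m", "pytest", "-v", test_file]
    if skip_cases:  # None means no skips: run everything in the file
        command += ["-k", " and ".join(f"not {case}" for case in skip_cases)]
    return command


print(build_command(
    "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py",
    ("test_parity_with_non_frozen_fsdp_xpu", "test_parity_with_ddp_xpu"),
))
```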
