Commit df1d7ad

[CI] Add ported distributed cases (#1945)

This PR intends to add some ported distributed cases to the torch-xpu-ops CI.
- Add ZE_AFFINITY_MASK to ensure Xelink is used.
- Add CCL_ROOT for Xelink; this workaround can be removed after oneCCL is upgraded to 2021.16.2.
- Increase the distributed test time limit; after adding the ported cases, the test part currently needs about 1 hour.

disable_e2e disable_ut

1 parent 4d38b5e commit df1d7ad

File tree

5 files changed: +65 -96 lines changed

.github/actions/linux-uttest/action.yml

Lines changed: 5 additions & 1 deletion
@@ -155,7 +155,7 @@ runs:
         tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log

     - name: xpu_distributed
-      shell: timeout 3600 bash -xeu -o pipefail {0}
+      shell: timeout 36000 bash -xeu -o pipefail {0}
       if: ${{ inputs.ut_name == 'xpu_distributed' }}
       run: |
         xpu-smi topology -m
@@ -166,9 +166,13 @@ runs:
           echo -e "[ERROR] XCCL is not enabled"
           exit 1
         fi
+        export CCL_ROOT=$(dirname $(which python))/../
+        export PATH="${CCL_ROOT}/bin/libfabric:${PATH}"
+        export LD_LIBRARY_PATH="${CCL_ROOT}/lib:${LD_LIBRARY_PATH}"
         python run_distributed.py \
         2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \
         tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
+        find ../ -type f -name "*.xml" -exec cp {} ${{ github.workspace }}/ut_log/ \;

     # Summary
     - name: UT Test Results Summary
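The CCL_ROOT export above points at the prefix of the active Python environment (one level above the interpreter's bin directory), so the oneCCL runtime and bundled libfabric shipped with the Python packages are picked up. A minimal Python sketch of the same path derivation, useful for checking locally what the step would export (the helper name and printout are illustrative, not part of the PR):

import os
import shutil

def ccl_env_from_python() -> dict:
    """Derive the CCL_ROOT/PATH/LD_LIBRARY_PATH values the CI step exports."""
    python_bin = shutil.which("python")  # e.g. <env>/bin/python
    assert python_bin is not None, "python not found on PATH"
    ccl_root = os.path.abspath(os.path.join(os.path.dirname(python_bin), ".."))
    return {
        "CCL_ROOT": ccl_root,
        "PATH": f"{ccl_root}/bin/libfabric:{os.environ.get('PATH', '')}",
        "LD_LIBRARY_PATH": f"{ccl_root}/lib:{os.environ.get('LD_LIBRARY_PATH', '')}",
    }

if __name__ == "__main__":
    for key, value in ccl_env_from_python().items():
        print(f"export {key}={value}")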

.github/scripts/ut_result_check.sh

Lines changed: 2 additions & 1 deletion
@@ -198,7 +198,8 @@ if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1'
 fi

 if [[ "${ut_suite}" == 'xpu_distributed' ]]; then
-    grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
+    grep -E "^FAILED" xpu_distributed_test.log | awk '{print $3}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
+    sed -i '/^[^.]\+/d' ./"${ut_suite}"_xpu_distributed_test_failed.log
     grep "PASSED" xpu_distributed_test.log | awk '{print $1}' > ./"${ut_suite}"_xpu_distributed_test_passed.log
     echo -e "========================================================================="
     echo -e "Show Failed cases in ${ut_suite} xpu distributed"

.github/workflows/_linux_ut.yml

Lines changed: 2 additions & 1 deletion
@@ -99,11 +99,12 @@ jobs:

   test-in-baremetal:
     needs: runner
+    timeout-minutes: 600
     if: ${{ contains(inputs.ut, 'distributed') }}
     runs-on: ${{ needs.runner.outputs.runner_id }}
     env:
       AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
-      PYTEST_ADDOPTS: -v --timeout 600 --timeout_method=thread -n 1
+      PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1
     steps:
       - name: Checkout torch-xpu-ops
         uses: actions/checkout@v4
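PYTEST_ADDOPTS is read by pytest as extra command-line options, so the per-test timeout (from the pytest-timeout plugin) rises from 600 s to 3600 s here, while the whole job is capped at 600 minutes. A small sketch of how the same options could be passed when reproducing a run locally (the test file chosen is just an example from this PR, and the wrapper below is not part of the workflow):

import os
import subprocess

env = dict(os.environ)
# Same per-test timeout settings the CI job injects via PYTEST_ADDOPTS.
env["PYTEST_ADDOPTS"] = "-v --timeout 3600 --timeout_method=thread -n 1"

# Example invocation; any distributed test file from this PR could be substituted.
subprocess.run(["python", "-m", "pytest", "distributed/test_c10d_ops_xccl.py"], env=env)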

test/xpu/run_distributed.py

Lines changed: 40 additions & 8 deletions
@@ -9,9 +9,41 @@
 res2 = 0
 fail_test = []

-# libfabric WA to avoid hang issue
-os.environ["FI_PROVIDER"] = "tcp"
-# os.environ["ZE_AFFINITY_MASK"] = "0,1,2,3"
+# Get the xelink group card affinity
+ret = os.system("xpu-smi topology -m 2>&1|tee topology.log")
+if ret == 0:
+    gpu_dict = {}
+    with open("topology.log") as file:
+        lines = file.readlines()
+        for line in lines:
+            if "CPU Affinity" in line:
+                continue
+            line = line.strip()
+            if line.startswith("GPU "):
+                items = line.split(" ")
+                items = [x for x in items if x]
+                gpu_id = items[1]
+                i = gpu_id.split("/")[0]
+                affinity = ""
+                for j, item in enumerate(items):
+                    if "SYS" not in item and ("XL" in item or "S" in item):
+                        if len(affinity) == 0:
+                            affinity = str(j - 2)
+                        else:
+                            affinity = affinity + "," + str(j - 2)
+                gpu_dict[i] = affinity
+
+    max_affinity = ""
+    for key, value in gpu_dict.items():
+        if len(value) > len(max_affinity):
+            max_affinity = value
+
+    os.environ["ZE_AFFINITY_MASK"] = str(max_affinity)
+    print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK")))
+
+else:
+    print("xpu-smi topology failed")
+    sys.exit(255)


 # run python test
@@ -26,6 +58,10 @@ def run(test_command):

 test_command = ["python", "distributed/test_c10d_ops_xccl.py"]
 res += run(test_command)
+test_command = ["python", "../../../../test/distributed/pipelining/test_backward.py"]
+res += run(test_command)
+test_command = ["python", "../../../../test/distributed/pipelining/test_microbatch.py"]
+res += run(test_command)

 # run pytest with skiplist
 for key in skip_dict:
@@ -38,8 +74,4 @@ def run(test_command):
 if fail_test:
     print(",".join(fail_test) + " have failures")

-exit_code = os.WEXITSTATUS(res2)
-if exit_code == 0:
-    sys.exit(res)
-else:
-    sys.exit(exit_code)
+sys.exit(res)
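The new block above derives ZE_AFFINITY_MASK from the xpu-smi topology matrix: for every row that starts with "GPU ", it records the column indices whose cell shows a Xelink ("XL") or self ("S") link rather than "SYS", then exports the largest such group. A self-contained sketch of the same parsing on a made-up matrix (the topology text below is invented purely to show the shape the parser expects, not real xpu-smi output):

def xelink_affinity(topology_text: str) -> str:
    """Return the largest group of card indices connected by XL/S links (mirrors the PR's parsing)."""
    gpu_dict = {}
    for line in topology_text.splitlines():
        line = line.strip()
        if "CPU Affinity" in line or not line.startswith("GPU "):
            continue
        items = [x for x in line.split(" ") if x]
        gpu_id = items[1].split("/")[0]
        # Columns from index 2 onward are the per-card link types.
        affinity = [str(j - 2) for j, item in enumerate(items)
                    if "SYS" not in item and ("XL" in item or "S" in item)]
        gpu_dict[gpu_id] = ",".join(affinity)
    # Pick the longest affinity string, as run_distributed.py does.
    return max(gpu_dict.values(), key=len, default="")

# Invented example: cards 0 and 1 share Xelink, cards 2 and 3 only reach the rest via SYS.
sample = """\
GPU 0/0 S XL SYS SYS
GPU 1/1 XL S SYS SYS
GPU 2/2 SYS SYS S SYS
GPU 3/3 SYS SYS SYS S
"""
print(xelink_affinity(sample))  # -> "0,1"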

test/xpu/skip_list_dist.py

Lines changed: 16 additions & 85 deletions
@@ -1,105 +1,36 @@
 skip_dict = {
-    "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": (
-        "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False",
-        "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False",
-        "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False",
-        "test_checkpoint_submodule_use_reentrant_False_xpu",
-    ),
+    "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_apply.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": (
         "test_ddp_parity_xpu",
     ),
     "../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
-    "../../../../test/distributed/fsdp/test_fsdp_core.py": (
-        "test_delayed_optim_step_offload_false_no_shard_xpu",
-        "test_delayed_optim_step_offload_false_none_xpu",
-        "test_delayed_optim_step_offload_false_shard_grad_op_xpu",
-        "test_delayed_optim_step_offload_true_none_xpu",
-        "test_delayed_optim_step_offload_true_shard_grad_op_xpu",
-        "test_delayed_reduce_scatter_offload_false_no_shard_xpu",
-        "test_delayed_reduce_scatter_offload_false_none_xpu",
-        "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu",
-        "test_delayed_reduce_scatter_offload_true_none_xpu",
-        "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu",
-        "test_mixture_of_experts_offload_false_no_shard_xpu",
-        "test_mixture_of_experts_offload_false_none_xpu",
-        "test_mixture_of_experts_offload_false_shard_grad_op_xpu",
-        "test_mixture_of_experts_offload_true_no_shard_xpu",
-        "test_mixture_of_experts_offload_true_none_xpu",
-        "test_mixture_of_experts_offload_true_shard_grad_op_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_true_no_shard_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu",
-        "test_nested_always_wrap_model_offload_false_no_shard_xpu",
-        "test_nested_always_wrap_model_offload_false_none_xpu",
-        "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu",
-        "test_nested_always_wrap_model_offload_true_none_xpu",
-        "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu",
-        "test_nested_wrapped_model_offload_false_no_shard_xpu",
-        "test_nested_wrapped_model_offload_false_none_xpu",
-        "test_nested_wrapped_model_offload_false_shard_grad_op_xpu",
-        "test_nested_wrapped_model_offload_true_none_xpu",
-        "test_nested_wrapped_model_offload_true_shard_grad_op_xpu",
-        "test_transformer_offload_false_no_shard_xpu",
-        "test_transformer_offload_false_none_xpu",
-        "test_transformer_offload_false_shard_grad_op_xpu",
-        "test_transformer_offload_true_none_xpu",
-        "test_transformer_offload_true_shard_grad_op_xpu",
-        # https://github.com/intel/torch-xpu-ops/issues/1475
-        "test_transformer_no_grad_mixed_precision_True_xpu",
-        "test_transformer_no_grad_mixed_precision_False_xpu",
-    ),
-    # Will add them back after debugging
-    # "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": (
-    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu",
-    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu",
-    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu",
-    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu",
-    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu",
-    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu",
-    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu",
-    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu",
-    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_False_xpu",
-    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_True_xpu",
-    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu",
-    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_True_xpu",
-    #     "test_fsdp_init_with_device_mesh_is_even_sharded_model_False_xpu",
-    #     "test_fsdp_init_with_device_mesh_is_even_sharded_model_True_xpu",
-    #     "test_raises_warning_or_errors_xpu",
-    # ),
-    "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": (
-        "test_invalid_first_iter_order_sharding_strategy1_xpu",
-        "test_train_eval_sharding_strategy1_xpu",
-    ),
+    "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_core.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": (
         "test_parity_with_non_frozen_fsdp_xpu",
         "test_parity_with_ddp_xpu",
     ),
     "../../../../test/distributed/fsdp/test_fsdp_fx.py": None,
-    # will bring back after oneccl upgrade to 2021.16.1
-    # "../../../../test/distributed/fsdp/test_fsdp_input.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_input.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": (
         "test_transformer_no_grad_mixed_precision_True_xpu",
     ),
     "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None,
-    # Will add them back after debugging
-    # "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": (
-    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_xpu",
-    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_xpu",
-    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_xpu",
-    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_xpu",
-    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_xpu",
-    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_xpu",
-    #     "test_hsdp_init_with_device_mesh_xpu",
-    #     "test_root_module_is_not_FSDP_xpu",
-    # ),
     "../../../../test/distributed/fsdp/test_utils.py": None,
     "distributed/test_c10d_xccl.py": (
-        # will bring back after oneccl upgrade to 2021.16.1
-        "test_xccl_barrier",
+        # https://github.com/intel/torch-xpu-ops/issues/2046
+        "test_unwaited",
+    ),
+    "distributed/test_c10d_ops_xccl.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_misc.py": None,
+    "../../../../test/distributed/test_functional_api.py": (
+        # depends on https://github.com/pytorch/pytorch/pull/159473
+        "test_tracing_with_fakepg_xpu",
     ),
+    "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None,
+    "../../../../test/distributed/_tools/test_mem_tracker.py": None,
+    "../../../../test/distributed/_tools/test_memory_tracker.py": None,
 }
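In this mapping each key is a test file and the value is either None (run every test in the file) or a tuple of test names to skip. The diff does not show how run_distributed.py consumes the dictionary; one plausible sketch, assuming pytest's --deselect mechanism and a hypothetical helper name, looks like this:

import subprocess

# Hypothetical helper: turn one skip_dict entry into a pytest command.
def build_pytest_command(test_file: str, skip_cases) -> list[str]:
    cmd = ["python", "-m", "pytest", "-v", test_file]
    for case in skip_cases or ():
        # Deselect by node id so the rest of the file still runs.
        cmd += ["--deselect", f"{test_file}::{case}"]
    return cmd

# Example with an entry from the table above.
print(build_pytest_command(
    "distributed/test_c10d_xccl.py",
    ("test_unwaited",),
))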
