
Commit 809331e

Add ported distributed cases
1 parent d8c3eef commit 809331e

File tree

5 files changed: +64 -68 lines


.github/actions/linux-uttest/action.yml

Lines changed: 5 additions & 1 deletion
@@ -155,7 +155,7 @@ runs:
         tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log

     - name: xpu_distributed
-      shell: timeout 3600 bash -xeu -o pipefail {0}
+      shell: timeout 36000 bash -xeu -o pipefail {0}
       if: ${{ inputs.ut_name == 'xpu_distributed' }}
       run: |
         xpu-smi topology -m
@@ -166,9 +166,13 @@ runs:
           echo -e "[ERROR] XCCL is not enabled"
           exit 1
         fi
+        export CCL_ROOT=$(dirname $(which python))/../
+        export PATH="${CCL_ROOT}/bin/libfabric:${PATH}"
+        export LD_LIBRARY_PATH="${CCL_ROOT}/lib:${LD_LIBRARY_PATH}"
         python run_distributed.py \
         2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \
         tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
+        cp *.xml ${{ github.workspace }}/ut_log

     # Summary
     - name: UT Test Results Summary
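Note on the two changes above: timeout 36000 raises the shell-level wall clock from one hour to ten for the larger distributed suite, and the three new exports assume oneCCL lives inside the active Python environment (for example, installed via pip), so its lib/ directory and bundled libfabric sit one level above the interpreter. A minimal Python sketch that mirrors the same path resolution, handy for reproducing the CI environment locally; the directory layout is an assumption read off the exports:

    # Sketch: replicate the CI exports and sanity-check the assumed layout.
    import os
    import sys

    # oneCCL root assumed one level above the interpreter's directory,
    # matching `export CCL_ROOT=$(dirname $(which python))/../` above.
    ccl_root = os.path.abspath(os.path.join(os.path.dirname(sys.executable), os.pardir))
    fabric_bin = os.path.join(ccl_root, "bin", "libfabric")
    ccl_lib = os.path.join(ccl_root, "lib")

    for path in (fabric_bin, ccl_lib):
        if not os.path.isdir(path):
            raise SystemExit(f"expected oneCCL directory is missing: {path}")

    os.environ["CCL_ROOT"] = ccl_root
    os.environ["PATH"] = fabric_bin + os.pathsep + os.environ.get("PATH", "")
    os.environ["LD_LIBRARY_PATH"] = ccl_lib + os.pathsep + os.environ.get("LD_LIBRARY_PATH", "")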

.github/scripts/ut_result_check.sh

Lines changed: 2 additions & 1 deletion
@@ -198,7 +198,8 @@ if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1'
 fi

 if [[ "${ut_suite}" == 'xpu_distributed' ]]; then
-    grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
+    grep -E "^FAILED" xpu_distributed_test.log | awk '{print $3}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
+    sed -i '/^[^.]\+/d' ./"${ut_suite}"_xpu_distributed_test_failed.log
     grep "PASSED" xpu_distributed_test.log | awk '{print $1}' > ./"${ut_suite}"_xpu_distributed_test_passed.log
     echo -e "========================================================================="
     echo -e "Show Failed cases in ${ut_suite} xpu distributed"

.github/workflows/_linux_ut.yml

Lines changed: 2 additions & 1 deletion
@@ -99,11 +99,12 @@ jobs:

   test-in-baremetal:
     needs: runner
+    timeout-minutes: 600
     if: ${{ contains(inputs.ut, 'distributed') }}
     runs-on: ${{ needs.runner.outputs.runner_id }}
     env:
       AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
-      PYTEST_ADDOPTS: -v --timeout 600 --timeout_method=thread -n 1
+      PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1
     steps:
       - name: Checkout torch-xpu-ops
         uses: actions/checkout@v4
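The new timeout-minutes: 600 job ceiling matches the action's shell wrapper exactly (timeout 36000 seconds is 600 minutes), while PYTEST_ADDOPTS raises the per-test limit from 600 to 3600 seconds; the --timeout/--timeout_method options come from the pytest-timeout plugin and -n 1 from pytest-xdist. PYTEST_ADDOPTS is pytest's standard environment hook for appending options to every invocation, so a single file can be rerun locally under the same settings, as in this sketch (plugin availability is assumed):

    # Sketch: rerun one distributed test file under the workflow's options.
    # Requires pytest plus the pytest-timeout and pytest-xdist plugins.
    import os
    import subprocess

    env = dict(os.environ)
    env["PYTEST_ADDOPTS"] = "-v --timeout 3600 --timeout_method=thread -n 1"

    # pytest appends PYTEST_ADDOPTS to its own command line, equivalent to
    # `pytest -v --timeout 3600 --timeout_method=thread -n 1 <file>`.
    subprocess.run(["pytest", "distributed/test_c10d_xccl.py"], env=env, check=False)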

test/xpu/run_distributed.py

Lines changed: 42 additions & 8 deletions
@@ -9,9 +9,41 @@
 res2 = 0
 fail_test = []

-# libfabric WA to avoid hang issue
-os.environ["FI_PROVIDER"] = "tcp"
-# os.environ["ZE_AFFINITY_MASK"] = "0,1,2,3"
+# Get the xelink group card affinity
+ret = os.system("xpu-smi topology -m 2>&1|tee topology.log")
+if ret == 0:
+    gpu_dict = {}
+    with open("topology.log") as file:
+        lines = file.readlines()
+        for line in lines:
+            if "CPU Affinity" in line:
+                continue
+            line = line.strip()
+            if line.startswith("GPU "):
+                items = line.split(" ")
+                items = [x for x in items if x]
+                gpu_id = items[1]
+                i = gpu_id.split("/")[0]
+                affinity = ""
+                for j, item in enumerate(items):
+                    if "SYS" not in item and ("XL" in item or "S" in item):
+                        if len(affinity) == 0:
+                            affinity = str(j - 2)
+                        else:
+                            affinity = affinity + "," + str(j - 2)
+                gpu_dict[i] = affinity
+
+    max_affinity = ""
+    for key, value in gpu_dict.items():
+        if len(value) > len(max_affinity):
+            max_affinity = value
+
+    os.environ["ZE_AFFINITY_MASK"] = str(max_affinity)
+    print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK")))
+
+else:
+    print("xpu-smi topology failed")
+    sys.exit(255)


 # run python test
@@ -26,6 +58,12 @@ def run(test_command):

 test_command = ["python", "distributed/test_c10d_ops_xccl.py"]
 res += run(test_command)
+test_command = ["python", "distributed/test_c10d_xccl.py"]
+res += run(test_command)
+test_command = ["python", "../../../../test/distributed/pipelining/test_backward.py"]
+res += run(test_command)
+test_command = ["python", "../../../../test/distributed/pipelining/test_microbatch.py"]
+res += run(test_command)

 # run pytest with skiplist
 for key in skip_dict:
@@ -38,8 +76,4 @@ def run(test_command):
 if fail_test:
     print(",".join(fail_test) + " have failures")

-exit_code = os.WEXITSTATUS(res2)
-if exit_code == 0:
-    sys.exit(res)
-else:
-    sys.exit(exit_code)
+sys.exit(res)
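Two notes on run_distributed.py. The deleted epilogue routed the exit status through os.WEXITSTATUS(res2), yet in the hunks shown here res2 never changes after its initialization to 0, so the branch always reduced to sys.exit(res); exiting with the accumulated res states that directly. More substantially, the new preamble replaces the FI_PROVIDER=tcp workaround with XeLink-aware card selection: it parses the matrix printed by xpu-smi topology -m, records for each card which peers it reaches over XeLink ("XL") or itself ("S") while excluding "SYS" (system/PCIe) hops, and pins the run to the largest such group via ZE_AFFINITY_MASK. A self-contained sketch of the same parsing against a fabricated four-card matrix; the layout is an assumption modeled on xpu-smi output, and column j maps to card j-2 because a row's first two fields are the "GPU" label and the card id:

    # Sketch: the commit's affinity parsing applied to a fabricated matrix.
    # "S" marks a card's own cell, "XL" an XeLink peer, "SYS" a system/PCIe
    # hop that breaks the group; layout assumed from xpu-smi topology -m.
    SAMPLE = """\
             GPU 0/0  GPU 1/0  GPU 2/0  GPU 3/0  CPU Affinity
    GPU 0/0  S        XL       SYS      SYS      0-23
    GPU 1/0  XL       S        SYS      SYS      0-23
    GPU 2/0  SYS      SYS      S        XL       24-47
    GPU 3/0  SYS      SYS      XL       S        24-47
    """

    def largest_xelink_group(matrix: str) -> str:
        gpu_dict = {}
        for line in matrix.splitlines():
            if "CPU Affinity" in line:  # header row
                continue
            line = line.strip()
            if not line.startswith("GPU "):
                continue
            items = [x for x in line.split(" ") if x]
            gpu = items[1].split("/")[0]
            # Fields 0 and 1 are the "GPU" label and the card id, so the
            # link column at index j describes GPU j-2.
            peers = [str(j - 2) for j, item in enumerate(items)
                     if "SYS" not in item and ("XL" in item or "S" in item)]
            gpu_dict[gpu] = ",".join(peers)
        # Keep the group with the most members, as the commit does.
        return max(gpu_dict.values(), key=len, default="")

    print("ZE_AFFINITY_MASK=" + largest_xelink_group(SAMPLE))  # -> 0,1

On this sample the two XeLink pairs tie at two members apiece, so the first group wins; on a real machine the largest group is the set of cards the distributed cases can span without crossing a SYS link.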

test/xpu/skip_list_dist.py

Lines changed: 13 additions & 57 deletions
@@ -1,57 +1,12 @@
 skip_dict = {
-    "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": (
-        "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False",
-        "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False",
-        "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False",
-        "test_checkpoint_submodule_use_reentrant_False_xpu",
-    ),
+    "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_apply.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": (
         "test_ddp_parity_xpu",
     ),
     "../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
-    "../../../../test/distributed/fsdp/test_fsdp_core.py": (
-        "test_delayed_optim_step_offload_false_no_shard_xpu",
-        "test_delayed_optim_step_offload_false_none_xpu",
-        "test_delayed_optim_step_offload_false_shard_grad_op_xpu",
-        "test_delayed_optim_step_offload_true_none_xpu",
-        "test_delayed_optim_step_offload_true_shard_grad_op_xpu",
-        "test_delayed_reduce_scatter_offload_false_no_shard_xpu",
-        "test_delayed_reduce_scatter_offload_false_none_xpu",
-        "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu",
-        "test_delayed_reduce_scatter_offload_true_none_xpu",
-        "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu",
-        "test_mixture_of_experts_offload_false_no_shard_xpu",
-        "test_mixture_of_experts_offload_false_none_xpu",
-        "test_mixture_of_experts_offload_false_shard_grad_op_xpu",
-        "test_mixture_of_experts_offload_true_no_shard_xpu",
-        "test_mixture_of_experts_offload_true_none_xpu",
-        "test_mixture_of_experts_offload_true_shard_grad_op_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_true_no_shard_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu",
-        "test_nested_always_wrap_model_offload_false_no_shard_xpu",
-        "test_nested_always_wrap_model_offload_false_none_xpu",
-        "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu",
-        "test_nested_always_wrap_model_offload_true_none_xpu",
-        "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu",
-        "test_nested_wrapped_model_offload_false_no_shard_xpu",
-        "test_nested_wrapped_model_offload_false_none_xpu",
-        "test_nested_wrapped_model_offload_false_shard_grad_op_xpu",
-        "test_nested_wrapped_model_offload_true_none_xpu",
-        "test_nested_wrapped_model_offload_true_shard_grad_op_xpu",
-        "test_transformer_offload_false_no_shard_xpu",
-        "test_transformer_offload_false_none_xpu",
-        "test_transformer_offload_false_shard_grad_op_xpu",
-        "test_transformer_offload_true_none_xpu",
-        "test_transformer_offload_true_shard_grad_op_xpu",
-        # https://github.com/intel/torch-xpu-ops/issues/1475
-        "test_transformer_no_grad_mixed_precision_True_xpu",
-        "test_transformer_no_grad_mixed_precision_False_xpu",
-    ),
+    "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_core.py": None,
     # Will add them back after debugging
     # "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": (
     #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu",
@@ -70,17 +25,13 @@
     #     "test_fsdp_init_with_device_mesh_is_even_sharded_model_True_xpu",
     #     "test_raises_warning_or_errors_xpu",
     # ),
-    "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": (
-        "test_invalid_first_iter_order_sharding_strategy1_xpu",
-        "test_train_eval_sharding_strategy1_xpu",
-    ),
+    "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": (
         "test_parity_with_non_frozen_fsdp_xpu",
         "test_parity_with_ddp_xpu",
     ),
     "../../../../test/distributed/fsdp/test_fsdp_fx.py": None,
-    # will bring back after oneccl upgrade to 2021.16.1
-    # "../../../../test/distributed/fsdp/test_fsdp_input.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_input.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": (
         "test_transformer_no_grad_mixed_precision_True_xpu",
@@ -98,8 +49,13 @@
     #     "test_root_module_is_not_FSDP_xpu",
     # ),
     "../../../../test/distributed/fsdp/test_utils.py": None,
-    "distributed/test_c10d_xccl.py": (
-        # will bring back after oneccl upgrade to 2021.16.1
-        "test_xccl_barrier",
+    "distributed/test_c10d_xccl.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_misc.py": None,
+    "../../../../test/distributed/test_functional_api.py": (
+        # depends on https://github.com/pytorch/pytorch/pull/159473
+        "test_tracing_with_fakepg_xpu",
     ),
+    "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None,
+    "../../../../test/distributed/_tools/test_mem_tracker.py": None,
+    "../../../../test/distributed/_tools/test_memory_tracker.py": None,
 }
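skip_dict maps each test file to either None (run the whole file) or a tuple of case names to skip. The re-enabled entries (test_fsdp_input.py, test_xccl_barrier) line up with the removed "will bring back after oneccl upgrade to 2021.16.1" comments, and the long test_fsdp_core.py and test_fsdp_checkpoint.py skip lists collapse to None, presumably because those cases now pass. The loop body that consumes the map is not part of this diff; one plausible consumption pattern, sketched with pytest's single -k keyword filter (the helper and invocation are illustrative, not the runner's actual code):

    # Sketch: drive a skip_dict-shaped map with pytest.
    import subprocess

    def run_with_skips(test_file, skipped):
        cmd = ["python", "-m", "pytest", "-v", test_file]
        if skipped:
            # Fold every skipped case into one -k expression; pytest
            # accepts only a single keyword filter per run.
            cmd += ["-k", " and ".join("not " + case for case in skipped)]
        return subprocess.call(cmd)

    for test_file, skipped in skip_dict.items():
        run_with_skips(test_file, skipped)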
