Commit df1d7ad

[CI] Add ported distributed cases (#1945)

This PR intends to add some ported distributed cases to the torch-xpu-ops CI.
- Add ZE_AFFINITY_MASK to ensure Xelink is used.
- Add CCL_ROOT for Xelink; this workaround can be removed after oneCCL is upgraded to 2021.16.2.
- Increase the distributed test time limit; after adding the ported cases, the test part currently needs about 1 hour.

disable_e2e disable_ut

1 parent 4d38b5e commit df1d7ad

File tree

5 files changed: +65 -96 lines changed

.github/actions/linux-uttest/action.yml

Lines changed: 5 additions & 1 deletion
@@ -155,7 +155,7 @@ runs:
         tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log

     - name: xpu_distributed
-      shell: timeout 3600 bash -xeu -o pipefail {0}
+      shell: timeout 36000 bash -xeu -o pipefail {0}
       if: ${{ inputs.ut_name == 'xpu_distributed' }}
       run: |
         xpu-smi topology -m
@@ -166,9 +166,13 @@ runs:
           echo -e "[ERROR] XCCL is not enabled"
           exit 1
         fi
+        export CCL_ROOT=$(dirname $(which python))/../
+        export PATH="${CCL_ROOT}/bin/libfabric:${PATH}"
+        export LD_LIBRARY_PATH="${CCL_ROOT}/lib:${LD_LIBRARY_PATH}"
         python run_distributed.py \
         2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \
         tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
+        find ../ -type f -name "*.xml" -exec cp {} ${{ github.workspace }}/ut_log/ \;

     # Summary
     - name: UT Test Results Summary
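The CCL_ROOT export above points at the prefix of the active Python environment (one level above the interpreter's bin directory), so the oneCCL runtime and bundled libfabric shipped with the Python packages are picked up. A minimal Python sketch of the same path derivation, useful for checking locally what the step would export (the helper name and printout are illustrative, not part of the PR):

import os
import shutil

def ccl_env_from_python() -> dict:
    """Derive the CCL_ROOT/PATH/LD_LIBRARY_PATH values the CI step exports."""
    python_bin = shutil.which("python")  # e.g. <env>/bin/python
    assert python_bin is not None, "python not found on PATH"
    ccl_root = os.path.abspath(os.path.join(os.path.dirname(python_bin), ".."))
    return {
        "CCL_ROOT": ccl_root,
        "PATH": f"{ccl_root}/bin/libfabric:{os.environ.get('PATH', '')}",
        "LD_LIBRARY_PATH": f"{ccl_root}/lib:{os.environ.get('LD_LIBRARY_PATH', '')}",
    }

if __name__ == "__main__":
    for key, value in ccl_env_from_python().items():
        print(f"export {key}={value}")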

.github/scripts/ut_result_check.sh

Lines changed: 2 additions & 1 deletion
@@ -198,7 +198,8 @@ if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1'
 fi

 if [[ "${ut_suite}" == 'xpu_distributed' ]]; then
-    grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
+    grep -E "^FAILED" xpu_distributed_test.log | awk '{print $3}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
+    sed -i '/^[^.]\+/d' ./"${ut_suite}"_xpu_distributed_test_failed.log
     grep "PASSED" xpu_distributed_test.log | awk '{print $1}' > ./"${ut_suite}"_xpu_distributed_test_passed.log
     echo -e "========================================================================="
     echo -e "Show Failed cases in ${ut_suite} xpu distributed"

.github/workflows/_linux_ut.yml

Lines changed: 2 additions & 1 deletion
@@ -99,11 +99,12 @@ jobs:

   test-in-baremetal:
     needs: runner
+    timeout-minutes: 600
     if: ${{ contains(inputs.ut, 'distributed') }}
     runs-on: ${{ needs.runner.outputs.runner_id }}
     env:
       AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
-      PYTEST_ADDOPTS: -v --timeout 600 --timeout_method=thread -n 1
+      PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1
     steps:
       - name: Checkout torch-xpu-ops
         uses: actions/checkout@v4
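PYTEST_ADDOPTS is read by pytest as extra command-line options, so the per-test timeout (from the pytest-timeout plugin) rises from 600 s to 3600 s here, while the whole job is capped at 600 minutes. A small sketch of how the same options could be passed when reproducing a run locally (the test file chosen is just an example from this PR, and the wrapper below is not part of the workflow):

import os
import subprocess

env = dict(os.environ)
# Same per-test timeout settings the CI job injects via PYTEST_ADDOPTS.
env["PYTEST_ADDOPTS"] = "-v --timeout 3600 --timeout_method=thread -n 1"

# Example invocation; any distributed test file from this PR could be substituted.
subprocess.run(["python", "-m", "pytest", "distributed/test_c10d_ops_xccl.py"], env=env)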

test/xpu/run_distributed.py

Lines changed: 40 additions & 8 deletions
@@ -9,9 +9,41 @@
 res2 = 0
 fail_test = []

-# libfabric WA to avoid hang issue
-os.environ["FI_PROVIDER"] = "tcp"
-# os.environ["ZE_AFFINITY_MASK"] = "0,1,2,3"
+# Get the xelink group card affinity
+ret = os.system("xpu-smi topology -m 2>&1|tee topology.log")
+if ret == 0:
+    gpu_dict = {}
+    with open("topology.log") as file:
+        lines = file.readlines()
+        for line in lines:
+            if "CPU Affinity" in line:
+                continue
+            line = line.strip()
+            if line.startswith("GPU "):
+                items = line.split(" ")
+                items = [x for x in items if x]
+                gpu_id = items[1]
+                i = gpu_id.split("/")[0]
+                affinity = ""
+                for j, item in enumerate(items):
+                    if "SYS" not in item and ("XL" in item or "S" in item):
+                        if len(affinity) == 0:
+                            affinity = str(j - 2)
+                        else:
+                            affinity = affinity + "," + str(j - 2)
+                gpu_dict[i] = affinity
+
+    max_affinity = ""
+    for key, value in gpu_dict.items():
+        if len(value) > len(max_affinity):
+            max_affinity = value
+
+    os.environ["ZE_AFFINITY_MASK"] = str(max_affinity)
+    print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK")))
+
+else:
+    print("xpu-smi topology failed")
+    sys.exit(255)


 # run python test
@@ -26,6 +58,10 @@ def run(test_command):

 test_command = ["python", "distributed/test_c10d_ops_xccl.py"]
 res += run(test_command)
+test_command = ["python", "../../../../test/distributed/pipelining/test_backward.py"]
+res += run(test_command)
+test_command = ["python", "../../../../test/distributed/pipelining/test_microbatch.py"]
+res += run(test_command)

 # run pytest with skiplist
 for key in skip_dict:
@@ -38,8 +74,4 @@ def run(test_command):
 if fail_test:
     print(",".join(fail_test) + " have failures")

-exit_code = os.WEXITSTATUS(res2)
-if exit_code == 0:
-    sys.exit(res)
-else:
-    sys.exit(exit_code)
+sys.exit(res)
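The new block above derives ZE_AFFINITY_MASK from the xpu-smi topology matrix: for every row that starts with "GPU ", it records the column indices whose cell shows a Xelink ("XL") or self ("S") link rather than "SYS", then exports the largest such group. A self-contained sketch of the same parsing on a made-up matrix (the topology text below is invented purely to show the shape the parser expects, not real xpu-smi output):

def xelink_affinity(topology_text: str) -> str:
    """Return the largest group of card indices connected by XL/S links (mirrors the PR's parsing)."""
    gpu_dict = {}
    for line in topology_text.splitlines():
        line = line.strip()
        if "CPU Affinity" in line or not line.startswith("GPU "):
            continue
        items = [x for x in line.split(" ") if x]
        gpu_id = items[1].split("/")[0]
        # Columns from index 2 onward are the per-card link types.
        affinity = [str(j - 2) for j, item in enumerate(items)
                    if "SYS" not in item and ("XL" in item or "S" in item)]
        gpu_dict[gpu_id] = ",".join(affinity)
    # Pick the longest affinity string, as run_distributed.py does.
    return max(gpu_dict.values(), key=len, default="")

# Invented example: cards 0 and 1 share Xelink, cards 2 and 3 only reach the rest via SYS.
sample = """\
GPU 0/0 S XL SYS SYS
GPU 1/1 XL S SYS SYS
GPU 2/2 SYS SYS S SYS
GPU 3/3 SYS SYS SYS S
"""
print(xelink_affinity(sample))  # -> "0,1"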

test/xpu/skip_list_dist.py

Lines changed: 16 additions & 85 deletions
@@ -1,105 +1,36 @@
 skip_dict = {
-    "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": (
-        "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False",
-        "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False",
-        "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False",
-        "test_checkpoint_submodule_use_reentrant_False_xpu",
-    ),
+    "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_apply.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": (
         "test_ddp_parity_xpu",
     ),
     "../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
-    "../../../../test/distributed/fsdp/test_fsdp_core.py": (
-        "test_delayed_optim_step_offload_false_no_shard_xpu",
-        "test_delayed_optim_step_offload_false_none_xpu",
-        "test_delayed_optim_step_offload_false_shard_grad_op_xpu",
-        "test_delayed_optim_step_offload_true_none_xpu",
-        "test_delayed_optim_step_offload_true_shard_grad_op_xpu",
-        "test_delayed_reduce_scatter_offload_false_no_shard_xpu",
-        "test_delayed_reduce_scatter_offload_false_none_xpu",
-        "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu",
-        "test_delayed_reduce_scatter_offload_true_none_xpu",
-        "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu",
-        "test_mixture_of_experts_offload_false_no_shard_xpu",
-        "test_mixture_of_experts_offload_false_none_xpu",
-        "test_mixture_of_experts_offload_false_shard_grad_op_xpu",
-        "test_mixture_of_experts_offload_true_no_shard_xpu",
-        "test_mixture_of_experts_offload_true_none_xpu",
-        "test_mixture_of_experts_offload_true_shard_grad_op_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_true_no_shard_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu",
-        "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu",
-        "test_nested_always_wrap_model_offload_false_no_shard_xpu",
-        "test_nested_always_wrap_model_offload_false_none_xpu",
-        "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu",
-        "test_nested_always_wrap_model_offload_true_none_xpu",
-        "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu",
-        "test_nested_wrapped_model_offload_false_no_shard_xpu",
-        "test_nested_wrapped_model_offload_false_none_xpu",
-        "test_nested_wrapped_model_offload_false_shard_grad_op_xpu",
-        "test_nested_wrapped_model_offload_true_none_xpu",
-        "test_nested_wrapped_model_offload_true_shard_grad_op_xpu",
-        "test_transformer_offload_false_no_shard_xpu",
-        "test_transformer_offload_false_none_xpu",
-        "test_transformer_offload_false_shard_grad_op_xpu",
-        "test_transformer_offload_true_none_xpu",
-        "test_transformer_offload_true_shard_grad_op_xpu",
-        # https://github.com/intel/torch-xpu-ops/issues/1475
-        "test_transformer_no_grad_mixed_precision_True_xpu",
-        "test_transformer_no_grad_mixed_precision_False_xpu",
-    ),
-    # Will add them back after debugging
-    # "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": (
-    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu",
-    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu",
-    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu",
-    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu",
-    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu",
-    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu",
-    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu",
-    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu",
-    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_False_xpu",
-    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_True_xpu",
-    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu",
-    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_True_xpu",
-    #     "test_fsdp_init_with_device_mesh_is_even_sharded_model_False_xpu",
-    #     "test_fsdp_init_with_device_mesh_is_even_sharded_model_True_xpu",
-    #     "test_raises_warning_or_errors_xpu",
-    # ),
-    "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": (
-        "test_invalid_first_iter_order_sharding_strategy1_xpu",
-        "test_train_eval_sharding_strategy1_xpu",
-    ),
+    "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_core.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": (
         "test_parity_with_non_frozen_fsdp_xpu",
         "test_parity_with_ddp_xpu",
     ),
     "../../../../test/distributed/fsdp/test_fsdp_fx.py": None,
-    # will bring back after oneccl upgrade to 2021.16.1
-    # "../../../../test/distributed/fsdp/test_fsdp_input.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_input.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": (
         "test_transformer_no_grad_mixed_precision_True_xpu",
     ),
     "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None,
-    # Will add them back after debugging
-    # "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": (
-    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_xpu",
-    #     "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_xpu",
-    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_xpu",
-    #     "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_xpu",
-    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_xpu",
-    #     "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_xpu",
-    #     "test_hsdp_init_with_device_mesh_xpu",
-    #     "test_root_module_is_not_FSDP_xpu",
-    # ),
     "../../../../test/distributed/fsdp/test_utils.py": None,
     "distributed/test_c10d_xccl.py": (
-        # will bring back after oneccl upgrade to 2021.16.1
-        "test_xccl_barrier",
+        # https://github.com/intel/torch-xpu-ops/issues/2046
+        "test_unwaited",
+    ),
+    "distributed/test_c10d_ops_xccl.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_misc.py": None,
+    "../../../../test/distributed/test_functional_api.py": (
+        # depends on https://github.com/pytorch/pytorch/pull/159473
+        "test_tracing_with_fakepg_xpu",
     ),
+    "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None,
+    "../../../../test/distributed/_tools/test_mem_tracker.py": None,
+    "../../../../test/distributed/_tools/test_memory_tracker.py": None,
 }
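In this mapping each key is a test file and the value is either None (run every test in the file) or a tuple of test names to skip. The diff does not show how run_distributed.py consumes the dictionary; one plausible sketch, assuming pytest's --deselect mechanism and a hypothetical helper name, looks like this:

import subprocess

# Hypothetical helper: turn one skip_dict entry into a pytest command.
def build_pytest_command(test_file: str, skip_cases) -> list[str]:
    cmd = ["python", "-m", "pytest", "-v", test_file]
    for case in skip_cases or ():
        # Deselect by node id so the rest of the file still runs.
        cmd += ["--deselect", f"{test_file}::{case}"]
    return cmd

# Example with an entry from the table above.
print(build_pytest_command(
    "distributed/test_c10d_xccl.py",
    ("test_unwaited",),
))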
