11
11
12
12
# libfabric workaround to avoid a hang issue
os.environ["FI_PROVIDER"] = "tcp"

# Derive ZE_AFFINITY_MASK from the `xpu-smi topology -m` matrix: for every GPU
# row, collect the column indices marked as self ("S") or Xe Link ("XL...") and
# keep the row that yields the largest group.
#
# The output is redirected into topology.log instead of being piped through
# `tee`: the exit status of a shell pipeline is the status of its LAST command,
# so with `... | tee topology.log` a failing xpu-smi was reported as success.
ret = os.system("xpu-smi topology -m > topology.log 2>&1")

try:
    with open("topology.log") as file:
        lines = file.readlines()
except OSError:
    # The log could not be created/read (e.g. unwritable working directory);
    # `ret` is non-zero in that case and is handled below.
    lines = []

# Echo the captured output so it still appears in the console log.
sys.stdout.writelines(lines)

if ret != 0:
    print("xpu-smi topology failed")
    sys.exit(255)

# Maps the device id (the part before "/" in e.g. "0/1") to a comma-separated
# list of linked column indices.
gpu_dict = {}
for line in lines:
    # The header row also starts with "GPU " but carries the column titles.
    if "CPU Affinity" in line:
        continue
    line = line.strip()
    if not line.startswith("GPU "):
        continue
    items = line.split()
    i = items[1].split("/")[0]
    # items[0] is "GPU" and items[1] is the id, so matrix columns start at
    # index 2; enumerating items[2:] yields the zero-based column index.
    linked = [
        str(column)
        for column, item in enumerate(items[2:])
        if "SYS" not in item and ("XL" in item or "S" in item)
    ]
    gpu_dict[i] = ",".join(linked)

# Pick the group with the most devices. Compare entry counts rather than
# string lengths so that multi-digit indices ("10,11") do not outweigh a
# larger group of single-digit ones ("0,1,2").
max_affinity = ""
max_count = 0
for value in gpu_dict.values():
    count = len(value.split(",")) if value else 0
    if count > max_count:
        max_affinity = value
        max_count = count

os.environ["ZE_AFFINITY_MASK"] = max_affinity
print(f"ZE_AFFINITY_MASK={os.environ['ZE_AFFINITY_MASK']}")
15
49
16
50
17
51
# run python test
@@ -24,8 +58,16 @@ def run(test_command):
24
58
return result .returncode
25
59
26
60
61
# CCL_SEND / CCL_RECV are set only for the c10d ops test and are removed
# again before the pipelining tests are launched.
os.environ.update({"CCL_SEND": "direct", "CCL_RECV": "direct"})
test_command = ["python", "distributed/test_c10d_ops_xccl.py"]
res += run(test_command)
for ccl_override in ("CCL_SEND", "CCL_RECV"):
    del os.environ[ccl_override]

# Pipelining tests, run one after another; each return code is added to `res`.
for pipelining_test in (
    "../../../../test/distributed/pipelining/test_backward.py",
    "../../../../test/distributed/pipelining/test_microbatch.py",
):
    test_command = ["python", pipelining_test]
    res += run(test_command)
29
71
30
72
# run pytest with skiplist
31
73
for key in skip_dict :
0 commit comments