Skip to content

Commit 750868c

Browse files
authored
Merge pull request #60 from danielhua23/a2a_each_rank_seed
[enhance] make the seed and the MoE constant different for each rank, for debugging
2 parents fc547c6 + ecd9b59 commit 750868c

File tree

2 files changed

+3
-3
lines changed

2 files changed

+3
-3
lines changed

problems/amd_distributed/all2all/reference.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ def generate_input(
239239
):
240240
device = torch.device(f"cuda:{rank}")
241241
gen = torch.Generator(device=device)
242-
gen.manual_seed(seed)
242+
gen.manual_seed(seed + rank)
243243

244244
cfg = MoEConfig(
245245
num_experts=num_experts,
@@ -259,7 +259,7 @@ def ref_kernel(data: input_t) -> output_t:
259259
ata = PyTorchAllToAll(cfg, rank, world_size)
260260

261261
expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x, rank_data.indices)
262-
expert_y = expert_x.to(cfg.out_dtype) * 2
262+
expert_y = expert_x.to(cfg.out_dtype) * (1 + rank)
263263
y = torch.zeros(
264264
cfg.max_num_tokens,
265265
cfg.hidden_dim,

problems/amd_distributed/all2all/submission.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ def custom_kernel(data: input_t) -> output_t:
193193
ata = PyTorchAllToAll(cfg, rank, world_size)
194194

195195
expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x, rank_data.indices)
196-
expert_y = expert_x.to(cfg.out_dtype) * 2
196+
expert_y = expert_x.to(cfg.out_dtype) * (1 + rank)
197197
y = torch.zeros(
198198
cfg.max_num_tokens,
199199
cfg.hidden_dim,

0 commit comments

Comments (0)