Feat: register amd-gemm-rs problem, generate inputs on the per-rank device, and check output device

S1ro1 · S1ro1 · commit 75bd4abc5f4a · 2025-09-12T14:24:43.000+02:00
diff --git a/problems/amd_distributed.yaml b/problems/amd_distributed.yaml
@@ -10,3 +10,8 @@ problems:
     deadline: "2025-10-14"
     gpus:
       - MI300x8
+  - directory: amd_distributed/gemm-rs
+    name: amd-gemm-rs
+    deadline: "2025-10-14"
+    gpus:
+      - MI300x8
diff --git a/problems/amd_distributed/gemm-rs/reference.py b/problems/amd_distributed/gemm-rs/reference.py
@@ -3,7 +3,7 @@
 import torch
 
 
-def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, has_bias: bool, seed: int) -> input_t:
+def generate_input(rank: int, world_size: int, m: int, n: int, k: int, has_bias: bool, seed: int) -> input_t:
     """
     Generate random input and weights for the Gemm-ReduceScatter operation.
 
@@ -14,21 +14,22 @@ def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, has_bias:
             bias: Optional[torch.Tensor],  # [N] or None
         )
     """
-    gen = torch.Generator(device='cuda')
-    gen.manual_seed(seed + RANK)
+    device = torch.device(f'cuda:{rank}')
+    gen = torch.Generator(device=device)
+    gen.manual_seed(seed + rank)
 
     assert m % world_size == 0, "m must be divisible by world_size"
     assert k % world_size == 0, "k must be divisible by world_size"
     local_k = k // world_size
 
     # Generate random inputs and weights
-    input = (torch.rand((m, local_k), dtype=torch.bfloat16, device="cuda", generator=gen) * 2 - 1) * 0.01
-    weight = (torch.rand((n, local_k), dtype=torch.bfloat16, device="cuda", generator=gen) * 2 - 1) * 0.01
+    input = (torch.rand((m, local_k), dtype=torch.bfloat16, device=device, generator=gen) * 2 - 1) * 0.01
+    weight = (torch.rand((n, local_k), dtype=torch.bfloat16, device=device, generator=gen) * 2 - 1) * 0.01
 
     bias = None
     if has_bias:
         gen.manual_seed(seed)
-        bias = (torch.rand((n,), dtype=torch.bfloat16, device="cuda", generator=gen) * 2 - 1) * 0.01
+        bias = (torch.rand((n,), dtype=torch.bfloat16, device=device, generator=gen) * 2 - 1) * 0.01
 
     return (input, weight, bias)
 
@@ -60,4 +61,12 @@ def ref_kernel(data: input_t) -> output_t:
     return rs_output
 
 
-check_implementation = make_match_reference(ref_kernel, rtol=1e-2, atol=1e-2)
+def check_implementation(data: input_t, output: output_t):  # returns (passed, message); message is "" on success
+    expected = ref_kernel(data)  # reference result computed from the same input data
+    if output.device != expected.device:  # a device mismatch is reported as a failure before any value comparison
+        return False, f"Output device mismatch: {output.device} != {expected.device}"
+    res = torch.allclose(output, expected, rtol=1e-2, atol=1e-2)  # closeness test with 1e-2 relative and absolute tolerance
+    if not res:  # values differ beyond tolerance; the message embeds both tensors
+        return False, f"Output values mismatch, {output} != {expected}"
+
+    return True, ""
diff --git a/problems/amd_distributed/gemm-rs/task.yml b/problems/amd_distributed/gemm-rs/task.yml
@@ -8,6 +8,7 @@ files:
   - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
+multi_gpu: true
 
 description: |
   Implement a Gemm-ReduceScatter kernel on a single MI300X node.