@@ -11,7 +11,10 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from torch.distributed._functional_collectives import all_to_all_single_autograd
+from torch.distributed._functional_collectives import (
+    all_to_all_single,
+    all_to_all_single_autograd,
+)
 from torch.distributed.tensor import (
     DeviceMesh,
     distribute_module,
@@ -146,26 +149,26 @@ def _token_dispatch(self, mod, inputs, device_mesh):
 
         # generate the input splits and output splits for all-to-all
         with torch.no_grad():
-            num_tokens_per_expert_group = num_tokens_per_expert.new_empty(
-                num_tokens_per_expert.shape[0]
-            )
-            dist.all_to_all_single(
-                num_tokens_per_expert_group,
+            num_tokens_per_expert_group = all_to_all_single(
                 num_tokens_per_expert,
+                None,
+                None,
                 group=device_mesh.get_group(),
             )
+            num_tokens_per_expert_group = torch.ops._c10d_functional.wait_tensor(
+                num_tokens_per_expert_group
+            )
             input_splits = (
                 num_tokens_per_expert.view(ep_size, -1)
                 .sum(dim=1)
-                .to(torch.device("cpu"), non_blocking=False)
+                .to(torch.device("cpu"), non_blocking=True)
             )
             output_splits = (
                 num_tokens_per_expert_group.view(ep_size, -1)
                 .sum(dim=1)
                 .to(torch.device("cpu"), non_blocking=False)
             )
             # NOTE: this would incur a device-to-host sync
-            # torch.cuda.current_stream().synchronize()
             self.input_splits = input_splits.tolist()
             self.output_splits = output_splits.tolist()
 
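For context, the second hunk swaps the in-place c10d call (which needs a pre-allocated output buffer) for the functional collective, whose result must be explicitly waited on before it is read. A minimal sketch of that pattern is below; the helper name exchange_token_counts and the assumption of an already-initialized process group are illustrative, not part of this change.

import torch
import torch.distributed as dist
from torch.distributed._functional_collectives import all_to_all_single


def exchange_token_counts(
    num_tokens_per_expert: torch.Tensor, group: dist.ProcessGroup
) -> torch.Tensor:
    # Functional collective: returns a new (async) tensor instead of filling a
    # caller-allocated buffer the way dist.all_to_all_single does.
    # None/None for the output/input split sizes requests an even split across ranks.
    out = all_to_all_single(num_tokens_per_expert, None, None, group=group)
    # Explicitly wait on the async result so its values are valid before they
    # are viewed, summed, or copied to the host.
    return torch.ops._c10d_functional.wait_tensor(out)

In the updated _token_dispatch, the wait happens right after the collective, so the subsequent .view(ep_size, -1).sum(dim=1) and device-to-host copies operate on a completed result; the NOTE in the hunk points at the device-to-host sync that the final .tolist() calls still incur.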