7 changes: 7 additions & 0 deletions torchtitan/distributed/activation_checkpoint.py
@@ -76,6 +76,13 @@ def _apply_ac_to_transformer_block(

     def _get_custom_policy(meta):
         def _custom_policy(ctx, func, *args, **kwargs):
+            if (
+                func == torch.ops.aten._to_copy.default
+                and "cuda" in str(args[0].device)
+                and "device" in kwargs
+                and str(kwargs["device"]) == "cpu"
+            ):
+                return CheckpointPolicy.MUST_SAVE
Contributor:
Hmm, did this take effect? I would guess we don't need any d2h sync in backward anymore, but in the traces I'm still seeing them in backward.

(save wait_tensor) https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree/jw3468/20250904/sac_save_to_copy_a2a_wait_tensor.json.gz&bucket=pytorch
(don't save wait_tensor) https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree/jw3468/20250904/sac_save_to_copy_a2a.json.gz&bucket=pytorch

Contributor Author:
Hmm, there is still a cudaStreamSynchronize in FlexAttentionBackward, but that is expected, since SAC only takes effect for the replay of the forward. Is there another place where you see it?

Contributor:
According to @drisspg, this H2D in FlexAttentionBackward comes from the eager implementation; #1683 will fix the FlexAttention compilation issue.

             mode = "recompute" if ctx.is_recompute else "forward"
             mm_count_key = f"{mode}_mm_count"
             if func == torch.ops.aten.mm.default:
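For context, a minimal sketch of how a policy function like `_custom_policy` is plugged into selective activation checkpointing via `create_selective_checkpoint_contexts` (PyTorch 2.4+). This is not torchtitan's actual wiring; the toy `block`, the shapes, and the single CUDA device are assumptions for illustration only.

```python
# Hypothetical sketch: save GPU->CPU _to_copy outputs under SAC so the backward
# replay does not re-issue the device-to-host copy (and the sync it implies).
from functools import partial

import torch
from torch.utils.checkpoint import (
    CheckpointPolicy,
    checkpoint,
    create_selective_checkpoint_contexts,
)


def policy(ctx, func, *args, **kwargs):
    # Mirror of the policy above: keep the result of cuda->cpu _to_copy calls.
    if (
        func == torch.ops.aten._to_copy.default
        and "cuda" in str(args[0].device)
        and str(kwargs.get("device")) == "cpu"
    ):
        return CheckpointPolicy.MUST_SAVE
    return CheckpointPolicy.PREFER_RECOMPUTE


def block(x):
    # Toy stand-in for a block whose forward moves a small tensor to CPU.
    splits = x.sum(dim=1).to("cpu")
    return x * splits.to(x.device).unsqueeze(1)


x = torch.randn(8, 16, device="cuda", requires_grad=True)
out = checkpoint(
    block,
    x,
    use_reentrant=False,
    context_fn=partial(create_selective_checkpoint_contexts, policy),
)
out.sum().backward()
```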
24 changes: 14 additions & 10 deletions torchtitan/distributed/expert_parallel.py
@@ -8,9 +8,11 @@
 from typing import Callable, Literal

 import torch
-import torch.distributed as dist
 import torch.nn as nn
-from torch.distributed._functional_collectives import all_to_all_single_autograd
+from torch.distributed._functional_collectives import (
+    all_to_all_single,
+    all_to_all_single_autograd,
+)
 from torch.distributed.tensor import (
     DeviceMesh,
     distribute_module,
@@ -90,26 +92,28 @@ def _token_dispatch(self, mod, inputs, device_mesh):

         # generate the input splits and output splits for all-to-all
         with torch.no_grad():
-            num_tokens_per_expert_group = num_tokens_per_expert.new_empty(
-                num_tokens_per_expert.shape[0]
-            )
-            dist.all_to_all_single(
-                num_tokens_per_expert_group,
+            num_tokens_per_expert_group = all_to_all_single(
                 num_tokens_per_expert,
                 None,
                 None,
                 group=device_mesh.get_group(),
             )
+            # Need to wait explicitly because it is used by a triton kernel later
+            # which doesn't realize that AsyncCollectiveTensor needs unwrapping
+            num_tokens_per_expert_group = torch.ops._c10d_functional.wait_tensor(
+                num_tokens_per_expert_group
+            )
Contributor Author:
We need an explicit wait because num_tokens_per_expert_group gets used by a triton kernel, which doesn't realize that AsyncCollectiveTensor needs to be unwrapped.

Contributor:
could you make this a comment in the code? I think it's very helpful.
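To expand on the thread above: functional collectives return their result wrapped in an AsyncCollectiveTensor; ordinary torch ops unwrap the wrapper and implicitly wait on the collective, but a custom Triton kernel reads the raw memory and never triggers that wait, hence the explicit wait_tensor. A small sketch of that behavior, assuming an already-initialized NCCL process group and a CUDA device per rank (the setup code is illustrative, not part of the PR):

```python
# Illustrative only; assumes dist.init_process_group("nccl") has already run
# (e.g. under torchrun) and each rank has a CUDA device.
import torch
import torch.distributed as dist
from torch.distributed._functional_collectives import all_to_all_single

world_size = dist.get_world_size()
counts = torch.ones(world_size, dtype=torch.int64, device="cuda")

out = all_to_all_single(counts, None, None, group=dist.group.WORLD)
# `out` is an AsyncCollectiveTensor wrapper. Ordinary torch ops (out.sum(),
# out.view(...), ...) unwrap it and wait on the collective automatically, but a
# custom Triton kernel reads the raw memory and never triggers that wait, so
# the result is waited on and unwrapped explicitly:
out = torch.ops._c10d_functional.wait_tensor(out)
# `out` now behaves as a plain tensor whose values are ready on the device.
```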

             input_splits = (
                 num_tokens_per_expert.view(ep_size, -1)
                 .sum(dim=1)
                 .to(torch.device("cpu"), non_blocking=True)
             )
+            # NOTE: this would incur a device-to-host sync
             output_splits = (
                 num_tokens_per_expert_group.view(ep_size, -1)
                 .sum(dim=1)
-                .to(torch.device("cpu"), non_blocking=True)
+                .to(torch.device("cpu"), non_blocking=False)
             )
-            # NOTE: this would incur a device-to-host sync
-            torch.cuda.current_stream().synchronize()
             self.input_splits = input_splits.tolist()
             self.output_splits = output_splits.tolist()

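The NOTE above now marks the blocking `.to("cpu")` itself as the point where the device-to-host sync happens, replacing the explicit `torch.cuda.current_stream().synchronize()`. A standalone sketch of that pattern, with made-up sizes in place of the real expert counts:

```python
# Illustrative sketch: a blocking device-to-host copy is itself the sync point.
import torch

ep_size = 4
# Made-up per-expert token counts standing in for num_tokens_per_expert_group.
num_tokens_per_expert_group = torch.randint(0, 128, (ep_size * 2,), device="cuda")

output_splits = (
    num_tokens_per_expert_group.view(ep_size, -1)
    .sum(dim=1)
    # non_blocking=False: the host blocks here until the values reach the CPU
    .to(torch.device("cpu"), non_blocking=False)
)
print(output_splits.tolist())  # values are already valid; no extra synchronize needed
```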
1 change: 1 addition & 0 deletions torchtitan/experiments/llama4/infra/parallelize.py
@@ -38,6 +38,7 @@
     torch.ops.aten._scaled_dot_product_efficient_attention.default,
     torch.ops.aten._scaled_dot_product_flash_attention.default,
     torch.ops._c10d_functional.reduce_scatter_tensor.default,
+    torch.ops._c10d_functional.all_to_all_single.default,
     # for low precision training, it's useful to always save
     # the result of max, since the absolute maximum is
     # used to compute the scaling factor for quantization.
1 change: 1 addition & 0 deletions torchtitan/models/deepseek_v3/infra/parallelize.py
@@ -35,6 +35,7 @@
     torch.ops.aten._scaled_dot_product_efficient_attention.default,
     torch.ops.aten._scaled_dot_product_flash_attention.default,
     torch.ops._c10d_functional.reduce_scatter_tensor.default,
+    torch.ops._c10d_functional.all_to_all_single.default,
     # for low precision training, it's useful to always save
     # the result of max, since the absolute maximum is
     # used to compute the scaling factor for quantization.
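Both parallelize.py changes add the functional all_to_all_single op to the SAC save-list, so the output of the token-shuffle collective is stored during the forward rather than re-issued during recompute. A minimal sketch of how such a save-list is typically consumed by a policy; torchtitan's real policy also alternates on aten.mm (the mm_count logic shown in activation_checkpoint.py above), and the list below is abbreviated for illustration:

```python
# Sketch of an op save-list driving a selective-AC policy (abbreviated).
import torch
from torch.utils.checkpoint import CheckpointPolicy

_save_list = {
    torch.ops.aten._scaled_dot_product_flash_attention.default,
    torch.ops._c10d_functional.reduce_scatter_tensor.default,
    # added in this PR: keep the all-to-all output so backward does not rerun it
    torch.ops._c10d_functional.all_to_all_single.default,
}


def _policy(ctx, func, *args, **kwargs):
    # Save outputs of ops in the list; prefer recomputing everything else.
    if func in _save_list:
        return CheckpointPolicy.MUST_SAVE
    return CheckpointPolicy.PREFER_RECOMPUTE
```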