Skip to content

Commit 0809e9a

Browse files
committed
Save _to_copy and a2a in selective AC policy
1 parent ea4989e commit 0809e9a

File tree

4 files changed

+20
-8
lines changed

4 files changed

+20
-8
lines changed

torchtitan/distributed/activation_checkpoint.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,13 @@ def _apply_ac_to_transformer_block(
 
     def _get_custom_policy(meta):
         def _custom_policy(ctx, func, *args, **kwargs):
+            if (
+                func == torch.ops.aten._to_copy.default
+                and "cuda" in str(args[0].device)
+                and "device" in kwargs
+                and str(kwargs["device"]) == "cpu"
+            ):
+                return CheckpointPolicy.MUST_SAVE
             mode = "recompute" if ctx.is_recompute else "forward"
             mm_count_key = f"{mode}_mm_count"
             if func == torch.ops.aten.mm.default:

torchtitan/distributed/expert_parallel.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from torch.distributed._functional_collectives import all_to_all_single_autograd
+from torch.distributed._functional_collectives import (
+    all_to_all_single,
+    all_to_all_single_autograd,
+)
 from torch.distributed.tensor import (
     DeviceMesh,
     distribute_module,
@@ -90,14 +93,15 @@ def _token_dispatch(self, mod, inputs, device_mesh):
 
         # generate the input splits and output splits for all-to-all
         with torch.no_grad():
-            num_tokens_per_expert_group = num_tokens_per_expert.new_empty(
-                num_tokens_per_expert.shape[0]
-            )
-            dist.all_to_all_single(
-                num_tokens_per_expert_group,
+            num_tokens_per_expert_group = all_to_all_single(
                 num_tokens_per_expert,
+                None,
+                None,
                 group=device_mesh.get_group(),
             )
+            num_tokens_per_expert_group = torch.ops._c10d_functional.wait_tensor(
+                num_tokens_per_expert_group
+            )
             input_splits = (
                 num_tokens_per_expert.view(ep_size, -1)
                 .sum(dim=1)
@@ -106,10 +110,9 @@ def _token_dispatch(self, mod, inputs, device_mesh):
         output_splits = (
             num_tokens_per_expert_group.view(ep_size, -1)
             .sum(dim=1)
-            .to(torch.device("cpu"), non_blocking=True)
+            .to(torch.device("cpu"), non_blocking=False)
         )
         # NOTE: this would incur a device-to-host sync
-        torch.cuda.current_stream().synchronize()
         self.input_splits = input_splits.tolist()
         self.output_splits = output_splits.tolist()
 

torchtitan/experiments/llama4/infra/parallelize.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
         torch.ops.aten._scaled_dot_product_efficient_attention.default,
         torch.ops.aten._scaled_dot_product_flash_attention.default,
         torch.ops._c10d_functional.reduce_scatter_tensor.default,
+        torch.ops._c10d_functional.all_to_all_single.default,
         # for low precision training, it's useful to always save
         # the result of max, since the absolute maximum is
         # used to compute the scaling factor for quantization.

torchtitan/models/deepseek_v3/infra/parallelize.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
         torch.ops.aten._scaled_dot_product_efficient_attention.default,
         torch.ops.aten._scaled_dot_product_flash_attention.default,
         torch.ops._c10d_functional.reduce_scatter_tensor.default,
+        torch.ops._c10d_functional.all_to_all_single.default,
         # for low precision training, it's useful to always save
         # the result of max, since the absolute maximum is
         # used to compute the scaling factor for quantization.

0 commit comments

Comments (0)