Commit 29eb910

[simplefsdp] add reshard after forward option (#1961)
As titled, this PR adds a zero2-style FSDP sharding option to SimpleFSDP. It can be enabled with the `--parallelism.simple_fsdp_reshard_after_forward "never"` config.

With `--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config --parallelism.simple_fsdp_reshard_after_forward "always"`, there is an all-gather in the backward pass to re-gather parameters (trace: [link](https://interncache-all.fbcdn.net/manifold/perfetto-artifacts/tree/ui/index.html#!/?url=https://interncache-all.fbcdn.net/manifold/perfetto_internal_traces/tree/shared_trace/ruisizhang123_2025-10-29-20-02-34_rank0_trace.json)).

![Screenshot 2025-10-29 at 8 03 05 PM: backward trace with all-gather](https://github.com/user-attachments/assets/a6e5b736-9d1f-44a2-aa35-af7b315fd24a)

With `--parallelism.simple_fsdp_reshard_after_forward "never"`, there is no all-gather in the backward pass (trace: [link](https://interncache-all.fbcdn.net/manifold/perfetto-artifacts/tree/ui/index.html#!/?url=https://interncache-all.fbcdn.net/manifold/perfetto_internal_traces/tree/shared_trace/ruisizhang123_2025-10-29-20-04-49_rank0_trace.json)).

![Screenshot 2025-10-29 at 8 05 07 PM: backward trace without all-gather](https://github.com/user-attachments/assets/022f1a02-fe45-45d6-ba04-6bcaf2e9d64f)
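For orientation (not part of the commit), the following is a minimal, self-contained sketch of the mechanism this option toggles, based on the `simple_fsdp.py` diff below: SimpleFSDP implements reshard-after-forward by running the un-shard-and-compute step under activation checkpointing, so backward re-runs it and re-gathers the parameters; with the zero2-style `"never"` setting that wrapper is skipped, so backward needs no all-gather. Function names here are illustrative, not torchtitan APIs.

```python
# Illustrative sketch only (not torchtitan code): models how SimpleFSDP uses
# activation checkpointing to implement reshard_after_forward. The "gather"
# step stands in for the parameter all-gather; under checkpointing its output
# is freed after forward and recomputed in backward (the backward all-gather
# seen in the "always" trace). With reshard_after_forward=False (zero2-style),
# the gathered result stays alive and backward needs no re-gather.
import torch
from torch.utils.checkpoint import checkpoint


def gather_and_compute(x: torch.Tensor, sharded_w: torch.Tensor) -> torch.Tensor:
    # stand-in for: all-gather sharded_w into the full parameter, then compute
    full_w = sharded_w * 1.0  # pretend this is the gathered parameter
    return x @ full_w


def fsdp_like_forward(x, sharded_w, reshard_after_forward: bool):
    if reshard_after_forward:
        # zero3-style: free the gathered parameter after forward; backward
        # recomputes gather_and_compute, i.e. re-gathers the parameter
        return checkpoint(gather_and_compute, x, sharded_w, use_reentrant=False)
    # zero2-style ("never"): keep the gathered parameter for backward
    return gather_and_compute(x, sharded_w)


if __name__ == "__main__":
    x = torch.randn(4, 8)
    w = torch.randn(8, 8, requires_grad=True)
    fsdp_like_forward(x, w, reshard_after_forward=True).sum().backward()
    print(w.grad.shape)  # torch.Size([8, 8])
```

In the real code this corresponds to the `checkpoint(self.replicate_compute, ...)` call in `simple_fsdp.py`, whose guarding condition now also checks `self.reshard_after_forward`.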
1 parent a3e170c commit 29eb910

File tree

4 files changed: +43 −4 lines changed

torchtitan/experiments/simple_fsdp/README.md
torchtitan/experiments/simple_fsdp/deepseek_v3/parallelize.py
torchtitan/experiments/simple_fsdp/llama3/parallelize.py
torchtitan/experiments/simple_fsdp/simple_fsdp.py


torchtitan/experiments/simple_fsdp/README.md
Lines changed: 3 additions & 3 deletions

````diff
@@ -15,13 +15,13 @@ This folder includes an experimental frontend implementation for [SimpleFSDP: Si
 #### Training Llama3 models
 
 ```bash
-CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_8b.toml" ./run_train.sh --model.name simple_fsdp.llama3 --compile.enable
+CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_8b.toml" ./run_train.sh --model.name simple_fsdp.llama3 --compile.enable --job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config
 ```
 
 #### Training DeepSeek_v3 models
 
 ```bash
-CONFIG_FILE="./torchtitan/models/deepseek_v3/train_configs/debug_model.toml" ./run_train.sh --model.name simple_fsdp.deepseek_v3 --compile.enable
+CONFIG_FILE="./torchtitan/models/deepseek_v3/train_configs/debug_model.toml" ./run_train.sh --model.name simple_fsdp.deepseek_v3 --compile.enable --activation_checkpoint.mode "none" --job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config
 ```
 
 ### Composability Support
@@ -56,7 +56,7 @@ SimpleFSDP relies on compiler backend to perform optimizations (i.e., bucketing
 users can specify the pass (e.g., "aot_eager_autobucketing") via additional configs:
 
 ```bash
---job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config --compile.model_backend_override "aot_eager_autobucketing"
+--compile.model_backend_override "aot_eager_autobucketing"
 ```
 
 ### Citation
````

torchtitan/experiments/simple_fsdp/deepseek_v3/parallelize.py
Lines changed: 16 additions & 0 deletions

```diff
@@ -91,6 +91,20 @@ def parallelize_deepseekv3(
         reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
     )
 
+    match job_config.parallelism.fsdp_reshard_after_forward:
+        case "always":
+            reshard_after_forward = True
+        case "never":
+            reshard_after_forward = False
+        case "default":
+            # For PP, by default do not reshard after forward to avoid per-microbatch
+            # all-gathers, which can be expensive and non-overlapped
+            reshard_after_forward = not parallel_dims.pp_enabled
+        case _:
+            raise ValueError(
+                f"Invalid reshard_after_forward_policy: {job_config.parallelism.fsdp_reshard_after_forward}."
+            )
+
     # apply data parallel
     dp_mesh: DeviceMesh | None = None
     if (
@@ -143,6 +157,7 @@ def parallelize_deepseekv3(
                     dp_mode,
                     ac_mode=job_config.activation_checkpoint.mode,
                     mp_policy=mp_policy,
+                    reshard_after_forward=reshard_after_forward,
                     shard_dim=experts_shard_dim,
                     reduction_divide_factor=parallel_dims.fsdp_gradient_divide_factor,
                 )
@@ -153,6 +168,7 @@ def parallelize_deepseekv3(
             dp_mode,
            ac_mode=job_config.activation_checkpoint.mode,
             mp_policy=mp_policy,
+            reshard_after_forward=reshard_after_forward,
         )
 
         logger.info(
```

torchtitan/experiments/simple_fsdp/llama3/parallelize.py
Lines changed: 15 additions & 0 deletions

```diff
@@ -112,12 +112,27 @@ def parallelize_llama(
         reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
     )
 
+    match job_config.parallelism.fsdp_reshard_after_forward:
+        case "always":
+            reshard_after_forward = True
+        case "never":
+            reshard_after_forward = False
+        case "default":
+            # For PP, by default do not reshard after forward to avoid per-microbatch
+            # all-gathers, which can be expensive and non-overlapped
+            reshard_after_forward = not parallel_dims.pp_enabled
+        case _:
+            raise ValueError(
+                f"Invalid reshard_after_forward_policy: {job_config.parallelism.fsdp_reshard_after_forward}."
+            )
+
     model = data_parallel(
         model,
         parallel_dims.world_mesh[tuple(dp_mesh_dim_names)],
         mode=dp_mode,
         ac_mode=job_config.activation_checkpoint.mode,
         mp_policy=mp_policy,
+        reshard_after_forward=reshard_after_forward,
     )
     logger.info(
         "Applied Data Parallel (simple_fsdp) (dp mode=%s) to the model", dp_mode
```

torchtitan/experiments/simple_fsdp/simple_fsdp.py
Lines changed: 9 additions & 1 deletion

```diff
@@ -210,6 +210,7 @@ def __init__(
         mode,
         regional_ac,
         mp_policy,
+        reshard_after_forward,
         reduction_divide_factor,
     ):
         super().__init__()
@@ -228,6 +229,7 @@ def __init__(
         mp_policy = mp_policy or MixedPrecisionPolicy()
         self.param_dtype = mp_policy.param_dtype
         self.reduce_dtype = mp_policy.reduce_dtype
+        self.reshard_after_forward = reshard_after_forward
 
     def replicate_compute(self, x: DTensor) -> torch.Tensor:
         # data parallel runtime replicate parameters and do local compute
@@ -290,7 +292,11 @@ def forward(self, x: DTensor) -> torch.Tensor:
         if not _active_parametrization:
             return x
 
-        if self.regional_ac and self.mode in ("fully_shard", "hybrid_shard"):
+        if (
+            self.regional_ac
+            and self.mode in ("fully_shard", "hybrid_shard")
+            and self.reshard_after_forward
+        ):
             # apply checkpointing to implement reshard_after_forward
             output = checkpoint(
                 self.replicate_compute,
@@ -310,6 +316,7 @@ def data_parallel(
     mode: str = "replicate",
     ac_mode: str = "none",
     mp_policy: MixedPrecisionPolicy | None = None,
+    reshard_after_forward: bool = True,
     shard_dim: int = 0,
     reduction_divide_factor: float | None = None,
 ):
@@ -374,6 +381,7 @@ def data_parallel(
             mode,
             regional_ac,
             mp_policy=mp_policy,
+            reshard_after_forward=reshard_after_forward,
             reduction_divide_factor=reduction_divide_factor,
         ),
     )
```
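To show how the new parameter is consumed end to end, here is a hedged usage sketch mirroring the `llama3/parallelize.py` hunk above; the import path is inferred from this commit's file layout, and `apply_simple_fsdp` is an illustrative wrapper, not part of the commit.

```python
# Sketch only: mirrors the call pattern in llama3/parallelize.py above.
# Assumptions: the import path below follows this commit's file layout, and
# apply_simple_fsdp is an illustrative helper, not a torchtitan API.
import torch.nn as nn
from torch.distributed.device_mesh import DeviceMesh

from torchtitan.experiments.simple_fsdp.simple_fsdp import data_parallel


def apply_simple_fsdp(
    model: nn.Module,
    dp_mesh: DeviceMesh,
    dp_mode: str = "fully_shard",
    ac_mode: str = "none",
    reshard_after_forward: bool = True,  # False gives the zero2-style behavior added here
) -> nn.Module:
    return data_parallel(
        model,
        dp_mesh,
        mode=dp_mode,
        ac_mode=ac_mode,
        mp_policy=None,  # falls back to a default MixedPrecisionPolicy (see the __init__ hunk above)
        reshard_after_forward=reshard_after_forward,
    )
```

Passing `reshard_after_forward=False` here is what the `"never"` policy resolves to in the match statements above, giving the zero2-style behavior with no backward all-gather.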
