
Commit a5c4027
add manual bucketing pass
1 parent d0e2545

7 files changed, +56 -15 lines changed

torchtitan/experiments/simple_fsdp/README.md

Lines changed: 8 additions & 7 deletions
````diff
@@ -51,13 +51,14 @@ SimpleFSDP relies on compiler backend to perform optimizations (i.e., bucketing

 2. auto optimization: perform auto-bucketing & reordering without user inputs. **Note: it is not guaranteed that users will get the most optimized training performance**
    - "aot_eager_autobucketing": perform autobucketing at aten fx-level, and perform code execution with aot_eager backend.
-
-   users can specify the pass (e.g., "aot_eager_autobucketing") via addtional configs:
-
-   ```bash
-   --job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config --compile.model_backend_override "aot_eager_autobucketing"
-   ```
+   ```bash
+   --compile.backend "aot_eager" --job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config --compile.model_backend_override "aot_eager_autobucketing"
+   ```
+
+3. manual optimization: perform manual bucketing & reordering with user FQN inputs.
+   - "aot_eager_manualbucketing": perform manual bucketing at aten fx-level, and perform code execution with aot_eager backend.
+   ```bash
+   --compile.backend "aot_eager" --job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config --compile.model_backend_override "aot_eager_manualbucketing" --compile.manual_bucketed_modules "tok_embeddings,layers.[0-5],norm+output"
+   ```

 ### Citation

````
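One plausible reading of the `--compile.manual_bucketed_modules` value is that it is a comma-separated list of module-FQN patterns, where a bracketed range such as `layers.[0-5]` covers a span of layer indices and `+` groups several modules into one bucket. The sketch below is a hypothetical illustration of expanding such a plan against a model's named modules; the real expansion is performed inside `torch._inductor.fx_passes.overlap_manual_scheduling` and may interpret the syntax differently.

```python
# Hypothetical sketch: expand bucket-plan strings such as "layers.[0-5]" or
# "norm+output" into lists of module FQNs. This mirrors the spirit of
# --compile.manual_bucketed_modules but is NOT the torch._inductor parser.
import re


def expand_bucket_plan(plan: str, module_fqns: list[str]) -> list[list[str]]:
    buckets: list[list[str]] = []
    for entry in plan.split(","):
        if "+" in entry:
            # "norm+output": group several FQNs into a single bucket
            buckets.append(entry.split("+"))
        elif (m := re.match(r"(.+)\.\[(\d+)-(\d+)\]$", entry)):
            # "layers.[0-5]": one bucket per layer index in the range
            prefix, lo, hi = m.group(1), int(m.group(2)), int(m.group(3))
            for i in range(lo, hi + 1):
                fqn = f"{prefix}.{i}"
                if fqn in module_fqns:
                    buckets.append([fqn])
        else:
            buckets.append([entry])
    return buckets


# Example: the plan from the README applied to a toy FQN list.
fqns = ["tok_embeddings"] + [f"layers.{i}" for i in range(6)] + ["norm", "output"]
print(expand_bucket_plan("tok_embeddings,layers.[0-5],norm+output", fqns))
```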

torchtitan/experiments/simple_fsdp/backend.py

Lines changed: 30 additions & 2 deletions
```diff
@@ -7,12 +7,14 @@
 from typing import Any, Union

 import torch
+from torchtitan.config import JobConfig

-
-def get_compile_backend(backend_name: str) -> Union[str, callable]:
+def get_compile_backend(job_config: JobConfig) -> Union[str, callable]:
     # return the compile backends used in SimpleFSDP training
     # Step1: check if backend_name is inside available torch.compile backends
     # Step2: check if the backend_name has been registered as a customized backend
+    backend_name = job_config.compile.model_backend_override or job_config.compile.backend
+
     available_torch_backend = torch._dynamo.list_backends(exclude_tags=())
     if backend_name in available_torch_backend:
         return backend_name
@@ -41,6 +43,32 @@ def aten_autobucketing_reordering_pass(
             bw_compiler=aten_autobucketing_reordering_pass,
             keep_inference_input_mutations=True,
         )
+    elif backend_name == "aot_eager_manualbucketing":
+        # Perform manual optimization in aten fx-level and execute code in aot_eager backend
+        # The manualbucketing logic is here:
+        bucketing_modules = job_config.compile.manual_bucketed_modules
+        from torch._dynamo.backends.common import aot_autograd as aot_autograd_backend
+        from torch._inductor.fx_passes.overlap_manual_scheduling import (
+            manual_overlap_bucketing,
+        )
+        from functools import partial
+
+        torch._inductor.config.test_configs.aten_fx_overlap_preserving_bucketing = True
+        torch._inductor.config.test_configs.aten_fx_overlap_insert_overlap_deps = False
+        torch._inductor.config.allow_buffer_reuse = False
+        manual_overlap_bucketing = partial(manual_overlap_bucketing, module_bucket_plans=job_config.compile.manual_bucketed_modules)
+
+        def aten_manualbucketing_reordering_pass(
+            gm: torch.fx.GraphModule, example_inputs: Any
+        ) -> torch.fx.GraphModule:
+            manual_overlap_bucketing(gm)
+            return gm
+
+        backend = aot_autograd_backend(
+            fw_compiler=aten_manualbucketing_reordering_pass,
+            bw_compiler=aten_manualbucketing_reordering_pass,
+            keep_inference_input_mutations=True,
+        )
     else:
         raise AssertionError(f"Unsupported customized backend: {backend_name}")

```
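Both the auto- and manual-bucketing branches follow the same pattern: wrap a graph-rewriting pass in `aot_autograd` so it runs over the forward and backward ATen FX graphs, then execute with eager-style semantics. Below is a minimal, self-contained sketch of that pattern; the pass is only a placeholder that reports graph size (it does not bucket collectives), and none of the diff's inductor flags are repeated.

```python
# Minimal sketch of the backend pattern used above: an aot_autograd-wrapped
# FX pass applied to both the forward and backward graphs. The pass here is a
# placeholder (it only reports graph size); a real pass would rewrite the graph.
from typing import Any

import torch
from torch._dynamo.backends.common import aot_autograd as aot_autograd_backend


def inspect_graph_pass(gm: torch.fx.GraphModule, example_inputs: Any) -> torch.fx.GraphModule:
    # A GraphModule is callable, so returning it (possibly mutated in place)
    # is a valid fw/bw compiler for aot_autograd.
    print(f"fx graph with {len(list(gm.graph.nodes))} nodes")
    return gm


demo_backend = aot_autograd_backend(
    fw_compiler=inspect_graph_pass,
    bw_compiler=inspect_graph_pass,
    keep_inference_input_mutations=True,
)

if __name__ == "__main__":
    model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU())
    compiled = torch.compile(model, backend=demo_backend)
    compiled(torch.randn(2, 8)).sum().backward()
```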

torchtitan/experiments/simple_fsdp/job_config.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -10,8 +10,10 @@
 @dataclass
 class Compile:
     model_backend_override: str | None = None
-    """Override backend to compile in simplefsdp. Additional backend includes aot_eager_autobucketing"""
+    """Override backend to compile in simplefsdp. Additional backend includes aot_eager_autobucketing """

+    manual_bucketed_modules: list[str] = field(default_factory=list)
+    """Which modules should be bucketed together based on user specifications in manual optimization """

 @dataclass
 class JobConfig:
```
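These two fields interact with the base `--compile.backend` flag through the precedence used in `get_compile_backend` above: the override, when set, wins. A stand-alone sketch of that selection logic is below; the `backend` field here stands in for torchtitan's base compile backend option and is not part of this experiment's `Compile` dataclass.

```python
# Stand-alone sketch of the backend-selection precedence used in backend.py:
# model_backend_override, when set, takes priority over the base compile backend.
# The `backend` field is a stand-in for torchtitan's base --compile.backend option.
from dataclasses import dataclass, field


@dataclass
class Compile:
    backend: str = "inductor"
    model_backend_override: str | None = None
    manual_bucketed_modules: list[str] = field(default_factory=list)


cfg = Compile(
    backend="aot_eager",
    model_backend_override="aot_eager_manualbucketing",
    manual_bucketed_modules=["tok_embeddings", "layers.[0-5]", "norm+output"],
)
backend_name = cfg.model_backend_override or cfg.backend
print(backend_name)  # -> "aot_eager_manualbucketing"
```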

torchtitan/experiments/simple_fsdp/llama3/parallelize.py

Lines changed: 1 addition & 4 deletions
```diff
@@ -125,12 +125,9 @@ def parallelize_llama(

     if job_config.compile.enable and "model" in job_config.compile.components:
         torch._inductor.config.reorder_for_peak_memory = False
-        backend = (
-            job_config.compile.model_backend_override or job_config.compile.backend
-        )
         model = torch.compile(
             model,
-            backend=get_compile_backend(backend),
+            backend=get_compile_backend(job_config),
             fullgraph=True,
         )

```

torchtitan/experiments/simple_fsdp/simple_fsdp.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -342,7 +342,6 @@ def data_parallel(

     # apply regional ac (with fsdp_policy) if no global ac is to be applied
     regional_ac = ac_mode == "none"
-
     for mod in modules:
         params_dict = dict(mod.named_parameters(recurse=False))
         # we shouldn't apply data parallel to the modules that are already
```

torchtitan/models/llama3/model/model.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -405,6 +405,7 @@ def __init__(self, model_args: TransformerModelArgs):
             self.layers[str(layer_id)] = TransformerBlock(layer_id, model_args)
         self.norm = nn.RMSNorm(model_args.dim, eps=model_args.norm_eps)
         self.output = nn.Linear(model_args.dim, model_args.vocab_size, bias=False)
+        self.total_list = []

     def init_weights(
         self,
@@ -498,10 +499,13 @@ def forward(

         """
         # passthrough for nonexistent layers, allows easy configuration of pipeline parallel stages
+        self.total_list = []
         h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens
+        self.total_list.append(h)

         for layer in self.layers.values():
             h = layer(h, self.freqs_cis, attention_masks=attention_masks)
+            self.total_list.append(h)

         h = self.norm(h) if self.norm else h
         output = self.output(h) if self.output else h
```
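The `total_list` additions stash the embedding output and every block output on the module purely for debugging, which also keeps those activations alive until the next forward pass. If one wanted the same per-layer capture without editing `forward`, forward hooks are a common alternative; the following is a hedged sketch on a hypothetical stand-in model, not what this commit does.

```python
# Sketch: capture per-layer outputs with forward hooks instead of storing them
# on the module inside forward(). Hypothetical stand-in model; not the commit's approach.
import torch
import torch.nn as nn

captured: list[torch.Tensor] = []


def capture_output(module: nn.Module, inputs, output) -> None:
    # Detach so the debug copies do not extend the autograd graph.
    captured.append(output.detach())


model = nn.Sequential(nn.Embedding(100, 16), nn.Linear(16, 16), nn.Linear(16, 16))
handles = [m.register_forward_hook(capture_output) for m in model]

model(torch.randint(0, 100, (2, 8)))
print([t.norm(p=2, dtype=torch.float64) for t in captured])

for h in handles:
    h.remove()  # detach the hooks when done debugging
```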

torchtitan/train.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -412,6 +412,9 @@ def batch_generator(

             yield input_dict, labels

+    def custom_hash_fn(self, tensor):
+        return tensor.norm(p=2, dtype=torch.float64)
+
     def forward_backward_step(
         self, input_dict: dict[str, torch.Tensor], labels: torch.Tensor
     ) -> torch.Tensor:
@@ -488,6 +491,9 @@ def forward_backward_step(
                     pred = model_parts[0](inputs, **extra_inputs, **extra_args)
                     loss = self.loss_fn(pred, labels)
                 # need to free pred before bwd to avoid peaking memory
+                for res in model_parts[0].total_list:
+                    print("[FWD] pred results", self.custom_hash_fn(res))
+                print("[FWD] pred results", self.custom_hash_fn(pred))
                 del pred
                 loss.backward()

@@ -521,6 +527,10 @@ def train_step(
                 ),
                 ep_enabled=parallel_dims.ep_enabled,
             )
+
+        for m in self.model_parts:
+            for p_name, p in m.named_parameters():
+                print("[BWD] grad", self.custom_hash_fn(p).to_local())
         self.checkpointer.maybe_wait_for_staging()
         self.optimizers.step()
         self.lr_schedulers.step()
```
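`custom_hash_fn` reduces a tensor to its L2 norm in float64, which gives a cheap fingerprint for comparing activations and gradients across two runs (for example, an eager baseline versus the manual-bucketing backend). Below is a hedged sketch of how such logged fingerprints might be compared, assuming the `[FWD]`/`[BWD]` lines were redirected to two log files; the file handling and log parsing are hypothetical, not part of this commit.

```python
# Hypothetical sketch: compare the "[FWD]"/"[BWD]" norm fingerprints printed above
# from two training runs (e.g. baseline vs. manual-bucketing backend).
import re
import sys


def read_norms(path: str) -> list[float]:
    # Grab the first floating-point value on each [FWD]/[BWD] line.
    pattern = re.compile(r"\[(?:FWD|BWD)\].*?([0-9]+\.[0-9]+(?:e[+-]?\d+)?)")
    norms = []
    with open(path) as f:
        for line in f:
            if (m := pattern.search(line)):
                norms.append(float(m.group(1)))
    return norms


if __name__ == "__main__":
    baseline, candidate = read_norms(sys.argv[1]), read_norms(sys.argv[2])
    for i, (a, b) in enumerate(zip(baseline, candidate)):
        rel = abs(a - b) / max(abs(a), 1e-12)
        flag = "" if rel < 1e-6 else "  <-- mismatch"
        print(f"entry {i}: {a:.12e} vs {b:.12e} rel_diff={rel:.2e}{flag}")
```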
