
Commit 2c13a68

Add post_dataloading_processing method to Trainer
We are adding more actions to convert the raw inputs and labels:

1. The new CP can do the input/label/BlockMask sharding in this method.
2. The experimental full-DTensor model can simply override this method without changing much Trainer code.

This method is extracted from #1857. Making it a standalone PR allows us to continue the two projects above without one blocking the other.

ghstack-source-id: d1882a7
Pull-Request: #1985
1 parent 8659543 commit 2c13a68
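
To illustrate the intended use, here is a minimal sketch, not part of this commit, of how a downstream Trainer subclass might override the new hook. The ExperimentalTrainer name and the pass-through body are hypothetical, and the import assumes the Trainer class defined in torchtitan/train.py.

from typing import Any

import torch

from torchtitan.train import Trainer  # assumes the Trainer class from this file


class ExperimentalTrainer(Trainer):
    """Hypothetical subclass that customizes post-dataloading processing."""

    def post_dataloading_processing(
        self, input_dict: dict[str, torch.Tensor], label: torch.Tensor
    ) -> tuple[
        dict[str, torch.Tensor], torch.Tensor, dict[str, torch.Tensor], dict[str, Any]
    ]:
        # Reuse the default behavior: extract inputs/extra_inputs and build
        # attention masks when flex attention is enabled.
        inputs, label, extra_inputs, extra_kwargs = super().post_dataloading_processing(
            input_dict, label
        )
        # Project-specific conversion would go here, e.g. sharding the
        # input/label/BlockMask for the new CP, or converting tensors to
        # DTensors for the full-DTensor model. This placeholder returns the
        # batch unchanged.
        return inputs, label, extra_inputs, extra_kwargs

Because forward_backward_step now routes through this hook, such an override takes effect without touching the rest of the training loop.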

File tree

1 file changed: +19 / -8 lines


torchtitan/train.py

Lines changed: 19 additions & 8 deletions
@@ -410,26 +410,37 @@ def batch_generator(
 
         yield input_dict, labels
 
-    def forward_backward_step(
-        self, input_dict: dict[str, torch.Tensor], labels: torch.Tensor
-    ) -> torch.Tensor:
-        model_parts = self.model_parts
-        parallel_dims = self.parallel_dims
-
+    def post_dataloading_processing(
+        self, input_dict: dict[str, torch.Tensor], label: torch.Tensor
+    ) -> tuple[
+        dict[str, torch.Tensor], torch.Tensor, dict[str, torch.Tensor], dict[str, Any]
+    ]:
+        """Post processing after data loading."""
         inputs = input_dict["input"]
         extra_inputs = {k: v for k, v in input_dict.items() if k != "input"}
         # For arguments, like attention_masks, we have to put them in a separate
         # dict as extra_inputs are not forwarded to other stages in PP, but
         # extra_kwargs are.
-        extra_kwargs = {}
+        extra_kwargs: dict[str, Any] = {}
 
         if getattr(self.model_args, "use_flex_attn", False):
-            extra_kwargs["attention_masks"] = model_parts[0].get_attention_masks(
+            extra_kwargs["attention_masks"] = self.model_parts[0].get_attention_masks(
                 input_batch=inputs,
                 tokenizer=self.tokenizer,
                 extra_inputs=extra_inputs,
             )
 
+        return inputs, label, extra_inputs, extra_kwargs
+
+    def forward_backward_step(
+        self, input_dict: dict[str, torch.Tensor], labels: torch.Tensor
+    ) -> torch.Tensor:
+        model_parts = self.model_parts
+        parallel_dims = self.parallel_dims
+
+        inputs, label, extra_inputs, extra_kwargs = self.post_dataloading_processing(
+            input_dict, labels
+        )
         # apply context parallelism if cp is enabled
         # ensure CP handles the separate freqs_cis buffer for each pp stage
         optional_context_parallel_ctx = (
