
Commit 8fed32a

committed
misc
1 parent 1b3d37d commit 8fed32a

File tree

1 file changed: +66 -53 lines changed


torchtitan/distributed/activation_checkpoint.py

Lines changed: 66 additions & 53 deletions
@@ -22,9 +22,7 @@
 _layer_sac_count = 0
 
 
-def _apply_layer_sac(
-    module: nn.Module, ac_config: ACConfig, *, ac_freq: int | None = None
-) -> nn.Module:
+def _apply_layer_sac(module: nn.Module, ac_config: ACConfig) -> nn.Module:
     """Apply layer selective activation checkpointing to the module.
 
     Args:
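
This signature change drops the explicit ac_freq keyword; per-layer SAC now only needs the config. A minimal sketch of how wrapping every Nth block typically works, reusing the _layer_sac_count counter shown above and assuming ac_config.selective_ac_option holds the frequency as a digit string (per the validation further down in this file). This is illustrative only, not the exact body of _apply_layer_sac:

# Illustrative sketch, not torchtitan's exact implementation.
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
    checkpoint_wrapper as ptd_checkpoint_wrapper,
)

def _layer_sac_sketch(module, ac_config):
    global _layer_sac_count
    _layer_sac_count += 1
    ac_freq = int(ac_config.selective_ac_option)  # digit string, e.g. "2"
    if ac_freq <= 1 or _layer_sac_count % ac_freq == 0:
        # Checkpoint this block: drop its activations and recompute them in backward.
        return ptd_checkpoint_wrapper(module, preserve_rng_state=False)
    return module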
@@ -58,12 +56,11 @@ def _apply_op_sac(
         module (nn.Module): The module to apply selective activation checkpointing to.
         ac_config (ActivationCheckpoint): The activation checkpointing config.
         base_fqn (str, optional): The base fqn of the module. Defaults to None.
-        save_list (set[torch._ops.OpOverload]): The list of ops to save when selective
-            activation checkpointing is used.
+        save_list (set[torch._ops.OpOverload]): The list of ops to save instead
+            of recomputing.
 
     Returns:
         nn.Module: The module with selective activation checkpointing applied.
-
     """
     from torch.utils.checkpoint import (
         CheckpointPolicy,
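
The reworded save_list description ("ops to save instead of recomputing") matches the op-level SAC mechanism that the CheckpointPolicy import above points at: a policy function marks the listed ops as MUST_SAVE and lets everything else be recomputed in backward. A hedged, self-contained illustration; the op set and helper names are examples, not torchtitan's actual list:

import torch
from functools import partial
from torch.utils.checkpoint import (
    CheckpointPolicy,
    checkpoint,
    create_selective_checkpoint_contexts,
)

# Example save list: keep matmul results; recompute everything else in backward.
example_save_list = {torch.ops.aten.mm.default}

def _policy_fn(ctx, op, *args, **kwargs):
    if op in example_save_list:
        return CheckpointPolicy.MUST_SAVE
    return CheckpointPolicy.PREFER_RECOMPUTE

def run_block_with_op_sac(block, x):
    # context_fn must build fresh context managers on every call.
    context_fn = partial(create_selective_checkpoint_contexts, _policy_fn)
    return checkpoint(block, x, use_reentrant=False, context_fn=context_fn)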
@@ -130,59 +127,29 @@ def _apply_full_ac(module: nn.Module, ac_config: ACConfig) -> nn.Module:
     )
 
 
-def _apply_ac_to_transformer_block(
+def _apply_op_sac_to_transformer_block_with_flex(
     module: nn.Module,
     ac_config: ACConfig,
     *,
     base_fqn: str | None = None,
     model_compile_enabled: bool = False,
-    use_flex_attn: bool = False,
-    save_list: set[torch._ops.OpOverload] | None = None,
+    save_list: set[torch._ops.OpOverload],
 ) -> nn.Module:
-    valid_ac_modes = ("full", "selective")
-    if ac_config.mode not in valid_ac_modes:
-        raise ValueError(
-            f"Invalid AC mode: {ac_config.mode}. Valid modes: {valid_ac_modes}"
-        )
+    """Apply SAC to the transformer block that uses FlexAttention.
 
-    if ac_config.mode == "full":
-        return _apply_full_ac(module, ac_config)
-
-    assert ac_config.mode == "selective", f"{ac_config.mode}"
-    use_op_sac = ac_config.selective_ac_option == "op"
-    use_layer_sac = ac_config.selective_ac_option.isdigit()
-    if not use_op_sac and not use_layer_sac:
-        raise ValueError(
-            f"Invalid selective AC option: {ac_config.selective_ac_option}. "
-            f"Valid options: 'op' or a positive int representing layer frequency"
-        )
-
-    if use_op_sac:
-        save_list = save_list or set()
-        if use_flex_attn:
-            return _apply_op_sac_to_transformer_block_with_flex(
-                module,
-                ac_config,
-                base_fqn=base_fqn,
-                model_compile_enabled=model_compile_enabled,
-                save_list=save_list,
-            )
-        else:
-            return _apply_op_sac(
-                module, ac_config, base_fqn=base_fqn, save_list=save_list
-            )
-
-    return _apply_layer_sac(module, ac_config)
+    Args:
+        module (nn.Module): The transformer block to apply SAC to.
+        ac_config (ACConfig): The activation checkpointing config.
+        base_fqn (str, optional): The base fqn of the module. Defaults to None.
+        model_compile_enabled (bool): Whether model compilation is enabled.
+            Defaults to False.
+        save_list (set[torch._ops.OpOverload]): The list of ops to save instead
+            of recomputing.
 
+    Returns:
+        nn.Module: The transformer block with SAC applied.
+    """
 
-def _apply_op_sac_to_transformer_block_with_flex(
-    module: nn.Module,
-    ac_config: ACConfig,
-    *,
-    base_fqn: str | None = None,
-    model_compile_enabled: bool = False,
-    save_list: set[torch._ops.OpOverload],
-) -> nn.Module:
     warn_once(
         logger,
         (
@@ -227,6 +194,51 @@ def _apply_op_sac_to_transformer_block_with_flex(
     return module
 
 
+def _apply_ac_to_transformer_block(
+    module: nn.Module,
+    ac_config: ACConfig,
+    *,
+    base_fqn: str | None = None,
+    model_compile_enabled: bool = False,
+    use_flex_attn: bool = False,
+    save_list: set[torch._ops.OpOverload] | None = None,
+) -> nn.Module:
+    valid_ac_modes = ("full", "selective")
+    if ac_config.mode not in valid_ac_modes:
+        raise ValueError(
+            f"Invalid AC mode: {ac_config.mode}. Valid modes: {valid_ac_modes}"
+        )
+
+    if ac_config.mode == "full":
+        return _apply_full_ac(module, ac_config)
+
+    assert ac_config.mode == "selective", f"{ac_config.mode}"
+    use_op_sac = ac_config.selective_ac_option == "op"
+    use_layer_sac = ac_config.selective_ac_option.isdigit()
+    if not use_op_sac and not use_layer_sac:
+        raise ValueError(
+            f"Invalid selective AC option: {ac_config.selective_ac_option}. "
+            f"Valid options: 'op' or a positive int representing layer frequency"
+        )
+
+    if use_op_sac:
+        save_list = save_list or set()
+        if use_flex_attn:
+            return _apply_op_sac_to_transformer_block_with_flex(
+                module,
+                ac_config,
+                base_fqn=base_fqn,
+                model_compile_enabled=model_compile_enabled,
+                save_list=save_list,
+            )
+        else:
+            return _apply_op_sac(
+                module, ac_config, base_fqn=base_fqn, save_list=save_list
+            )
+
+    return _apply_layer_sac(module, ac_config)
+
+
 def apply_ac(
     model: nn.Module,
     ac_config: ACConfig,
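
This hunk relocates _apply_ac_to_transformer_block below the flex variant without changing its dispatch logic: full AC, op-level SAC (with a FlexAttention-specific path), or layer-frequency SAC. A rough sketch of how apply_ac typically drives it over a model's blocks; the model.layers attribute and the register_module swap are assumptions for illustration, not the exact body of apply_ac:

def apply_ac_sketch(model, ac_config, *, model_compile_enabled=False,
                    use_flex_attn=False, save_list=None):
    # Replace each transformer block with its AC-wrapped counterpart.
    for layer_id, block in model.layers.named_children():
        wrapped = _apply_ac_to_transformer_block(
            block,
            ac_config,
            base_fqn=f"layers.{layer_id}",
            model_compile_enabled=model_compile_enabled,
            use_flex_attn=use_flex_attn,
            save_list=save_list,
        )
        model.layers.register_module(layer_id, wrapped)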
@@ -238,15 +250,16 @@ def apply_ac(
     """Apply activation checkpointing to the model.
 
     Note that SAC, Flex Attention and model compilation have some conflicts.
-    We explicitly ask the user to pass these configs to warn if there are conflicts.
+    We explicitly ask the user to pass these configs to warn as the wrapping
+    will be different.
 
     Args:
         model (nn.Module): The model to apply activation checkpointing to.
         ac_config (ActivationCheckpoint): The activation checkpointing config.
         model_compile_enabled (bool): Whether torch.compile is enabled for the model.
         use_flex_attn (bool): Whether flex attention is enabled for the model.
-        save_list (set[torch._ops.OpOverload]): The list of ops to save when selective
-            activation checkpointing is used.
+        save_list (set[torch._ops.OpOverload]): The list of ops to save instead
+            of recomputing.
     Returns:
         None
     """
