Merged
2 changes: 2 additions & 0 deletions examples/configs/grpo_math_1B.yaml
@@ -85,6 +85,8 @@ policy:
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True
+# gives ~25% training perf speedup with sequence packing and apply_rope_fusion
+bias_activation_fusion: True
defer_fp32_logits: null

optimizer:
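For context, a minimal Python sketch (not part of this PR) of how one might confirm the new flag is enabled after loading one of these example configs. It assumes PyYAML is available and that the keys sit under policy.megatron_cfg, mirroring how the policy worker reads them below; adjust the nesting if the config layout differs.

# Illustrative check only: load an example config and verify both fusion flags.
import yaml

with open("examples/configs/grpo_math_1B.yaml") as f:
    cfg = yaml.safe_load(f)

megatron_cfg = cfg["policy"]["megatron_cfg"]  # assumed nesting
assert megatron_cfg["apply_rope_fusion"] is True
assert megatron_cfg["bias_activation_fusion"] is True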
4 changes: 3 additions & 1 deletion examples/configs/sft.yaml
@@ -89,7 +89,9 @@ policy:
moe_router_bias_update_rate: 1e-3
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
-apply_rope_fusion: True
+apply_rope_fusion: True
+# gives ~25% training perf speedup with sequence packing and apply_rope_fusion
+bias_activation_fusion: True

optimizer:
optimizer: "adam"
2 changes: 2 additions & 0 deletions examples/configs/sft_openmathinstruct2_megatron.yaml
@@ -89,6 +89,8 @@ policy:
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True
+# gives ~25% training perf speedup with sequence packing and apply_rope_fusion
+bias_activation_fusion: True

env_vars:
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False"
4 changes: 4 additions & 0 deletions nemo_rl/models/policy/__init__.py
@@ -104,6 +104,10 @@ class MegatronConfig(TypedDict):
expert_tensor_parallel_size: int
expert_model_parallel_size: int
defer_fp32_logits: NotRequired[bool]
+# gives ~20% training perf speedup with sequence packing
+apply_rope_fusion: bool
+# gives ~25% training perf speedup with sequence packing and apply_rope_fusion
+bias_activation_fusion: bool

optimizer: NotRequired[MegatronOptimizerConfig]
scheduler: NotRequired[MegatronSchedulerConfig]
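As a reading aid, a standalone sketch (not the real MegatronConfig, and the class/variable names here are illustrative): the two fusion flags are declared as plain required TypedDict keys, unlike the NotRequired defer_fp32_logits above them, so a type checker will expect every megatron_cfg to spell out both values.

# Illustrative, cut-down TypedDict with the same shape as the new fields.
# Requires Python 3.11+ for NotRequired in typing (or use typing_extensions).
from typing import NotRequired, TypedDict

class FusionFlagsExample(TypedDict):
    defer_fp32_logits: NotRequired[bool]  # optional key, may be omitted
    apply_rope_fusion: bool               # required; ~20% speedup with sequence packing
    bias_activation_fusion: bool          # required; ~25% speedup with sequence packing and rope fusion

# A config that omits bias_activation_fusion would now be flagged by mypy/pyright.
flags: FusionFlagsExample = {
    "apply_rope_fusion": True,
    "bias_activation_fusion": True,
}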
3 changes: 3 additions & 0 deletions nemo_rl/models/policy/megatron_policy_worker.py
@@ -603,6 +603,9 @@ def __init__(
"https://github.com/NVIDIA/Megatron-LM/blob/1ab876ddc4c1893c76f26d775226a8d1dcdfb3d2/megatron/core/transformer/mlp.py#L174."
)
model_cfg.apply_rope_fusion = self.cfg["megatron_cfg"]["apply_rope_fusion"]
+model_cfg.bias_activation_fusion = self.cfg["megatron_cfg"][
+"bias_activation_fusion"
+]
fp8_cfg = self.cfg["megatron_cfg"].get("fp8_cfg", None)
self.fp8_cfg = fp8_cfg
if fp8_cfg is not None and fp8_cfg.get("enabled", False):
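For readers who do not want to open the worker, a minimal sketch of what this change amounts to. The function name apply_fusion_flags is illustrative, and model_cfg stands in for the Megatron model config object the worker builds; the real code does this inline in the worker's __init__.

# Illustrative sketch of the flag forwarding: the YAML values are copied
# verbatim onto the Megatron model config, so each fusion can be toggled
# per experiment without code changes.
def apply_fusion_flags(model_cfg, megatron_cfg: dict) -> None:
    # ~20% training perf speedup with sequence packing (per the config comments)
    model_cfg.apply_rope_fusion = megatron_cfg["apply_rope_fusion"]
    # ~25% training perf speedup with sequence packing and apply_rope_fusion
    model_cfg.bias_activation_fusion = megatron_cfg["bias_activation_fusion"]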
1 change: 1 addition & 0 deletions tests/unit/models/generation/test_vllm_generation.py
@@ -177,6 +177,7 @@ def get_basic_megatron_test_config(
"moe_router_bias_update_rate": 0.0,
"moe_permute_fusion": False,
"apply_rope_fusion": True,
+"bias_activation_fusion": True,
"train_iters": 100, # Required for Megatron training
"optimizer": {
"optimizer": "adam",
1 change: 1 addition & 0 deletions tests/unit/models/policy/test_megatron_worker.py
@@ -98,6 +98,7 @@ def create_megatron_test_config(
"moe_router_bias_update_rate": 0.0,
"moe_permute_fusion": False,
"apply_rope_fusion": True,
+"bias_activation_fusion": True,
"defer_fp32_logits": defer_fp32_logits,
"train_iters": 100, # Required for Megatron training
"optimizer": {