Add comments and description in the MegatronConfig(TypedDict)

katec846 · katec846 · commit f8eb594ce767 · 2025-10-08T10:13:50.000-07:00
Signed-off-by: Kate Cheng <yunhsuanc@nvidia.com>
diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml
@@ -85,7 +85,7 @@ policy:
     moe_permute_fusion: false
     #gives ~20% training perf speedup with sequence packing
     apply_rope_fusion: True
-    # gives training perf speedup
+    # gives ~25% training perf speedup with sequence packing and apply_rope_fusion
     bias_activation_fusion: True
     defer_fp32_logits: null
 
diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml
@@ -90,7 +90,7 @@ policy:
     moe_permute_fusion: false
     #gives ~20% training perf speedup with sequence packing 
     apply_rope_fusion: True
-    # gives training perf speedup
+    # gives ~25% training perf speedup with sequence packing and apply_rope_fusion
     bias_activation_fusion: True
 
     optimizer:
diff --git a/examples/configs/sft_openmathinstruct2_megatron.yaml b/examples/configs/sft_openmathinstruct2_megatron.yaml
@@ -85,7 +85,7 @@ policy:
     moe_permute_fusion: false
     #gives ~20% training perf speedup with sequence packing
     apply_rope_fusion: True
-    # gives training perf speedup
+    # gives ~25% training perf speedup with sequence packing and apply_rope_fusion
     bias_activation_fusion: True
 
     env_vars:
diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py
@@ -99,6 +99,10 @@ class MegatronConfig(TypedDict):
     expert_tensor_parallel_size: int
     expert_model_parallel_size: int
     defer_fp32_logits: NotRequired[bool]
+    # gives ~20% training perf speedup with sequence packing
+    apply_rope_fusion: bool
+    # gives ~25% training perf speedup with sequence packing and apply_rope_fusion
+    bias_activation_fusion: bool
 
     optimizer: NotRequired[MegatronOptimizerConfig]
     scheduler: NotRequired[MegatronSchedulerConfig]