Merged
2 changes: 2 additions & 0 deletions examples/configs/grpo_math_1B.yaml
@@ -85,6 +85,8 @@ policy:
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True
+# gives ~25% training perf speedup with sequence packing and apply_rope_fusion
+bias_activation_fusion: True
defer_fp32_logits: null

optimizer:
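For context, a minimal Python sketch (not part of this PR) of how one might confirm the new flag is enabled after loading one of these example configs. It assumes PyYAML is available and that the keys sit under policy.megatron_cfg, mirroring how the policy worker reads them below; adjust the nesting if the config layout differs.

# Illustrative check only: load an example config and verify both fusion flags.
import yaml

with open("examples/configs/grpo_math_1B.yaml") as f:
    cfg = yaml.safe_load(f)

megatron_cfg = cfg["policy"]["megatron_cfg"]  # assumed nesting
assert megatron_cfg["apply_rope_fusion"] is True
assert megatron_cfg["bias_activation_fusion"] is True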
4 changes: 3 additions & 1 deletion examples/configs/sft.yaml
@@ -89,7 +89,9 @@ policy:
moe_router_bias_update_rate: 1e-3
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
-apply_rope_fusion: True
+apply_rope_fusion: True
+# gives ~25% training perf speedup with sequence packing and apply_rope_fusion
+bias_activation_fusion: True

optimizer:
optimizer: "adam"
2 changes: 2 additions & 0 deletions examples/configs/sft_openmathinstruct2_megatron.yaml
@@ -89,6 +89,8 @@ policy:
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True
+# gives ~25% training perf speedup with sequence packing and apply_rope_fusion
+bias_activation_fusion: True

env_vars:
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False"
4 changes: 4 additions & 0 deletions nemo_rl/models/policy/__init__.py
@@ -104,6 +104,10 @@ class MegatronConfig(TypedDict):
expert_tensor_parallel_size: int
expert_model_parallel_size: int
defer_fp32_logits: NotRequired[bool]
+# gives ~20% training perf speedup with sequence packing
+apply_rope_fusion: bool
+# gives ~25% training perf speedup with sequence packing and apply_rope_fusion
+bias_activation_fusion: bool

optimizer: NotRequired[MegatronOptimizerConfig]
scheduler: NotRequired[MegatronSchedulerConfig]
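As a reading aid, a standalone sketch (not the real MegatronConfig, and the class/variable names here are illustrative): the two fusion flags are declared as plain required TypedDict keys, unlike the NotRequired defer_fp32_logits above them, so a type checker will expect every megatron_cfg to spell out both values.

# Illustrative, cut-down TypedDict with the same shape as the new fields.
# Requires Python 3.11+ for NotRequired in typing (or use typing_extensions).
from typing import NotRequired, TypedDict

class FusionFlagsExample(TypedDict):
    defer_fp32_logits: NotRequired[bool]  # optional key, may be omitted
    apply_rope_fusion: bool               # required; ~20% speedup with sequence packing
    bias_activation_fusion: bool          # required; ~25% speedup with sequence packing and rope fusion

# A config that omits bias_activation_fusion would now be flagged by mypy/pyright.
flags: FusionFlagsExample = {
    "apply_rope_fusion": True,
    "bias_activation_fusion": True,
}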
3 changes: 3 additions & 0 deletions nemo_rl/models/policy/megatron_policy_worker.py
@@ -603,6 +603,9 @@ def __init__(
"https://github.com/NVIDIA/Megatron-LM/blob/1ab876ddc4c1893c76f26d775226a8d1dcdfb3d2/megatron/core/transformer/mlp.py#L174."
)
model_cfg.apply_rope_fusion = self.cfg["megatron_cfg"]["apply_rope_fusion"]
+model_cfg.bias_activation_fusion = self.cfg["megatron_cfg"][
+"bias_activation_fusion"
+]
fp8_cfg = self.cfg["megatron_cfg"].get("fp8_cfg", None)
self.fp8_cfg = fp8_cfg
if fp8_cfg is not None and fp8_cfg.get("enabled", False):
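For readers who do not want to open the worker, a minimal sketch of what this change amounts to. The function name apply_fusion_flags is illustrative, and model_cfg stands in for the Megatron model config object the worker builds; the real code does this inline in the worker's __init__.

# Illustrative sketch of the flag forwarding: the YAML values are copied
# verbatim onto the Megatron model config, so each fusion can be toggled
# per experiment without code changes.
def apply_fusion_flags(model_cfg, megatron_cfg: dict) -> None:
    # ~20% training perf speedup with sequence packing (per the config comments)
    model_cfg.apply_rope_fusion = megatron_cfg["apply_rope_fusion"]
    # ~25% training perf speedup with sequence packing and apply_rope_fusion
    model_cfg.bias_activation_fusion = megatron_cfg["bias_activation_fusion"]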
1 change: 1 addition & 0 deletions tests/unit/models/generation/test_vllm_generation.py
@@ -177,6 +177,7 @@ def get_basic_megatron_test_config(
"moe_router_bias_update_rate": 0.0,
"moe_permute_fusion": False,
"apply_rope_fusion": True,
+"bias_activation_fusion": True,
"train_iters": 100, # Required for Megatron training
"optimizer": {
"optimizer": "adam",
1 change: 1 addition & 0 deletions tests/unit/models/policy/test_megatron_worker.py
@@ -98,6 +98,7 @@ def create_megatron_test_config(
"moe_router_bias_update_rate": 0.0,
"moe_permute_fusion": False,
"apply_rope_fusion": True,
+"bias_activation_fusion": True,
"defer_fp32_logits": defer_fp32_logits,
"train_iters": 100, # Required for Megatron training
"optimizer": {