4 files changed: +7 -3 lines changed
lines changed Original file line number Diff line number Diff line change @@ -85,7 +85,7 @@ policy:
8585 moe_permute_fusion : false
8686 # gives ~20% training perf speedup with sequence packing
8787 apply_rope_fusion : True
88- # gives training perf speedup
88+ # gives ~25% training perf speedup with sequence packing and apply_rope_fusion
8989 bias_activation_fusion : True
9090 defer_fp32_logits : null
9191
Original file line number Diff line number Diff line change @@ -90,7 +90,7 @@ policy:
9090 moe_permute_fusion : false
9191 # gives ~20% training perf speedup with sequence packing
9292 apply_rope_fusion : True
93- # gives training perf speedup
93+ # gives ~25% training perf speedup with sequence packing and apply_rope_fusion
9494 bias_activation_fusion : True
9595
9696 optimizer :
Original file line number Diff line number Diff line change @@ -85,7 +85,7 @@ policy:
8585 moe_permute_fusion : false
8686 # gives ~20% training perf speedup with sequence packing
8787 apply_rope_fusion : True
88- # gives training perf speedup
88+ # gives ~25% training perf speedup with sequence packing and apply_rope_fusion
8989 bias_activation_fusion : True
9090
9191 env_vars :
Original file line number Diff line number Diff line change @@ -99,6 +99,10 @@ class MegatronConfig(TypedDict):
9999 expert_tensor_parallel_size : int
100100 expert_model_parallel_size : int
101101 defer_fp32_logits : NotRequired [bool ]
102+ # gives ~20% training perf speedup with sequence packing
103+ apply_rope_fusion : bool
104+ # gives ~25% training perf speedup with sequence packing and apply_rope_fusion
105+ bias_activation_fusion : bool
102106
103107 optimizer : NotRequired [MegatronOptimizerConfig ]
104108 scheduler : NotRequired [MegatronSchedulerConfig ]
You can’t perform that action at this time.
0 commit comments