
Commit cac49dc

Malay/qwen3 2509 (#729)

* qwen3 perf configs
* exp name
* dependencies
* qwen3 235b a22b
* fsdp cfgs
* recompute cfgs
* moe router fusion
* 405b cfg
* mbs
* no CG dsv3, qwen3 235b b200
* dsv3 layout
* fsdp args
* llama3 fsdp args
* moe token drop
* fsdp double buffer
* vp 2 dsv3
* avg in collective, grad accum fusion
* avg in collective
* cleanup
* allow grad accum fusion
* expandable segments 405b gb200 fsdp
* fix ut

Signed-off-by: Malay Nagda <[email protected]>
Signed-off-by: malay-nagda <[email protected]>
Signed-off-by: gkollu <[email protected]>
Co-authored-by: gautham-kollu <[email protected]>
1 parent 6941a93 commit cac49dc

File tree

12 files changed: +365 −83 lines


scripts/performance/configs/deepseek/deepseek_v3_llm_pretrain.yaml

Lines changed: 4 additions & 7 deletions
@@ -45,9 +45,6 @@ ConfigContainer:
 
   mixed_precision:
     grad_reduce_in_fp32: false
-
-  comm_overlap:
-    overlap_grad_reduce: false # TODO: enable when 1F1B_A2A is fixed in Megatron-LM
 
   profiling:
     # For optional fields in the config, specify the target to instantiate the object.
@@ -74,7 +71,7 @@ perf_matrix:
         vp: 4
         ep: 64
         etp: 1
-        fsdp: false
+        use_megatron_fsdp: false
       bf16:
       fp8_cs:
       fp8_ss:
@@ -91,7 +88,7 @@ perf_matrix:
         vp: null
         ep: 8
         etp: 1
-        fsdp: false
+        use_megatron_fsdp: false
       bf16:
         cuda_graphs: false
       fp8_cs:
@@ -111,8 +108,8 @@ perf_matrix:
         vp: 8
         ep: 64
         etp: 1
-        fsdp: false
-        cuda_graphs: true
+        use_megatron_fsdp: false
+        cuda_graphs: false
       bf16:
       fp8_cs:
       fp8_mx:
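
Across these configs, the per-precision `fsdp` flag is renamed to `use_megatron_fsdp`. Each `perf_matrix` is keyed by platform → GPU count → precision, with a `common` block that the precision blocks override. A minimal Python sketch of how such an entry could be resolved — a hypothetical helper for illustration, not the actual loader in `scripts/performance`:

```python
# Hypothetical resolver for the perf_matrix layout shown above; the real
# loader in scripts/performance may be organized differently.
import yaml

def resolve_perf_entry(cfg_path: str, platform: str, num_gpus: int, precision: str) -> dict:
    """Merge a perf_matrix `common` block with one precision-specific block."""
    with open(cfg_path) as f:
        doc = yaml.safe_load(f)
    bucket = doc["perf_matrix"][platform][f"num_gpus_{num_gpus}"]
    merged = dict(bucket["common"])
    # Empty precision blocks (e.g. a bare `bf16:`) parse as None, hence `or {}`.
    merged.update(bucket.get(precision) or {})
    return merged
```

With the rename, a resolved entry carries `use_megatron_fsdp` rather than `fsdp`, so downstream code can map it directly onto the DDP configuration.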

scripts/performance/configs/llama3/llama3_70b_llm_pretrain.yaml

Lines changed: 21 additions & 21 deletions
@@ -19,9 +19,7 @@
 ConfigContainer:
   model:
     cross_entropy_fusion_impl: "te"
-    enable_cuda_graph: false
-    use_te_rng_tracker: false
-    use_transformer_engine_op_fuser: true
+    use_transformer_engine_op_fuser: false
 
   train:
     train_iters: 25
@@ -40,8 +38,6 @@ ConfigContainer:
   ddp:
     check_for_nan_in_grad: false
     check_for_large_grads: false
-    use_megatron_fsdp: true
-    use_torch_fsdp2: false
 
   mixed_precision:
     grad_reduce_in_fp32: false
@@ -69,7 +65,7 @@ perf_matrix:
         vp: 5
         ep: 1
         etp: null
-        fsdp: false
+        use_megatron_fsdp: false
       bf16:
         pp: 4
         cp: 2
@@ -93,60 +89,64 @@ perf_matrix:
         pp: 4
         cp: 2
         vp: 5
-        fsdp: false
+        use_megatron_fsdp: false
         cuda_graphs: true
       fp8_ds:
         tp: 1
         pp: 1
         cp: 1
-        vp: 1
-        fsdp: true
+        vp: null
+        use_megatron_fsdp: true
         cuda_graphs: false
         recompute_num_layers: 5
       fp8_cs:
         tp: 1
         pp: 1
         cp: 1
-        vp: 1
-        fsdp: true
+        vp: null
+        use_megatron_fsdp: true
         cuda_graphs: false
         recompute_num_layers: 5
       fp8_mx:
         tp: 2
         pp: 4
         cp: 1
         vp: 5
-        fsdp: false
+        use_megatron_fsdp: false
         cuda_graphs: false
   gb200:
     num_gpus_64:
       common:
         num_gpus_per_node: 4
         seq_length: 8192
-        mbs: 1
         gbs: 128
         cuda_graphs: false
         cp: 1
         ep: 1
         etp: null
       bf16:
+        mbs: 1
         tp: 1
         pp: 1
-        vp: 1
-        fsdp: true
-        recompute_num_layers: 20
+        vp: null
+        use_megatron_fsdp: true
+        cpu_offloading_num_layers: 20
       fp8_ds:
+        mbs: 1
         tp: 1
         pp: 1
-        vp: 1
-        fsdp: true
+        vp: null
+        use_megatron_fsdp: true
       fp8_cs:
+        mbs: 2
         tp: 1
         pp: 1
-        vp: 1
-        fsdp: true
+        vp: null
+        use_megatron_fsdp: true
+        cpu_offloading_num_layers: 40
       fp8_mx:
+        mbs: 1
         tp: 2
         pp: 4
         vp: 5
-        fsdp: false
+        use_megatron_fsdp: false
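
With `use_megatron_fsdp`/`use_torch_fsdp2` dropped from the global `ddp:` block, the FSDP choice now travels with each `perf_matrix` entry. A sketch of how a resolved entry might be pushed onto the training config — the dataclass names below are illustrative stand-ins, not the actual `megatron.bridge` types:

```python
# Illustrative only: field names mirror the YAML above, but the concrete
# config classes in megatron.bridge may differ.
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class DDPConfig:
    check_for_nan_in_grad: bool = False
    check_for_large_grads: bool = False
    use_megatron_fsdp: bool = False  # now chosen per perf_matrix entry

@dataclass
class TrainConfig:
    ddp: DDPConfig = field(default_factory=DDPConfig)
    tp: int = 1
    pp: int = 1
    cp: int = 1
    vp: Optional[int] = None

def apply_perf_entry(cfg: TrainConfig, entry: dict) -> TrainConfig:
    """Copy parallelism and FSDP settings from a resolved perf_matrix entry."""
    cfg.ddp.use_megatron_fsdp = bool(entry.get("use_megatron_fsdp", False))
    for key in ("tp", "pp", "cp", "vp"):
        if key in entry:
            setattr(cfg, key, entry[key])
    return cfg
```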

scripts/performance/configs/llama3/llama3_8b_llm_pretrain.yaml

Lines changed: 6 additions & 6 deletions
@@ -15,7 +15,7 @@
 ConfigContainer:
   model:
     cross_entropy_fusion_impl: "te"
-    use_transformer_engine_op_fuser: true
+    use_transformer_engine_op_fuser: false
 
   train:
     train_iters: 25
@@ -64,15 +64,15 @@ perf_matrix:
         etp: null
       bf16:
         cp: 2
-        fsdp: false
+        use_megatron_fsdp: false
       fp8_ds:
         cp: 1
-        fsdp: true
+        use_megatron_fsdp: true
         keep_fp8_transpose_cache_when_using_custom_fsdp: true
         nccl_ub: true
       fp8_cs:
         cp: 1
-        fsdp: true
+        use_megatron_fsdp: true
         keep_fp8_transpose_cache_when_using_custom_fsdp: true
         nccl_ub: true
   b200:
@@ -89,7 +89,7 @@ perf_matrix:
         vp: null
         ep: 1
         etp: null
-        fsdp: false
+        use_megatron_fsdp: false
       bf16:
       fp8_ds:
       fp8_cs:
@@ -108,7 +108,7 @@ perf_matrix:
         vp: null
         ep: 1
         etp: null
-        fsdp: false
+        use_megatron_fsdp: false
       bf16:
       fp8_ds:
       fp8_cs:
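
Because the key is renamed rather than aliased, a local YAML still carrying the old `fsdp` key would silently stop matching. A tiny backward-compat shim — purely hypothetical, this commit adds no such shim — would look like:

```python
# Hypothetical compatibility shim for YAMLs still using the old `fsdp` key.
def normalize_entry(entry: dict) -> dict:
    entry = dict(entry)  # don't mutate the caller's dict
    if "fsdp" in entry and "use_megatron_fsdp" not in entry:
        entry["use_megatron_fsdp"] = entry.pop("fsdp")
    return entry

assert normalize_entry({"fsdp": True}) == {"use_megatron_fsdp": True}
```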

scripts/performance/configs/llama3/llama31_405b_llm_pretrain.yaml renamed to scripts/performance/configs/llama31/llama31_405b_llm_pretrain.yaml

Lines changed: 10 additions & 13 deletions
@@ -19,9 +19,8 @@
 ConfigContainer:
   model:
     cross_entropy_fusion_impl: "te"
-    enable_cuda_graph: false
-    use_te_rng_tracker: false
-    use_transformer_engine_op_fuser: true
+    use_transformer_engine_op_fuser: false
+    seq_length: 8192
 
   train:
     train_iters: 25
@@ -40,8 +39,6 @@ ConfigContainer:
   ddp:
     check_for_nan_in_grad: false
     check_for_large_grads: false
-    use_megatron_fsdp: true
-    use_torch_fsdp2: false
 
   mixed_precision:
     grad_reduce_in_fp32: false
@@ -71,7 +68,7 @@ perf_matrix:
         vp: 8
         ep: 1
         etp: null
-        fsdp: false
+        use_megatron_fsdp: false
       bf16:
       fp8_ds:
       fp8_cs:
@@ -89,7 +86,7 @@ perf_matrix:
         vp: 8
         ep: 1
         etp: null
-        fsdp: false
+        use_megatron_fsdp: false
       bf16:
       fp8_ds:
       fp8_cs:
@@ -109,24 +106,24 @@ perf_matrix:
         pp: 8
         cp: 2
         vp: 8
-        fsdp: false
+        use_megatron_fsdp: false
       fp8_ds:
         tp: 2
         pp: 1
         cp: 1
-        vp: 1
-        fsdp: true
+        vp: null
+        use_megatron_fsdp: true
         cpu_offloading_num_layers: 95
       fp8_cs:
         tp: 2
         pp: 1
         cp: 1
-        vp: 1
-        fsdp: true
+        vp: null
+        use_megatron_fsdp: true
         cpu_offloading_num_layers: 95
       fp8_mx:
         tp: 4
         pp: 8
         cp: 2
         vp: 8
-        fsdp: false
+        use_megatron_fsdp: false
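
The fp8_ds/fp8_cs entries set `cpu_offloading_num_layers: 95`. Assuming the published 126-layer depth of Llama 3.1 405B (an assumption; the diff itself does not state the layer count), that offloads roughly three quarters of the transformer stack:

```python
# Back-of-envelope check of the CPU-offload fraction. The 126-layer depth is
# the published Llama 3.1 405B figure, assumed here rather than read from
# this diff.
num_layers = 126
cpu_offloading_num_layers = 95  # from the fp8_ds / fp8_cs entries above

print(f"{cpu_offloading_num_layers / num_layers:.1%} of layers offloaded")  # 75.4%
```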
Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ConfigContainer:
+  model:
+    cross_entropy_fusion_impl: "te"
+    bias_activation_fusion: true
+    recompute_granularity: null
+    recompute_method: null
+    recompute_num_layers: null
+    moe_router_fusion: true
+
+  train:
+    train_iters: 25
+    eval_iters: 0
+
+  rerun_state_machine:
+    check_for_nan_in_loss: false
+
+  checkpoint:
+    # Directory to save to. If null, no checkpoint will be saved.
+    save: null
+
+  logger:
+    log_interval: 1
+
+  ddp:
+    check_for_nan_in_grad: false
+    check_for_large_grads: false
+
+  mixed_precision:
+    grad_reduce_in_fp32: false
+
+  profiling:
+    # For optional fields in the config, specify the target to instantiate the object.
+    _target_: megatron.bridge.training.config.ProfilingConfig
+    use_nsys_profiler: true
+    profile_step_start: 5
+    profile_step_end: 6
+    profile_ranks: [0]
+    record_shapes: false
+    use_pytorch_profiler: false
+
+perf_matrix:
+  h100:
+    num_gpus_256:
+      common:
+        num_gpus_per_node: 8
+        seq_length: 4096
+        mbs: 1
+        gbs: 2048
+        cuda_graphs: false
+        tp: 2
+        pp: 8
+        cp: 1
+        vp: 4
+        ep: 32
+        etp: 1
+        fsdp: false
+      bf16:
+      fp8_cs:
+  gb200:
+    num_gpus_64:
+      common:
+        num_gpus_per_node: 4
+        seq_length: 4096
+        mbs: 1
+        gbs: 1024
+        cuda_graphs: true
+        tp: 2
+        pp: 1
+        cp: 1
+        vp: null
+        ep: 64
+        etp: 1
+        fsdp: false
+      bf16:
+      fp8_cs:
+      fp8_mx:
+  b200:
+    num_gpus_64:
+      common:
+        num_gpus_per_node: 4
+        seq_length: 4096
+        mbs: 1
+        gbs: 1024
+        cuda_graphs: false
+        tp: 1
+        pp: 8
+        cp: 1
+        vp: 12
+        ep: 8
+        etp: 1
+        fsdp: false
+      bf16:
+      fp8_cs:
+      fp8_mx:
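
The h100 `num_gpus_256` entry pins `mbs`/`gbs` together with the parallelism layout, which fixes the gradient-accumulation depth. Using the standard Megatron relation dp = num_gpus / (tp × pp × cp) — expert parallelism shards the MoE experts inside the data-parallel group, so `ep` does not enter this ratio — a quick sanity check:

```python
# Sanity arithmetic for the h100 / num_gpus_256 entry above.
num_gpus, tp, pp, cp = 256, 2, 8, 1
mbs, gbs = 1, 2048

assert num_gpus % (tp * pp * cp) == 0
dp = num_gpus // (tp * pp * cp)      # 256 / 16 = 16 data-parallel replicas

assert gbs % (mbs * dp) == 0
grad_accum = gbs // (mbs * dp)       # 2048 / 16 = 128 micro-batches per step

print(dp, grad_accum)                # 16 128
```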
