34 changes: 34 additions & 0 deletions examples/custom_parallel.py
@@ -26,3 +26,37 @@
"model.layers.*.mlp.down_proj": RowwiseParallel(),
"lm_head": ColwiseParallel(output_layouts=Shard(-1), use_local_output=False),
}

"""
Note on numerical stability:

- Default plans that keep attention output proj and mlp downproj RowwiseParallel are numerically
unstable and tend to increase with larger TP (e.g., TP >= 4).

Enable this custom plan via:

- policy.dtensor_cfg.custom_parallel_plan=examples.custom_parallel.qwen_model_tp_plan_stable

Based on https://github.com/NVIDIA-NeMo/Automodel/blob/d79ccb94b0eca94a4c479313db2f9eee80db0139/nemo_automodel/components/distributed/optimized_tp_plans.py#L205-L217
"""
qwen_model_tp_plan_stable = {
    "lm_head": ColwiseParallel(output_layouts=Shard(-1), use_local_output=False),
    "model.embed_tokens": RowwiseParallel(
        input_layouts=Replicate(),
    ),
    "model.layers.*.self_attn.q_proj": ColwiseParallel(),
    "model.layers.*.self_attn.k_proj": ColwiseParallel(),
    "model.layers.*.self_attn.v_proj": ColwiseParallel(),
    # Colwise with Shard(-1) inputs: the sharded attention output is all-gathered
    # rather than all-reduced, avoiding TP-size-dependent partial-sum rounding.
    "model.layers.*.self_attn.o_proj": ColwiseParallel(
        input_layouts=Shard(-1),
        output_layouts=Replicate(),
        use_local_output=True,
    ),
    "model.layers.*.mlp.up_proj": ColwiseParallel(),
    "model.layers.*.mlp.gate_proj": ColwiseParallel(),
    # Same treatment for the MLP down projection.
    "model.layers.*.mlp.down_proj": ColwiseParallel(
        input_layouts=Shard(-1),
        output_layouts=Replicate(),
        use_local_output=True,
    ),
}
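
For reference, here is a minimal sketch of how a plan like this is applied with PyTorch's tensor-parallel API. The helper name and TP degree are illustrative assumptions, not part of this file; `parallelize_module` matches the `model.layers.*` wildcard FQNs against the module tree.

```python
# Sketch only: assumes torch.distributed is already initialized and that
# `model` is a Qwen-style HF transformer whose FQNs match the plan keys.
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import parallelize_module


def apply_stable_tp_plan(model, tp_size: int = 4):  # hypothetical helper
    # One-dimensional device mesh over the tensor-parallel ranks.
    tp_mesh = init_device_mesh("cuda", (tp_size,), mesh_dim_names=("tp",))
    # parallelize_module walks the module tree, matches each "model.layers.*"
    # wildcard key, and applies the corresponding ParallelStyle in place.
    return parallelize_module(model, tp_mesh, qwen_model_tp_plan_stable)
```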
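
The instability the docstring describes comes from order-dependent floating-point accumulation: an all-reduce sums one partial result per rank, so the split of the reduction changes with the TP degree. A toy, single-process illustration of that effect (the chunk count stands in for the TP size; nothing here uses the plan above):

```python
import torch

torch.manual_seed(0)
x = torch.randn(1 << 20, dtype=torch.float32)

for tp in (1, 2, 4, 8):
    # Emulate a TP all-reduce: each "rank" sums its chunk, then the partial
    # sums are combined. Different chunkings round differently in fp32.
    partial = torch.stack([c.sum() for c in x.chunk(tp)]).sum()
    print(f"tp={tp}: sum={partial.item():.10f}")
```

The printed sums typically differ in the low-order bits across chunk counts; in real training those bit-level differences feed back through the network and can compound, which is the drift the docstring warns about at TP >= 4.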