@@ -513,14 +513,38 @@ def apply_compile(model: nn.Module, compile_config: CompileConfig):
     # torch._dynamo.config.capture_scalar_outputs = True
     for layer_id, transformer_block in model.layers.named_children():
         # TODO: remove when torch.compile supports fullgraph=True for MoE
-        fullgraph = True
         if transformer_block.moe_enabled:
-            fullgraph = False
-        transformer_block = torch.compile(
-            transformer_block,
-            backend=compile_config.backend,
-            fullgraph=fullgraph,
-        )
+            transformer_block.moe.experts = torch.compile(
+                transformer_block.moe.experts,
+                backend=compile_config.backend,
+                fullgraph=True,
+            )
+            transformer_block.moe.shared_experts = torch.compile(
+                transformer_block.moe.shared_experts,
+                backend=compile_config.backend,
+                fullgraph=True,
+            )
+            # transformer_block.attention = torch.compile(
+            #     transformer_block.attention,
+            #     backend=compile_config.backend,
+            #     fullgraph=True,
+            # )
+            # transformer_block.attention_norm = torch.compile(
+            #     transformer_block.attention_norm,
+            #     backend=compile_config.backend,
+            #     fullgraph=True,
+            # )
+            # transformer_block.ffn_norm = torch.compile(
+            #     transformer_block.ffn_norm,
+            #     backend=compile_config.backend,
+            #     fullgraph=True,
+            # )
+        else:
+            transformer_block = torch.compile(
+                transformer_block,
+                backend=compile_config.backend,
+                fullgraph=True,
+            )
         model.layers.register_module(layer_id, transformer_block)

     logger.info("Compiling each TransformerBlock with torch.compile")
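A minimal standalone sketch of the pattern this hunk applies: when a parent block contains data-dependent MoE routing that breaks fullgraph=True, compile the dense submodules individually and leave the routing in eager mode. The ToyExperts/ToyMoEBlock classes below are illustrative stand-ins, not the repo's actual TransformerBlock or MoE classes.

# Sketch only: compile routing-free submodules with fullgraph=True,
# keep the parent forward (with its data-dependent argmax routing) eager.
import torch
import torch.nn as nn


class ToyExperts(nn.Module):
    def __init__(self, dim: int, num_experts: int):
        super().__init__()
        self.w = nn.Parameter(torch.randn(num_experts, dim, dim) * 0.02)

    def forward(self, x: torch.Tensor, expert_idx: torch.Tensor) -> torch.Tensor:
        # Gather one expert weight matrix per token and apply it.
        return torch.einsum("td,tdo->to", x, self.w[expert_idx])


class ToyMoEBlock(nn.Module):
    def __init__(self, dim: int = 16, num_experts: int = 4):
        super().__init__()
        self.router = nn.Linear(dim, num_experts)
        self.experts = ToyExperts(dim, num_experts)
        self.shared_experts = nn.Sequential(
            nn.Linear(dim, dim), nn.SiLU(), nn.Linear(dim, dim)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        expert_idx = self.router(x).argmax(dim=-1)  # data-dependent routing
        return self.experts(x, expert_idx) + self.shared_experts(x)


block = ToyMoEBlock()
# Compile the submodules separately, each with a full graph;
# the routing in ToyMoEBlock.forward stays uncompiled.
block.experts = torch.compile(block.experts, fullgraph=True)
block.shared_experts = torch.compile(block.shared_experts, fullgraph=True)

out = block(torch.randn(8, 16))
print(out.shape)  # torch.Size([8, 16])

In the diff above, the same idea is applied per transformer block: MoE-enabled blocks get their experts and shared_experts compiled separately (with attention and norm compilation left commented out for now), while non-MoE blocks are still compiled whole with fullgraph=True.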