pytorch · yiliu30 · Aug 22, 2025 · yiliu30 · Aug 22, 2025 · jerryzh168
diff --git a/torchao/prototype/moe_quant/llama4_quant.py b/torchao/prototype/moe_quant/llama4_quant.py
@@ -58,7 +58,10 @@ def convert_fn(module):
 
 
 model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-model = Llama4ForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+
+dtype = torch.bfloat16
+torch.set_default_dtype(dtype)
+model = Llama4ForCausalLM.from_pretrained(model_id, torch_dtype=dtype)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 _replace_with_custom_fn_if_matches_filter(

diff --git a/torchao/prototype/moe_quant/quantizable_moe_modules.py b/torchao/prototype/moe_quant/quantizable_moe_modules.py
@@ -30,7 +30,7 @@ def __init__(
     def forward(self, x: Tensor) -> Tensor:
         batch_size = x.shape[0]
         x = x.view(-1, self.hidden_dim)  # x: [T, D]
-        scores = self.router(x)  # [T, E]
+        scores = self.router(x)[0]  # [T, E]
 router = module.router 
 up_proj = module.experts.gate_up_proj 
 w1, w3 = up_proj.permute(0, 2, 1).chunk(2, dim=1) 
 w2 = module.experts.down_proj.permute(0, 2, 1) 
 new_mod.router = router 
 router = module.router 
 up_proj = module.experts.gate_up_proj 
 w1, w3 = up_proj.permute(0, 2, 1).chunk(2, dim=1) 
 w2 = module.experts.down_proj.permute(0, 2, 1) 
  
 new_mod.router = router 
         scores = F.softmax(scores, dim=-1)
         scores, expert_indices = torch.topk(
             scores, self.top_k, dim=-1