Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
179f28f
Add Llama4 Text Model
Jul 6, 2025
5518c12
bugfix
Jul 14, 2025
7b733c5
bugfixes - temperature, moe, rms
Jul 21, 2025
4f05848
bugfix gen_config
Jul 21, 2025
0f7d3a1
convert_weight works; tensor parallel bug in model compilation stage
Jul 21, 2025
dc1c4c0
bugfix gen config
Jul 21, 2025
c4f780b
bugfix convert weight
Jul 21, 2025
e4aa23d
avoid low level TIR error when topk = 1
Jul 22, 2025
8943447
explicitly define vocab_size
Jul 22, 2025
e056361
bugfix weight loading
Jul 24, 2025
c70b280
bugfix moe_sum; inference runs but gibberish
Jul 28, 2025
f824c34
text config
MasterJH5574 Jul 28, 2025
b137fbf
bugfix rope
giterator Aug 11, 2025
b2fb219
bugfix qk norm
giterator Aug 11, 2025
c0b0a56
reimplement moe, tp; still need to test
giterator Aug 17, 2025
f87ebc5
fixed TP
giterator Aug 17, 2025
d95ee78
moe activations differ
giterator Aug 22, 2025
ef397cb
custom rope for llama4
giterator Sep 9, 2025
4bf8134
updated conv template - need to debug
giterator Sep 10, 2025
b29ad59
bugfix moe
giterator Sep 12, 2025
e783505
Remove dead code, prints
giterator Sep 12, 2025
b39ff28
cleanup conv template
giterator Sep 12, 2025
46bd618
remove comments
giterator Sep 13, 2025
9531dcc
added TODO
giterator Sep 13, 2025
6c06b25
format
giterator Sep 13, 2025
bb21b40
fixed var naming
giterator Sep 17, 2025
d4b34a7
fixed lint
giterator Sep 17, 2025
02a2745
corrected 3rd party
giterator Sep 17, 2025
4da684a
removed unused vars
giterator Sep 17, 2025
6469b7d
lint changes
giterator Sep 17, 2025
272e65a
black CI
giterator Sep 17, 2025
0e76219
disable too many locals
giterator Sep 17, 2025
dc9f4ca
black CI
giterator Sep 17, 2025
580ae60
remove awq
giterator Sep 17, 2025
623f83f
black
giterator Sep 17, 2025
1737772
remove awq import
giterator Sep 17, 2025
4f1818a
remove awq import
giterator Sep 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions python/mlc_llm/conversation_template/llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,27 @@

from .registry import ConvTemplateRegistry

# Llama4 - same as Llama3.1 except naming has changed slightly
# The Llama-4 chat template mirrors the Llama-3.1 structure; only the names
# of the special tokens changed (<|header_start|>/<|header_end|>/<|eot|>).
_LLAMA4_TEMPLATE = Conversation(
    name="llama-4",
    # Llama-4 does not wrap the system prompt in a dedicated template.
    system_template="",
    system_message="",
    roles={
        "user": "<|header_start|>user",
        "assistant": "<|header_start|>assistant",
        "tool": "<|header_start|>ipython",
    },
    seps=["<|eot|>"],
    role_content_sep="<|header_end|>\n\n",
    role_empty_sep="<|header_end|>\n\n",
    stop_str=[],
    # "<|end_of_text|>", "<|eom|>", "<|eot|>"
    stop_token_ids=[200001, 200007, 200008],
    # "<|begin_of_text|>"
    system_prefix_token_ids=[200000],
    add_role_after_system_message=False,
)
ConvTemplateRegistry.register_conv_template(_LLAMA4_TEMPLATE)

# Llama3.1 -- same as Llama3 except stop token ids and stop str
ConvTemplateRegistry.register_conv_template(
Conversation(
Expand Down
1 change: 1 addition & 0 deletions python/mlc_llm/interface/gen_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ def gen_config( # pylint: disable=too-many-locals,too-many-arguments,too-many-b
# FIXME: Copy RWKV tokenizer file # pylint: disable=fixme

CONV_TEMPLATES = {
"llama-4",
"llama-3",
"llama-3_1",
"chatml",
Expand Down
Empty file.
119 changes: 119 additions & 0 deletions python/mlc_llm/model/llama4/llama4_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
"""
This file specifies how MLC's Llama parameter maps from other formats, for example HuggingFace
PyTorch, HuggingFace safetensors.
"""

import functools

import numpy as np

from mlc_llm.loader import ExternMapping
from mlc_llm.quantization import Quantization

from .llama4_model import Llama4Config, Llama4ForCausalLM


def huggingface(model_config: Llama4Config, quantization: Quantization) -> ExternMapping:
    """Returns a parameter mapping that maps from the names of MLC LLM parameters to
    the names of HuggingFace PyTorch parameters.

    Parameters
    ----------
    model_config : Llama4Config
        The configuration of the Llama model.

    quantization : Quantization
        The quantization configuration.

    Returns
    -------
    param_map : ExternMapping
        The parameter mapping from MLC to HuggingFace PyTorch.
    """
    model = Llama4ForCausalLM(model_config)
    if quantization is not None:
        model.to(quantization.model_dtype)
    _, _named_params, _ = model.export_tvm(  # type: ignore[misc]
        spec=model.get_default_spec(),
        allow_extern=True,
    )
    named_parameters = dict(_named_params)

    mapping = ExternMapping()

    def _add_identity(mlc_name: str, hf_name: str) -> None:
        # Map a single HF tensor to one MLC parameter, applying only a cast
        # to the MLC parameter's dtype. `functools.partial` binds the dtype
        # eagerly, so each mapping keeps its own dtype.
        mapping.add_mapping(
            mlc_name,
            [hf_name],
            functools.partial(
                lambda x, dtype: x.astype(dtype),
                dtype=named_parameters[mlc_name].dtype,
            ),
        )

    for i in range(model_config.text_config.num_hidden_layers):
        # Shared expert: HF stores gate_proj and up_proj separately, while the
        # MLC model fuses them into a single gate_up_proj matrix, so the two
        # HF tensors are concatenated along axis 0.
        mlp = f"model.layers.{i}.feed_forward.shared_expert"
        mlc_name = f"{mlp}.gate_up_proj.weight"
        mlc_param = named_parameters[mlc_name]
        mapping.add_mapping(
            mlc_name,
            [
                f"language_model.{mlp}.gate_proj.weight",
                f"language_model.{mlp}.up_proj.weight",
            ],
            functools.partial(
                lambda gate, up, dtype: np.concatenate([gate, up], axis=0).astype(dtype),
                dtype=mlc_param.dtype,
            ),
        )

        moe = f"model.layers.{i}.feed_forward"
        # Router weight. NOTE: the doubled "router.router" in the MLC name is
        # intentional — it matches the module nesting in the MLC model.
        _add_identity(f"{moe}.router.router.weight", f"language_model.{moe}.router.weight")
        # Routed experts: gate_up and down projections are already fused
        # per-expert tensors in the HF checkpoint; copy them through directly.
        _add_identity(f"{moe}.experts.gate_up_proj", f"language_model.{moe}.experts.gate_up_proj")
        _add_identity(f"{moe}.experts.down_proj", f"language_model.{moe}.experts.down_proj")

    # Every remaining parameter maps 1:1 to the HF tensor of the same name
    # under the "language_model." prefix.
    for mlc_name in named_parameters:
        if mlc_name not in mapping.param_map:
            _add_identity(mlc_name, f"language_model.{mlc_name}")
    return mapping
Loading
Loading