@@ -30,6 +30,7 @@ class MXConverter(ModelConverter):
     enabled: bool
     filter_fqns: List[str]
     mx_config: Any  # MXLinearConfig type when imported
+    mxfp8_token_group_alignment_size = 32

     def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
         # Ensure minimum torchao versions
@@ -39,8 +40,8 @@ def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
         )
         torchao_version = version("torchao")

-        # Last torchao release was 0.12.0, so nightly build starts with 0.13.0+git...
-        is_nightly_build = torchao_version.startswith("0.13.0")
+        # Last torchao release was 0.13.0, so nightly build starts with 0.14.0+git...
+        is_nightly_build = torchao_version.startswith("0.14.0")
         if not is_nightly_build:
             raise ImportError(
                 f"torchao version {torchao_version} is too old, please install torchao nightly build and try again"
@@ -52,7 +53,6 @@ def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
         ), "MXFP8 is only supported on SM100 or newer architectures"

         # TP not yet supported with torch.compile
-
         model_compile_enabled = (
             job_config.compile.enable and "model" in job_config.compile.components
         )
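The assertion above gates MXFP8 on SM100-class (Blackwell) GPUs. A minimal standalone sketch of that capability check, assuming the standard torch.cuda API and that SM100 reports compute capability (10, 0); the exact assert lives outside this hunk:

import torch

def has_mxfp8_capable_gpu() -> bool:
    # SM100 (Blackwell) reports CUDA compute capability (10, 0); older GPUs return a lower tuple.
    return torch.cuda.is_available() and torch.cuda.get_device_capability() >= (10, 0)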
@@ -61,10 +61,12 @@ def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
         ), "TP not yet supported with torch.compile for mxfp8"

         # For MoE training with mxfp8, token group sizes must be multiples of 32
-        if job_config.mx.moe_fqns_prototype:
-            mxfp8_block_size = 32
-            set_token_group_alignment_size_m(mxfp8_block_size)
-            logger.info(f"Setting token group alignment size to {mxfp8_block_size}")
+        self.moe_fqns = job_config.mx.moe_fqns_prototype
+        if self.moe_fqns:
+            logger.info(
+                f"Setting token group alignment size to {self.mxfp8_token_group_alignment_size}"
+            )
+            set_token_group_alignment_size_m(self.mxfp8_token_group_alignment_size)

         # Configure MXFP8
         from torchao.prototype.mx_formats.config import (
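The multiple-of-32 requirement comes from MX block scaling: one scale is computed per 32-element block, so each expert's token group has to end on a 32-token boundary. A toy illustration of the rounding this alignment implies; a sketch only, not torchao's set_token_group_alignment_size_m, and align_token_group is a made-up helper:

def align_token_group(num_tokens: int, alignment: int = 32) -> int:
    """Round a per-expert token count up to the next multiple of `alignment`."""
    return ((num_tokens + alignment - 1) // alignment) * alignment

assert align_token_group(1) == 32
assert align_token_group(32) == 32
assert align_token_group(33) == 64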
@@ -94,6 +96,13 @@ def convert(self, model: nn.Module):
         from torchao.prototype.mx_formats.config import MXLinearConfig
         from torchao.quantization import quantize_

+        # MoE conversion must take place before MXLinear conversion, otherwise the MXLinear layers
+        # will be converted back to nn.Linear:
+        # https://github.com/pytorch/ao/blob/c2a6568a04075acc371a338206216bb65536fb27/torchao/quantization/quant_api.py#L294-L299
+        # TODO: add warning in torchao when this happens, or find a better way to avoid this.
+        if self.moe_fqns:
+            self._convert_moe_layers(model)
+
         assert isinstance(self.config, MXLinearConfig)
         quantize_(
             model,
@@ -102,6 +111,36 @@ def convert(self, model: nn.Module):
         )
         logger.info("Swapped to MXLinear layers")

+    def _convert_moe_layers(self, model: nn.Module):
+        """
+        Mutates the model in place, replacing target nn.Parameter instances with ScaledGroupedMMTensor,
+        to perform dynamic MXFP8 quantization + scaled grouped GEMMs for the target MoE FQNs.
+        """
+        from torchao.quantization.quant_api import quantize_
+
+        try:
+            from torchao.prototype.moe_training.conversion_utils import (
+                MoEScalingType,
+                MoETrainingConfig,
+            )
+        except ImportError as e:
+            raise ImportError(
+                "torchao installation does not have MoE training support. Please install torchao nightly build."
+            ) from e
+
+        def moe_module_filter_fn(mod: nn.Module, cur_fqn: str) -> bool:
+            for target_fqn in self.moe_fqns:
+                if target_fqn in cur_fqn:
+                    return True
+            return False
+
+        config = MoETrainingConfig(scaling_type=MoEScalingType.MXFP8)
+        quantize_(model, config=config, filter_fn=moe_module_filter_fn)
+        logger.info(
+            f"Converted MoE layers matching FQNs {self.moe_fqns} "
+            "to use dynamic MXFP8 quantization with scaled grouped GEMMs"
+        )
+
     def post_optimizer_hook(self, model: nn.Module | list[nn.Module]):
         """
         MXFP8 doesn't require any post-optimizer hooks at the moment
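moe_module_filter_fn selects modules by substring match against their fully qualified names, so a single configured fragment (e.g. "experts") covers every expert module underneath it. A self-contained sketch of that matching rule, with invented module names for illustration:

moe_fqns = ["experts"]

def matches(cur_fqn: str) -> bool:
    # Same rule as moe_module_filter_fn above: convert when any target FQN
    # fragment appears anywhere in the module's fully qualified name.
    return any(target_fqn in cur_fqn for target_fqn in moe_fqns)

assert matches("layers.0.moe.experts")        # converted to scaled grouped GEMMs
assert not matches("layers.0.attention.wq")   # left for the later MXLinear swap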