
Commit 67d63d0

[JAX] Support for checkpointing quantizations (#2356)
* Support for checkpointing quantizations
* Add jaxpr test for quant checkpoint name
* Revert "Support for checkpointing quantizations" (reverts commit f7b7849)
* Checkpoint quantizations
* lint
* revert other files
* move checkpointing to VJPs
* fix ci failure

Signed-off-by: Jeremy Berchtold <[email protected]>
Signed-off-by: JAX Toolbox <[email protected]>
Co-authored-by: JAX Toolbox <[email protected]>
1 parent 9440b76 commit 67d63d0
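
The commit threads an optional checkpoint name from the Flax modules down to the quantizers, so quantized tensors saved for the backward pass are tagged with `jax.ad_checkpoint.checkpoint_name`; that tag is what JAX rematerialization policies match on. Below is a minimal usage sketch, not part of this commit: it mirrors the test setup further down and assumes a supported recipe (here `DelayedScaling` from `transformer_engine.common.recipe`) plus the standard `jax.checkpoint` / `jax.checkpoint_policies.save_only_these_names` APIs.

```python
from functools import partial

import jax
import jax.numpy as jnp

import transformer_engine.jax as te
import transformer_engine.jax.flax as te_flax
from transformer_engine.common.recipe import DelayedScaling  # any supported recipe works

recipe = DelayedScaling()
x = jax.random.normal(jax.random.PRNGKey(0), (128, 128), dtype=jnp.bfloat16)
rngs = {"sr_rng": jax.random.PRNGKey(1), "dropout": jax.random.PRNGKey(2)}

with te.autocast(enabled=True, recipe=recipe, mesh_resource=te.MeshResource()):
    # quantization_checkpoint_name is the new knob added by this commit.
    model = te_flax.LayerNormMLP(
        layernorm_type="rmsnorm",
        dtype=jnp.bfloat16,
        quantization_checkpoint_name="quantization",
    )
    var_collect = model.init({"params": jax.random.PRNGKey(3), **rngs}, x)

    # Save only the tagged quantized tensors across the remat boundary;
    # everything else inside the region is recomputed in the backward pass.
    @partial(
        jax.checkpoint,
        policy=jax.checkpoint_policies.save_only_these_names("quantization"),
    )
    def loss_fn(x):
        return model.apply(var_collect, x, rngs=rngs).sum()

    loss, grads = jax.value_and_grad(loss_fn)(x)
```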

File tree: 7 files changed (+252, -62 lines)

tests/jax/test_recipe_characteristics.py

Lines changed: 84 additions & 37 deletions

@@ -263,23 +263,16 @@ def test_autocast_nvfp4_block_scaling(self):
 class TestJaxprAndHlo:
     """Tests to verify Jaxpr and/or HLO of compiled modules apply expected recipe functionality and optimizations."""
 
-    @pytest_parametrize_wrapper(
-        "quantization_recipe",
-        [
-            quantization_recipe
-            for quantization_recipe in SUPPORTED_RECIPES
-            if isinstance(quantization_recipe, NVFP4BlockScaling)
-        ],
-    )
-    def test_layernorm_mlp_reuses_amax_nvfp4(self, quantization_recipe):
-        """Tests that layernorm_mlp reuses the amax computed in layernorm and the activation and does not recompute it during quantizaton."""
-
+    def _generate_jaxpr_for_layernorm_mlp_fwd_bwd(self, quantization_recipe, ln_mlp_kwargs=None):
+        """Generates the jaxpr for a forward and backward pass of LayerNormMLP under the given quantization recipe."""
+        ln_mlp_kwargs = ln_mlp_kwargs or {}
         with te.autocast(enabled=True, recipe=quantization_recipe, mesh_resource=te.MeshResource()):
             model = te_flax.LayerNormMLP(
                 layernorm_type="rmsnorm",
                 return_layernorm_output=False,
                 intermediate_dropout_rate=0.0,
                 dtype=jnp.bfloat16,
+                **ln_mlp_kwargs,
             )
 
             var_collect = model.init(
@@ -292,29 +285,83 @@ def loss_fn(x, rngs):
 
         x = jax.random.normal(jax.random.PRNGKey(0), (128, 128), dtype=jnp.bfloat16)
         rngs = {"sr_rng": jax.random.PRNGKey(1), "dropout": jax.random.PRNGKey(2)}
-        jaxpr = jax.make_jaxpr(jax.value_and_grad(loss_fn))(x, rngs=rngs)
-
-        rht_amax_eqns = [
-            eqn for eqn in jaxpr.jaxpr.eqns if eqn.primitive.name == "te_rht_amax_ffi_wrapper"
-        ]
-
-        assert len(rht_amax_eqns) == 4, f"Expected 4 rht_amax_eqns, got {len(rht_amax_eqns)}"
-
-        def assert_param(index, tensor_name, expected_value: bool):
-            if expected_value:
-                assert rht_amax_eqns[index].params["produce_regular_amax"] == True, (
-                    f"Expected produce_regular_amax for {tensor_name} to be True, indicating no"
-                    " reuse of amax as this tensor does not have a previous operation to fuse"
-                    " with"
-                )
-            else:
-                assert rht_amax_eqns[index].params["produce_regular_amax"] == False, (
-                    f"Expected produce_regular_amax for {tensor_name} to be False, indicating"
-                    " reuse of amax"
-                )
-
-        assert_param(0, "fwd ln+q", False)
-        assert_param(1, "fwd act+q", False)
-        # No previous op before incoming dgrad in the backward so amax is not reused
-        assert_param(2, "bwd dgrad", True)
-        assert_param(3, "bwd dact+q", False)
+        return jax.make_jaxpr(jax.value_and_grad(loss_fn))(x, rngs=rngs)
+
+    @pytest_parametrize_wrapper(
+        "quantization_recipe",
+        [
+            quantization_recipe
+            for quantization_recipe in SUPPORTED_RECIPES
+            if isinstance(quantization_recipe, NVFP4BlockScaling)
+        ],
+    )
+    def test_layernorm_mlp_reuses_amax_nvfp4(self, quantization_recipe):
+        """Tests that layernorm_mlp reuses the amax computed in layernorm and the activation and does not recompute it during quantizaton."""
+
+        jaxpr = self._generate_jaxpr_for_layernorm_mlp_fwd_bwd(quantization_recipe)
+
+        rht_amax_eqns = [
+            eqn for eqn in jaxpr.jaxpr.eqns if eqn.primitive.name == "te_rht_amax_ffi_wrapper"
+        ]
+
+        assert len(rht_amax_eqns) == 4, f"Expected 4 rht_amax_eqns, got {len(rht_amax_eqns)}"
+
+        def assert_param(index, tensor_name, expected_value: bool):
+            if expected_value:
+                assert rht_amax_eqns[index].params["produce_regular_amax"] == True, (
+                    f"Expected produce_regular_amax for {tensor_name} to be True, indicating no"
+                    " reuse of amax as this tensor does not have a previous operation to fuse"
+                    " with"
+                )
+            else:
+                assert rht_amax_eqns[index].params["produce_regular_amax"] == False, (
+                    f"Expected produce_regular_amax for {tensor_name} to be False, indicating"
+                    " reuse of amax"
+                )
+
+        assert_param(0, "fwd ln+q", False)
+        assert_param(1, "fwd act+q", False)
+        # No previous op before incoming dgrad in the backward so amax is not reused
+        assert_param(2, "bwd dgrad", True)
+        assert_param(3, "bwd dact+q", False)
+
+    @pytest_parametrize_wrapper("quantization_recipe", SUPPORTED_RECIPES)
+    @pytest_parametrize_wrapper(
+        "quantization_checkpoint_name",
+        [None, "quantization", "some_arbitrary_user_checkpoint_name"],
+    )
+    def test_recipe_supports_quantization_checkpointing(
+        self, quantization_recipe, quantization_checkpoint_name
+    ):
+        """Tests that all supported quantization recipes correctly use checkpoint_name."""
+
+        kwargs = {
+            "quantization_checkpoint_name": quantization_checkpoint_name,
+        }
+        jaxpr = self._generate_jaxpr_for_layernorm_mlp_fwd_bwd(quantization_recipe, kwargs)
+
+        checkpoint_name_eqns = [
+            eqn
+            for eqn in jaxpr.jaxpr.eqns
+            if eqn.primitive.name == "name" and eqn.params["name"] == quantization_checkpoint_name
+        ]
+
+        if quantization_checkpoint_name is None:
+            assert len(checkpoint_name_eqns) == 0, (
+                "Expected 0 checkpoint_name eqns when quantization_checkpoint_name is None, got"
+                f" {len(checkpoint_name_eqns)}"
+            )
+            return
+
+        # 12 checkpointed values:
+        # - Fwd pass:
+        #   - Input RMSNorm+Q -> 3 possible output tensors that will be used in the backward
+        #   - Kernel Q -> 3 possible output tensors that will be used in the backward
+        #   - Input Activation+Q -> 3 possible output tensors that will be used in the backward
+        #   - Kernel Q -> 3 possible output tensors that will be used in the backward
+        expected_checkpoint_eqn_count = 12
+
+        assert len(checkpoint_name_eqns) == expected_checkpoint_eqn_count, (
+            f"Expected {expected_checkpoint_eqn_count} checkpoint_name eqns when"
+            f" quantization_checkpoint_name is set, got {len(checkpoint_name_eqns)}"
        )
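
For reference, the `name` primitive the new test filters on is how `jax.ad_checkpoint.checkpoint_name` tags values in a jaxpr; remat policies such as `save_only_these_names` key off the same tag. A standalone sketch of the same inspection in plain JAX (no Transformer Engine involved):

```python
import jax
import jax.numpy as jnp
from jax.ad_checkpoint import checkpoint_name


def f(x):
    # Tag an intermediate the same way the quantizers tag checkpointed tensors.
    y = checkpoint_name(jnp.sin(x), "quantization")
    return (y * x).sum()


jaxpr = jax.make_jaxpr(jax.value_and_grad(f))(jnp.ones((4, 4)))
name_eqns = [
    eqn
    for eqn in jaxpr.jaxpr.eqns
    if eqn.primitive.name == "name" and eqn.params["name"] == "quantization"
]
print(len(name_eqns))  # tagged values show up as `name` equations in the jaxpr
```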

transformer_engine/jax/dense.py

Lines changed: 9 additions & 4 deletions

@@ -19,6 +19,7 @@
 from .cpp_extensions.amax import AmaxScope
 from .quantize import (
     ScaledTensorFactory,
+    ScaledTensor,
     ScalingMode,
     QuantizeLayout,
     QuantizerSet,
@@ -227,8 +228,8 @@ def _dense_fwd_rule(
         output += jnp.reshape(bias, bias_new_shape)
 
     ctx = (
-        casted_x.get_tensor(usage=TensorUsage.LHS_TRANS),
-        casted_kernel.get_tensor(usage=TensorUsage.RHS_TRANS),
+        casted_x.get_tensor(usage=TensorUsage.LHS_TRANS).checkpoint(quantizer_set.x),
+        casted_kernel.get_tensor(usage=TensorUsage.RHS_TRANS).checkpoint(quantizer_set.kernel),
         x.shape,
         kernel.shape,
         use_bias,
@@ -529,8 +530,12 @@ def _grouped_dense_fwd_rule(
 
     ctx = (
         group_sizes,
-        ctx_x,
-        ctx_kernel,
+        ctx_x.checkpoint(quantizer_set.x) if isinstance(ctx_x, ScaledTensor) else ctx_x,
+        (
+            ctx_kernel.checkpoint(quantizer_set.kernel)
+            if isinstance(ctx_kernel, ScaledTensor)
+            else ctx_kernel
+        ),
         x.shape,
         kernel.shape,
         use_bias,
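
The `ScaledTensor.checkpoint` helper used above lives in the quantize module and is not shown in this diff. Conceptually it is a no-op when the quantizer was created without a checkpoint name, and otherwise wraps the tensor's arrays with `jax.ad_checkpoint.checkpoint_name`. A hypothetical sketch of that behavior (the class and field names below are illustrative, not the actual Transformer Engine definitions):

```python
from dataclasses import dataclass
from typing import Optional

import jax.numpy as jnp
from jax.ad_checkpoint import checkpoint_name


@dataclass
class IllustrativeScaledTensor:  # stand-in for TE's ScaledTensor
    data: jnp.ndarray
    scale_inv: jnp.ndarray

    def checkpoint(self, quantizer) -> "IllustrativeScaledTensor":
        # No tag requested: return the tensor unchanged.
        name: Optional[str] = getattr(quantizer, "checkpoint_name", None)
        if name is None:
            return self
        # Tag each array so a policy like save_only_these_names can keep it
        # as a residual instead of recomputing the quantization.
        return IllustrativeScaledTensor(
            data=checkpoint_name(self.data, name),
            scale_inv=checkpoint_name(self.scale_inv, name),
        )
```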

transformer_engine/jax/flax/module.py

Lines changed: 30 additions & 7 deletions

@@ -6,7 +6,7 @@
 """
 from functools import reduce
 import operator
-from typing import Any, Callable, Iterable, List, Sequence, Tuple, Union, NewType
+from typing import Any, Callable, Iterable, List, Sequence, Tuple, Union, NewType, Optional
 
 import numpy as np
 import jax.numpy as jnp
@@ -345,7 +345,11 @@ class TransformerEngineBase(nn.Module):  # pylint: disable=too-few-public-method
     """
 
     def generate_quantizer_set(
-        self, postfix: str = "", variable_collection: str = None, fp8_recipe=None
+        self,
+        postfix: str = "",
+        variable_collection: str = None,
+        quantization_checkpoint_name: Optional[str] = None,
+        fp8_recipe=None,
     ):
         """
         Generate a set of FP8 meta for a GEMM.
@@ -375,7 +379,9 @@ def generate_quantizer_set(
         quantize_meta_set = QuantizeMetaSet(x=x_meta, kernel=kernel_meta, grad=grad_meta)
 
         quantizer_set = QuantizerFactory.create_set(
-            fp8_recipe=fp8_recipe, quantize_meta_set=quantize_meta_set
+            fp8_recipe=fp8_recipe,
+            quantize_meta_set=quantize_meta_set,
+            checkpoint_name=quantization_checkpoint_name,
         )
         return quantizer_set
 
@@ -424,6 +430,8 @@ class DenseGeneral(TransformerEngineBase):
         The data type used to allocate the initial parameters.
     transpose_batch_sequence: bool, default = False
         Indicate whether to transpose the batch and sequence dimensions of the input tensor.
+    quantization_checkpoint_name: Optional[str], default = None
+        The name for checkpointing quantizations.
     """
 
     features: Union[Iterable[int], int]
@@ -439,6 +447,7 @@ class DenseGeneral(TransformerEngineBase):
     dtype: DType = jnp.float32
     input_axes: Tuple[str, ...] = ()
     transpose_batch_sequence: bool = False
+    quantization_checkpoint_name: Optional[str] = None
 
     def __post_init__(self):
         if self.kernel_init is None:
@@ -496,7 +505,9 @@ def __call__(self, inputs: Array) -> Array:
         else:
             bias = None
 
-        quantizer_set = self.generate_quantizer_set()
+        quantizer_set = self.generate_quantizer_set(
+            quantization_checkpoint_name=self.quantization_checkpoint_name
+        )
         contract_ind = tuple(range(0, len(axis)))
         y = dense(
             inputs,
@@ -628,6 +639,8 @@ class LayerNormDenseGeneral(TransformerEngineBase):
         value or None. When None is set, then no scaling is applied.
     transpose_batch_sequence: bool, default = False
         Indicate whether to transpose the batch and sequence dimensions of the input tensor.
+    quantization_checkpoint_name: Optional[str], default = None
+        The name for checkpointing quantizations.
     """
 
     features: Union[Iterable[int], int]
@@ -654,6 +667,7 @@ class LayerNormDenseGeneral(TransformerEngineBase):
     dot_input_axes: Tuple[str, ...] = None
    depth_scaling: float = None
     transpose_batch_sequence: bool = False
+    quantization_checkpoint_name: Optional[str] = None
 
     def __post_init__(self):
         if self.kernel_init is None:
@@ -693,7 +707,9 @@ def __call__(self, inputs: Array) -> Array:
         input_dtype = inputs.dtype
         ln_output = None
 
-        quantizer_set = self.generate_quantizer_set()
+        quantizer_set = self.generate_quantizer_set(
+            quantization_checkpoint_name=self.quantization_checkpoint_name
+        )
 
         fuse_layernorm = (
             get_quantize_config().is_fp8_enabled()
@@ -941,6 +957,8 @@ class LayerNormMLP(TransformerEngineBase):
         The data type used to allocate the initial parameters.
     transpose_batch_sequence: bool, default = False
         Indicate whether to transpose the batch and sequence dimensions of the input tensor.
+    quantization_checkpoint_name: Optional[str], default = None
+        The name for checkpointing quantizations.
     """
 
     intermediate_dim: int = 2048
@@ -976,6 +994,7 @@ class LayerNormMLP(TransformerEngineBase):
     ffn1_ckpt_name: str = "ffn1"
     ffn2_ckpt_name: str = "ffn2"
     transpose_batch_sequence: bool = False
+    quantization_checkpoint_name: Optional[str] = None
 
     def __post_init__(self):
         if self.kernel_init is None:
@@ -1010,8 +1029,12 @@ def __call__(self, inputs: Array, deterministic: bool = False) -> Array:
         """
        assert self.axis == -1, "Only support axis == -1 at this moment"
 
-        ffn1_quantizer_set = self.generate_quantizer_set("_0")
-        ffn2_quantizer_set = self.generate_quantizer_set("_1")
+        ffn1_quantizer_set = self.generate_quantizer_set(
+            "_0", quantization_checkpoint_name=self.quantization_checkpoint_name
+        )
+        ffn2_quantizer_set = self.generate_quantizer_set(
+            "_1", quantization_checkpoint_name=self.quantization_checkpoint_name
+        )
 
         input_dtype = inputs.dtype
         ln_output = None
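
Because the name is plumbed through `generate_quantizer_set`, the same attribute works on `DenseGeneral` and `LayerNormDenseGeneral` as well as `LayerNormMLP`. A brief sketch (the recipe and input shape are placeholders, not values from this commit):

```python
import jax
import jax.numpy as jnp

import transformer_engine.jax as te
import transformer_engine.jax.flax as te_flax
from transformer_engine.common.recipe import DelayedScaling  # placeholder recipe

x = jnp.ones((128, 256), dtype=jnp.bfloat16)

with te.autocast(enabled=True, recipe=DelayedScaling(), mesh_resource=te.MeshResource()):
    dense = te_flax.DenseGeneral(
        features=512,
        dtype=jnp.bfloat16,
        # Forwarded to generate_quantizer_set and, from there, to the quantizers.
        quantization_checkpoint_name="quantization",
    )
    params = dense.init(jax.random.PRNGKey(0), x)
    y = dense.apply(params, x)
```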

transformer_engine/jax/layernorm_dense.py

Lines changed: 2 additions & 2 deletions

@@ -236,8 +236,8 @@ def _layernorm_dense_fwd_rule(
         output += jnp.reshape(bias, bias_new_shape)
 
     ctx = (
-        casted_ln_out.get_tensor(TensorUsage.LHS_TRANS),
-        casted_kernel.get_tensor(TensorUsage.RHS_TRANS),
+        casted_ln_out.get_tensor(TensorUsage.LHS_TRANS).checkpoint(quantizer_set.x),
+        casted_kernel.get_tensor(TensorUsage.RHS_TRANS).checkpoint(quantizer_set.kernel),
         x.shape,
         kernel.shape,
         mu,

transformer_engine/jax/layernorm_mlp.py

Lines changed: 4 additions & 4 deletions

@@ -390,11 +390,11 @@ def _layernorm_mlp_fwd_rule(
         rsigma,
         gamma,
         beta,
-        casted_ln_out.get_tensor(TensorUsage.LHS_TRANS),
-        casted_kernel_1.get_tensor(TensorUsage.RHS_TRANS),
+        casted_ln_out.get_tensor(TensorUsage.LHS_TRANS).checkpoint(ffn1_quantizer_set.x),
+        casted_kernel_1.get_tensor(TensorUsage.RHS_TRANS).checkpoint(ffn1_quantizer_set.kernel),
         dot_1_output,
-        casted_act_out.get_tensor(TensorUsage.LHS_TRANS),
-        casted_kernel_2.get_tensor(TensorUsage.RHS_TRANS),
+        casted_act_out.get_tensor(TensorUsage.LHS_TRANS).checkpoint(ffn2_quantizer_set.x),
+        casted_kernel_2.get_tensor(TensorUsage.RHS_TRANS).checkpoint(ffn2_quantizer_set.kernel),
         x_contracting_dims,
         k_contracting_dims,
         kernel_1.shape,
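
All three forward rules follow the same pattern: the quantized residuals stored in `ctx` for the custom-VJP backward pass are tagged just before being returned, which is what the "move checkpointing to VJPs" step in the commit message refers to. A minimal `jax.custom_vjp` sketch of that pattern, with illustrative names:

```python
import jax
import jax.numpy as jnp
from jax.ad_checkpoint import checkpoint_name


@jax.custom_vjp
def toy_matmul(x, w):
    return x @ w


def _toy_matmul_fwd(x, w):
    out = x @ w
    # Tag the residuals the backward pass needs, mirroring how the TE forward
    # rules call .checkpoint(...) on the quantized tensors placed in ctx.
    ctx = (checkpoint_name(x, "quantization"), checkpoint_name(w, "quantization"))
    return out, ctx


def _toy_matmul_bwd(ctx, grad_out):
    x, w = ctx
    return grad_out @ w.T, x.T @ grad_out


toy_matmul.defvjp(_toy_matmul_fwd, _toy_matmul_bwd)

grads = jax.grad(lambda x, w: toy_matmul(x, w).sum(), argnums=(0, 1))(
    jnp.ones((4, 8)), jnp.ones((8, 2))
)
```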
