pytorch · Xia-Weiwen · Sep 21, 2025 · Sep 9, 2025 · Sep 18, 2025 · Sep 18, 2025
diff --git a/test/dtypes/test_affine_quantized_float.py b/test/dtypes/test_affine_quantized_float.py
@@ -733,6 +733,46 @@ def test_preprocess_scale_3d_reshape(self):
         expected_shape = (8, 1)  # Flattened (2*2*2, 1)
         self.assertEqual(result.shape, expected_shape)
 
+    @common_utils.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
+    @common_utils.parametrize("hp_dtype", [torch.float32, torch.bfloat16])
+    def test_quantize_dequantize_fp8_inductor(self, float8_dtype, hp_dtype):
+        """Compiled CPU fp8 quantize/dequantize ops must match their eager results.
+
+        Also checks that the generated code mentions each custom op by name.
+        """
+        quantize_affine_float8 = torch.ops.torchao.quantize_affine_float8_cpu
+        dequantize_affine_float8 = torch.ops.torchao.dequantize_affine_float8_cpu
+        hp_input = torch.randn(10, 10)
+        with torch.no_grad():
+            torch._dynamo.reset()
+            scale = torch.tensor(2.0)
+            # Eager reference results.
+            ref_q = quantize_affine_float8(hp_input, scale, float8_dtype=float8_dtype)
+            ref_dq = dequantize_affine_float8(ref_q, scale, output_dtype=hp_dtype)
+            # Compile each op separately and capture the generated code.
+            test_q, (code_q,) = torch._inductor.utils.run_and_get_code(
+                torch.compile(quantize_affine_float8),
+                hp_input,
+                scale,
+                float8_dtype=float8_dtype,
+            )
+            torch.testing.FileCheck().check(
+                "torch.ops.torchao.quantize_affine_float8_cpu.default"
+            ).run(code_q)
+            # Dequantize the compiled quantize output, again via torch.compile.
+            test_dq, (code_dq,) = torch._inductor.utils.run_and_get_code(
+                torch.compile(dequantize_affine_float8),
+                test_q,
+                scale,
+                output_dtype=hp_dtype,
+            )
+            torch.testing.FileCheck().check(
+                "torch.ops.torchao.dequantize_affine_float8_cpu.default"
+            ).run(code_dq)
+            # assert_close takes (actual, expected).
+            torch.testing.assert_close(test_q, ref_q)
+            torch.testing.assert_close(test_dq, ref_dq)
+
     @torch.no_grad()
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @unittest.skipIf(

diff --git a/torchao/quantization/quant_primitives.py b/torchao/quantization/quant_primitives.py
@@ -2310,8 +2310,6 @@ def _quantize_affine_float8(
     return _RoundToFloat8.apply(tensor_clamped, float8_dtype)
 
 
-# TODO: don't register as custom op?
-@_register_custom_op(quant_lib, False)
 def _dequantize_affine_float8(
     tensor: torch.Tensor,
     scale: torch.Tensor,
@@ -2329,8 +2327,49 @@ def _dequantize_affine_float8(
     return hp_tensor.to(output_dtype)
 
 
-@_register_meta_op(quant_lib, "dequantize_affine_float8")
-def _dequantize_affine_float8_meta(
+@_register_custom_op(quant_lib, False, "CPU")  # inductor_decomposed=False, "CPU" key
+def _quantize_affine_float8_cpu(
+    tensor: torch.Tensor,
+    scale: torch.Tensor,
+    float8_dtype: torch.dtype = torch.float8_e4m3fn,
+) -> torch.Tensor:
+    """
+    Quantizes the high precision `tensor` to `float8_dtype` using `scale`, by calling `_quantize_affine_float8`.
+    """
+    return _quantize_affine_float8(
+        tensor=tensor,
+        scale=scale,
+        float8_dtype=float8_dtype,
+    )
+
+
+@_register_meta_op(quant_lib, "quantize_affine_float8_cpu")
+def _quantize_affine_float8_cpu_meta(
+    tensor: torch.Tensor,
+    scale: torch.Tensor,  # unused here; mirrors the op's signature
+    float8_dtype: torch.dtype = torch.float8_e4m3fn,
+) -> torch.Tensor:
+    return torch.empty_like(tensor, dtype=float8_dtype)  # input-sized, float8_dtype
+
+
+@_register_custom_op(quant_lib, False, "CPU")  # inductor_decomposed=False, "CPU" key
+def _dequantize_affine_float8_cpu(
+    tensor: torch.Tensor,
+    scale: torch.Tensor,
+    output_dtype: torch.dtype = torch.float32,
+) -> torch.Tensor:
+    """
+    Dequantizes the float8 `tensor` to an `output_dtype` tensor using `scale`, by calling `_dequantize_affine_float8`.
+    """
+    return _dequantize_affine_float8(
+        tensor=tensor,
+        scale=scale,
+        output_dtype=output_dtype,
+    )
+
+
+@_register_meta_op(quant_lib, "dequantize_affine_float8_cpu")
+def _dequantize_affine_float8_cpu_meta(
     tensor: torch.Tensor,
     scale: torch.Tensor,
     output_dtype: torch.dtype = torch.float32,

diff --git a/torchao/utils.py b/torchao/utils.py
@@ -181,7 +181,9 @@ def find_multiple(n: int, *args: int) -> int:
     return n + k - (n % k)
 
 
-def _register_custom_op(lib, inductor_decomposed=True):
+def _register_custom_op(
+    lib, inductor_decomposed=True, dispatch_key="CompositeImplicitAutograd"
+):
     """This decorator is used to preserve some high level operators for torch.export.export
     while still allow them to be decomposed for inductor path
 
@@ -208,12 +210,6 @@ def _the_op_that_needs_to_be_preserved(...)
     """
     from torch._inductor.decomposition import register_decomposition
 
-    dispatch_key = (
-        "CompositeImplicitAutograd"
-        if inductor_decomposed
-        else "CompositeExplicitAutograd"
-    )
-
     def decorator(fn):
         from torch._library.infer_schema import infer_schema