pytorch
diff --git a/‎test/prototype/moe_training/test_kernels.py‎
Lines changed: 58 additions & 2 deletions b/‎test/prototype/moe_training/test_kernels.py‎
Lines changed: 58 additions & 2 deletions
diff --git a/‎torchao/csrc/cuda/mx_kernels/mxfp8_cuda.cu‎
Lines changed: 68 additions & 0 deletions b/‎torchao/csrc/cuda/mx_kernels/mxfp8_cuda.cu‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp‎
Lines changed: 66 additions & 0 deletions b/‎torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp‎
Lines changed: 66 additions & 0 deletions
@@ -12,7 +12,6 @@
 if not (torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 9):
     pytest.skip("Unsupported PyTorch version", allow_module_level=True)
 
-
 from torchao.prototype.moe_training.kernels.float8_rowwise import (
     triton_fp8_rowwise_3d_transpose_rhs,
     triton_fp8_rowwise_3d_transpose_rhs_fused_reduction,
@@ -38,8 +37,11 @@
     torch_to_float8_per_group_colwise,
     torch_to_float8_per_group_rowwise,
 )
-from torchao.prototype.mx_formats.mx_tensor import to_mx
+from torchao.prototype.mx_formats.mx_tensor import ScaleCalculationMode, to_mx
 from torchao.testing.utils import skip_if_rocm
+from torchao.utils import (
+    is_sm_at_least_100,
+)
 
 
 @skip_if_rocm("ROCm enablement in progress")
@@ -313,3 +315,57 @@ def test_mxfp8_per_group_blocked_scales_2d2d(
         output_group_offsets,
     )
     assert torch.equal(ref_out_scales, triton_out_scales), "blocked scales not equal"
+
+
+@pytest.mark.skipif(
+    not is_sm_at_least_100(),
+    reason="MXFP8 requires CUDA capability 10.0 or greater",
+)
+@pytest.mark.parametrize(
+    "E",
+    (
+        1,
+        2,
+    ),
+)
+@pytest.mark.parametrize("N", (32, 64))
+@pytest.mark.parametrize("K", (32, 64))
+@pytest.mark.parametrize("input_dtype", (torch.bfloat16,))
+@pytest.mark.parametrize("scaling_mode", (ScaleCalculationMode.FLOOR,))
+def test_cuda_mx_dim1_3d_numerics(E, N, K, input_dtype, scaling_mode):
+    from torchao.prototype import mxfp8_cuda
+
+    scaling_mode_str = (
+        "floor" if scaling_mode == ScaleCalculationMode.FLOOR else "rceil"
+    )
+    block_size = 32
+
+    # Use disinct incrementing values from 0 to E*M*K-1 to make debugging easier.
+    x = (
+        torch.arange(0, E * N * K, dtype=input_dtype, device="cuda")
+        .reshape(E, N, K)
+        .contiguous()
+    )
+
+    # Reference implementation
+    s_d1_ref, y_d1_ref = to_mx(
+        x.transpose(-2, -1).contiguous(),
+        elem_dtype=torch.float8_e4m3fn,
+        block_size=block_size,
+    )
+    y_d1_ref = y_d1_ref.transpose(
+        -2, -1
+    )  # (E, K, N//block_size) -> (E, N//block_size, K)
+
+    # CUDA implementation (should work with any stride pattern)
+    y_d1, s_d1 = mxfp8_cuda.quantize_3d(
+        x, scale_dim_n=block_size, scaling_mode=scaling_mode_str
+    )
+    s_d1 = s_d1.transpose(-2, -1)
+
+    # Check scales
+    torch.testing.assert_close(s_d1, s_d1_ref, rtol=0, atol=0)
+
+    # Check quantized values
+    torch.testing.assert_close(y_d1, y_d1_ref, rtol=0, atol=0)
+    assert y_d1.stride() == y_d1_ref.stride(), "quantized tensor strides do not match"
@@ -109,4 +109,72 @@ void mxfp8_quantize_cuda(const torch::Tensor &input,
                            stream);
 }
 
+// Host-side launcher for the 3D MXFP8 quantization kernel.
+//
+// Reads the sizes and strides of `input` (indexed as (E, N, K)) and the
+// strides of the caller-allocated `output_colwise` and `scales_colwise`
+// tensors, then forwards them together with the raw data pointers to
+// MXFP8Quantizer::quantize_3d on the current CUDA stream.
+//
+// Args:
+//   input:          tensor to quantize; its strides are passed through as-is.
+//   output_colwise: destination for the quantized values (written in place).
+//   scales_colwise: destination for the e8m0 scales (written in place).
+//   scale_dim_n:    scaling block size along N.
+//   fp8_format:     output format name, resolved via get_output_dtype().
+//   scaling_mode:   scaling mode name, resolved via get_scaling_mode().
+//
+// No validation happens here; this assumes the caller has already checked
+// rank, dtype and device of the tensors.
+void mxfp8_quantize_3d_cuda(const torch::Tensor &input,
+                             torch::Tensor &output_colwise,
+                             torch::Tensor &scales_colwise,
+                             int64_t scale_dim_n,
+                             const std::string &fp8_format,
+                             const std::string &scaling_mode) {
+
+  // Get tensor properties for 3D tensor (E, N, K)
+  const int64_t E = input.size(0);
+  const int64_t N = input.size(1);
+  const int64_t K = input.size(2);
+
+  // Get data pointers
+  const void *input_ptr = input.data_ptr();
+  void *output_colwise_ptr = output_colwise.data_ptr();
+  e8m0_t *scales_colwise_ptr =
+      reinterpret_cast<e8m0_t *>(scales_colwise.data_ptr());
+
+  // Get CUDA stream
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  // Get strides of scales tensor
+  const int64_t scales_colwise_stride_dim0 = scales_colwise.stride(0);
+  const int64_t scales_colwise_stride_dim1 = scales_colwise.stride(1);
+  const int64_t scales_colwise_stride_dim2 = scales_colwise.stride(2);
+
+  // Get input tensor strides for generic layout support
+  const int64_t input_stride_dim0 = input.stride(0);  // E dimension stride
+  const int64_t input_stride_dim1 = input.stride(1);  // N dimension stride
+  const int64_t input_stride_dim2 = input.stride(2);  // K dimension stride
+
+  // Get output tensor strides (should be col major)
+  const int64_t output_stride_dim0 = output_colwise.stride(0);  // E dimension stride
+  const int64_t output_stride_dim1 = output_colwise.stride(1);  // N dimension stride
+  const int64_t output_stride_dim2 = output_colwise.stride(2);  // K dimension stride
+
+#if defined(DEBUG)
+  // int64_t is not `long` on every platform, so convert explicitly to
+  // `long long` and print with %lld instead of relying on %ld.
+  const auto ll = [](int64_t v) { return static_cast<long long>(v); };
+  printf("mxfp8_quantize_3d_cuda:\n");
+  printf("Quantizing 3D input tensor of size %lld x %lld x %lld\n",
+         ll(E), ll(N), ll(K));
+  printf("scaling_mode: %s\n", scaling_mode.c_str());
+  printf("Scale dim n: %lld\n", ll(scale_dim_n));
+  printf("Output scale shape: %lld x %lld x %lld\n",
+         ll(scales_colwise.size(0)), ll(scales_colwise.size(1)),
+         ll(scales_colwise.size(2)));
+  printf("scales_colwise_stride_dim0 = %lld\n", ll(scales_colwise_stride_dim0));
+  printf("scales_colwise_stride_dim1 = %lld\n", ll(scales_colwise_stride_dim1));
+  printf("scales_colwise_stride_dim2 = %lld\n", ll(scales_colwise_stride_dim2));
+  printf("input_stride_dim0 = %lld\n", ll(input_stride_dim0));
+  printf("input_stride_dim1 = %lld\n", ll(input_stride_dim1));
+  printf("input_stride_dim2 = %lld\n", ll(input_stride_dim2));
+  printf("output_stride_dim0 = %lld\n", ll(output_stride_dim0));
+  printf("output_stride_dim1 = %lld\n", ll(output_stride_dim1));
+  printf("output_stride_dim2 = %lld\n", ll(output_stride_dim2));
+#endif
+
+  // Call the 3D quantization kernel
+  MXFP8Quantizer::quantize_3d(input_ptr,
+                               output_colwise_ptr,
+                               scales_colwise_ptr,
+                               E, N, K,
+                               input_stride_dim0, input_stride_dim1, input_stride_dim2,
+                               output_stride_dim0, output_stride_dim1, output_stride_dim2,
+                               scales_colwise_stride_dim0, scales_colwise_stride_dim1, scales_colwise_stride_dim2,
+                               get_input_dtype(input), get_output_dtype(fp8_format),
+                               scale_dim_n,
+                               get_scaling_mode(scaling_mode),
+                               stream);
+}
+
 } // namespace mxfp8
@@ -18,6 +18,13 @@ void mxfp8_quantize_cuda(const torch::Tensor &input,
                          const std::string &fp8_format,
                          const std::string &scaling_mode);
 
+void mxfp8_quantize_3d_cuda(const torch::Tensor &input,        // forward declaration of the 3D launcher; input: tensor to quantize
+                             torch::Tensor &output_colwise,     // pre-allocated tensor that receives the quantized values
+                             torch::Tensor &scales_colwise,     // pre-allocated tensor that receives the scales
+                             int64_t scale_dim_n,               // scaling block size along N
+                             const std::string &fp8_format,     // FP8 format name, e.g. "e4m3"
+                             const std::string &scaling_mode);  // scaling mode name, e.g. "floor"
+
 // Helper for tensor validation
 void check_cuda_tensor(const torch::Tensor &t, const char *name) {
   TORCH_CHECK(t.is_cuda(), name, " must be a CUDA tensor");
@@ -115,6 +122,60 @@ mxfp8_quantize(torch::Tensor input, bool rowwise, bool colwise,
                          scales_colwise);
 }
 
+// Quantize a 3D tensor to MXFP8 with scaling blocks along its N dimension.
+//
+// Validates the input, allocates both outputs and dispatches to
+// mxfp8_quantize_3d_cuda.
+//
+// Args:
+//   input:        CUDA tensor of shape (E, N, K) with dtype float32, float16
+//                 or bfloat16. N and K must each be at least 32 and a
+//                 multiple of 32. Contiguity is not required.
+//   scale_dim_n:  scaling block size along N; only 32 is accepted.
+//   fp8_format:   format name, checked by validate_fp8_format().
+//   scaling_mode: scaling mode name, forwarded to the CUDA launcher.
+//
+// Returns:
+//   (output_colwise, scales_colwise): the float8_e4m3fn values with shape
+//   (E, N, K) and strides (N*K, 1, N), and the float8_e8m0fnu scales with
+//   shape (E, ceil(N / scale_dim_n), K).
+std::tuple<torch::Tensor, torch::Tensor>
+mxfp8_quantize_3d(torch::Tensor input, int64_t scale_dim_n,
+                  const std::string &fp8_format,
+                  const std::string &scaling_mode) {
+
+  // ---- Input validation ----
+  TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor");
+  // Contiguity is deliberately not checked: the 3D input may be column major.
+  TORCH_CHECK(input.dim() == 3, "input must be 3D");
+  const auto in_dtype = input.scalar_type();
+  TORCH_CHECK(in_dtype == torch::kFloat32 || in_dtype == torch::kFloat16 ||
+                  in_dtype == torch::kBFloat16,
+              "Input must be float32, float16, or bfloat16");
+  TORCH_CHECK(scale_dim_n == 32, "scale_dim_n must be 32 for now");
+  validate_fp8_format(fp8_format);
+
+  // ---- Shape checks required by the 3D kernel ----
+  const auto dims = input.sizes();
+  const int64_t E = dims[0];
+  const int64_t N = dims[1];
+  const int64_t K = dims[2];
+  TORCH_CHECK(N >= 32 && N % 32 == 0, "N must be a multiple of 32");
+  TORCH_CHECK(K >= 32 && K % 32 == 0, "K must be a multiple of 32");
+
+  // No layout requirement beyond this point: the kernel is handed the
+  // input strides and is expected to cope with any stride pattern.
+
+  c10::cuda::CUDAGuard device_guard(input.device());
+
+  // ---- Output allocation (both on the device of the input) ----
+  const auto on_input_device = torch::TensorOptions().device(input.device());
+  const auto fp8_opts = on_input_device.dtype(torch::kFloat8_e4m3fn);
+  const auto scale_opts = on_input_device.dtype(torch::kFloat8_e8m0fnu);
+
+  // Quantized data is column major within each expert
+  // (required for downstream ops).
+  torch::Tensor quantized =
+      torch::empty_strided({E, N, K}, {N * K, 1, N}, fp8_opts);
+
+  // One scale per block of scale_dim_n rows: shape (E, num blocks along N, K).
+  const int64_t n_blocks = (N + scale_dim_n - 1) / scale_dim_n;
+  torch::Tensor scales = torch::empty({E, n_blocks, K}, scale_opts);
+
+  // ---- Kernel dispatch ----
+  mxfp8_quantize_3d_cuda(input, quantized, scales, scale_dim_n, fp8_format,
+                         scaling_mode);
+
+  return std::make_tuple(quantized, scales);
+}
+
 } // namespace mxfp8
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
@@ -125,4 +186,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         py::arg("scale_dim_x") = 32, py::arg("scale_dim_y") = 32,
         py::arg("fp8_format") = "e4m3",
         py::arg("scaling_mode") = "floor");
+
+  m.def("quantize_3d", &mxfp8::mxfp8_quantize_3d, "MXFP8 3D quantization",  // exposes mxfp8_quantize_3d to Python as quantize_3d
+        py::arg("input"), py::arg("scale_dim_n") = 32,  // 32 is the only value mxfp8_quantize_3d accepts
+        py::arg("fp8_format") = "e4m3",
+        py::arg("scaling_mode") = "floor");  // returns the tuple (output_colwise, scales_colwise)
 }