intel · CuiYifeng · Nov 13, 2025 · Nov 3, 2025 · Nov 7, 2025
diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp
@@ -111,6 +111,16 @@ void float8_copy_kernel_xpu(TensorIteratorBase& iter) {
   }
 }
 
+void float4_copy_kernel_xpu(TensorIteratorBase& iter) {
+  // Only a Float4_e2m1fn_x2 source is accepted; any other source is rejected.
+  const ScalarType src_dtype = iter.dtype(1);
+  TORCH_CHECK(
+      src_dtype == kFloat4_e2m1fn_x2,
+      "Copy from ", src_dtype, " to Float4_e2m1fn_x2 has not been supported.");
+  // Same-dtype copy: run CopyScalarFunc through gpu_kernel_nocast.
+  gpu_kernel_nocast(iter, CopyScalarFunc<Float4_e2m1fn_x2>());
+}
+
 void copy_kernel(TensorIteratorBase& iter) {
   ScalarType dtype = iter.common_dtype();
   if (isQIntType(dtype)) {
@@ -119,6 +129,8 @@ void copy_kernel(TensorIteratorBase& iter) {
     });
   } else if (isFloat8Type(iter.dtype(0))) {
     float8_copy_kernel_xpu(iter);
+  } else if (iter.dtype(0) == kFloat4_e2m1fn_x2) {
+    float4_copy_kernel_xpu(iter);
   } else {
     AT_DISPATCH_V2(
         dtype,

diff --git a/test/regressions/test_copy.py b/test/regressions/test_copy.py
@@ -14,8 +14,8 @@
 class TestSimpleCopy(TestCase):
     @dtypes(*float8_types_and(torch.float8_e8m0fnu, torch.float32))
     def test_copy_and_clone(self, dtype):
-        a_cpu = torch.randn(16, 64, 28, 28)
-        b_cpu = torch.randn(16, 64, 28, 28)
+        a_cpu = torch.randn(16, 64, 28, 28).to(dtype)
+        b_cpu = torch.randn(16, 64, 28, 28).to(dtype)
         a_xpu = a_cpu.to(xpu_device)
         b_xpu = b_cpu.to(xpu_device)
         # naive
@@ -27,6 +27,22 @@ def test_copy_and_clone(self, dtype):
         b_xpu = a_xpu.clone(memory_format=torch.channels_last)
         self.assertEqual(b_cpu, b_xpu.to(cpu_device))
 
+    def test_copy_and_clone_float4(self):
+        # Float4_e2m1fn_x2 copy is not implemented on CPU: use uint8 bytes as reference.
+        # randint covers every byte pattern and avoids casting negative floats to uint8.
+        a_cpu = torch.randint(0, 256, (16, 64, 28, 28), dtype=torch.uint8)
+        b_cpu = torch.randint(0, 256, (16, 64, 28, 28), dtype=torch.uint8)
+        a_xpu = a_cpu.to(xpu_device).view(torch.float4_e2m1fn_x2)
+        b_xpu = b_cpu.to(xpu_device).view(torch.float4_e2m1fn_x2)
+
+        b_cpu.copy_(a_cpu)
+        b_xpu.copy_(a_xpu)
+        # Float4_e2m1fn_x2 comparison is not implemented on CPU; compare raw bytes.
+        self.assertEqual(b_cpu, b_xpu.view(torch.uint8).to(cpu_device))
+
+        b_cpu = a_cpu.clone(memory_format=torch.channels_last)
+        b_xpu = a_xpu.clone(memory_format=torch.channels_last)
+        self.assertEqual(b_cpu, b_xpu.view(torch.uint8).to(cpu_device))
+
 
 instantiate_device_type_tests(TestSimpleCopy, globals(), only_for="xpu", allow_xpu=True)