Don't pass pre-allocated GPU tensor to CPU decoding

scotts · scotts · commit a032cb7db1f2 · 2025-09-22T20:36:35.000-07:00
diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp
@@ -169,15 +169,18 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
         outputDims_.width,
         outputDims_.height);
 
+
     outputTensor = preAllocatedOutputTensor.value_or(
         allocateEmptyHWCTensor(outputDims_, torch::kCPU));
 
+
     if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
       createSwsContext(swsFrameContext, avFrame->colorspace);
       prevSwsFrameContext_ = swsFrameContext;
     }
     int resultHeight =
         convertAVFrameToTensorUsingSwScale(avFrame, outputTensor);
+
     // If this check failed, it would mean that the frame wasn't reshaped to
     // the expected height.
     // TODO: Can we do the same check for width?
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -231,9 +231,18 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
 
     FrameOutput cpuFrameOutput;
     cpuInterface->convertAVFrameToFrameOutput(
-        avFrame, cpuFrameOutput, preAllocatedOutputTensor);
+        avFrame, cpuFrameOutput);
+
+    // TODO: Document why preAllocatedOutputTensor is not forwarded here: it
+    // lives on the GPU, but this path decodes on the CPU, so we copy the CPU
+    // result into it afterwards. Still unclear why passing it used to work.
+    if (preAllocatedOutputTensor.has_value()) {
+      preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data);
+      frameOutput.data = preAllocatedOutputTensor.value();
+    } else {
+      frameOutput.data = cpuFrameOutput.data.to(device_);
+    }
 
-    frameOutput.data = cpuFrameOutput.data.to(device_);
     return;
   }
 
diff --git a/test/test_ops.py b/test/test_ops.py
@@ -146,9 +146,13 @@ def test_get_frame_with_info_at_index(self, device):
 
     @pytest.mark.parametrize("device", all_supported_devices())
     def test_get_frames_at_indices(self, device):
+        print("test_get_frames_at_indices")
         decoder = create_from_file(str(NASA_VIDEO.path))
+        print("decoder created")
         add_video_stream(decoder, device=device)
+        print("stream added")
         frames0and180, *_ = get_frames_at_indices(decoder, frame_indices=[0, 180])
+        print("frames retrieved")
         reference_frame0 = NASA_VIDEO.get_frame_data_by_index(0)
         reference_frame180 = NASA_VIDEO.get_frame_data_by_index(
             INDEX_OF_FRAME_AT_6_SECONDS