Skip to content

Commit 48e3ea3

Browse files
committed
Better comments; refactor toTensor
1 parent fc5468e commit 48e3ea3

File tree

5 files changed

+27
-31
lines changed

5 files changed

+27
-31
lines changed

src/torchcodec/_core/CpuDeviceInterface.cpp

Lines changed: 8 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,9 @@ void CpuDeviceInterface::initialize(
5656
timeBase_ = timeBase;
5757
outputDims_ = outputDims;
5858

59-
// TODO: rationalize comment below with new stuff.
60-
// By default, we want to use swscale for color conversion because it is
61-
// faster. However, it has width requirements, so we may need to fall back
62-
// to filtergraph. We also need to respect what was requested from the
63-
// options; we respect the options unconditionally, so it's possible for
64-
// swscale's width requirements to be violated. We don't expose the ability to
65-
// choose color conversion library publicly; we only use this ability
66-
// internally.
59+
// We want to use swscale for color conversion if possible because it is
60+
// faster than filtergraph. The following are the conditions we need to meet
61+
// to use it.
6762

6863
// We can only use swscale when we have a single resize transform. Note that
6964
// this means swscale will not support the case of having several,
@@ -76,12 +71,14 @@ void CpuDeviceInterface::initialize(
7671
// https://stackoverflow.com/questions/74351955/turn-off-sw-scale-conversion-to-planar-yuv-32-byte-alignment-requirements
7772
bool isWidthSwScaleCompatible = (outputDims_.width % 32) == 0;
7873

74+
// Note that we do not expose this capability in the public API, only through
75+
// the core API.
7976
bool userRequestedSwScale = videoStreamOptions_.colorConversionLibrary ==
8077
ColorConversionLibrary::SWSCALE;
8178

8279
// Note that we treat the transform limitation differently from the width
8380
// limitation. That is, we consider the transforms being compatible with
84-
// sws_scale as a hard requirement. If the transforms are not compatiable,
81+
// swscale as a hard requirement. If the transforms are not compatible,
8582
// then we will end up not applying the transforms, and that is wrong.
8683
//
8784
// The width requirement, however, is a soft requirement. Even if we don't
@@ -94,7 +91,7 @@ void CpuDeviceInterface::initialize(
9491
colorConversionLibrary_ = ColorConversionLibrary::SWSCALE;
9592

9693
// We established above that if the transforms are swscale compatible and
97-
// non-empty, then they must have only one transforms, and that transform is
94+
// non-empty, then they must have only one transform, and that transform is
9895
// ResizeTransform.
9996
if (!transforms.empty()) {
10097
auto resize = dynamic_cast<ResizeTransform*>(transforms[0].get());
@@ -207,7 +204,7 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
207204
std::make_unique<FilterGraph>(filtersContext, videoStreamOptions_);
208205
prevFiltersContext_ = std::move(filtersContext);
209206
}
210-
outputTensor = toTensor(filterGraphContext_->convert(avFrame));
207+
outputTensor = rgbAVFrameToTensor(filterGraphContext_->convert(avFrame));
211208

212209
// Similarly to above, if this check fails it means the frame wasn't
213210
// reshaped to its expected dimensions by filtergraph.
@@ -256,21 +253,6 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
256253
return resultHeight;
257254
}
258255

259-
torch::Tensor CpuDeviceInterface::toTensor(const UniqueAVFrame& avFrame) {
260-
TORCH_CHECK_EQ(avFrame->format, AV_PIX_FMT_RGB24);
261-
262-
int height = avFrame->height;
263-
int width = avFrame->width;
264-
std::vector<int64_t> shape = {height, width, 3};
265-
std::vector<int64_t> strides = {avFrame->linesize[0], 3, 1};
266-
AVFrame* avFrameClone = av_frame_clone(avFrame.get());
267-
auto deleter = [avFrameClone](void*) {
268-
UniqueAVFrame avFrameToDelete(avFrameClone);
269-
};
270-
return torch::from_blob(
271-
avFrameClone->data[0], shape, strides, deleter, {torch::kUInt8});
272-
}
273-
274256
void CpuDeviceInterface::createSwsContext(
275257
const SwsFrameContext& swsFrameContext,
276258
const enum AVColorSpace colorspace) {

src/torchcodec/_core/CpuDeviceInterface.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,6 @@ class CpuDeviceInterface : public DeviceInterface {
3636
std::optional<torch::Tensor> preAllocatedOutputTensor =
3737
std::nullopt) override;
3838

39-
torch::Tensor toTensor(const UniqueAVFrame& avFrame);
40-
4139
private:
4240
int convertAVFrameToTensorUsingSwScale(
4341
const UniqueAVFrame& avFrame,

src/torchcodec/_core/CudaDeviceInterface.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#include <mutex>
66

77
#include "src/torchcodec/_core/Cache.h"
8-
#include "src/torchcodec/_core/CpuDeviceInterface.h"
98
#include "src/torchcodec/_core/CudaDeviceInterface.h"
109
#include "src/torchcodec/_core/FFMPEGCommon.h"
1110

@@ -344,7 +343,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
344343
// handle it. We send the frame back to the CUDA device when we're done.
345344
//
346345
// TODO: Perhaps we should cache cpuInterface?
347-
auto cpuInterface = std::make_unique<CpuDeviceInterface>(torch::kCPU);
346+
auto cpuInterface = createDeviceInterface(torch::kCPU);
348347
TORCH_CHECK(
349348
cpuInterface != nullptr, "Failed to create CPU device interface");
350349
cpuInterface->initialize(
@@ -360,7 +359,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
360359
avFrame->height == outputDims_.height) {
361360
// Reason 1 above. The frame is already in the format and dimensions that
362361
// we need, we just need to convert it to a tensor.
363-
cpuFrameOutput.data = cpuInterface->toTensor(avFrame);
362+
cpuFrameOutput.data = rgbAVFrameToTensor(avFrame);
364363
} else {
365364
// Reason 2 above. We need to do a full conversion.
366365
cpuInterface->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);

src/torchcodec/_core/DeviceInterface.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,4 +76,19 @@ std::unique_ptr<DeviceInterface> createDeviceInterface(
7676
return std::unique_ptr<DeviceInterface>(deviceMap[deviceType](device));
7777
}
7878

79+
torch::Tensor rgbAVFrameToTensor(const UniqueAVFrame& avFrame) {
80+
TORCH_CHECK_EQ(avFrame->format, AV_PIX_FMT_RGB24);
81+
82+
int height = avFrame->height;
83+
int width = avFrame->width;
84+
std::vector<int64_t> shape = {height, width, 3};
85+
std::vector<int64_t> strides = {avFrame->linesize[0], 3, 1};
86+
AVFrame* avFrameClone = av_frame_clone(avFrame.get());
87+
auto deleter = [avFrameClone](void*) {
88+
UniqueAVFrame avFrameToDelete(avFrameClone);
89+
};
90+
return torch::from_blob(
91+
avFrameClone->data[0], shape, strides, deleter, {torch::kUInt8});
92+
}
93+
7994
} // namespace facebook::torchcodec

src/torchcodec/_core/DeviceInterface.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,4 +60,6 @@ torch::Device createTorchDevice(const std::string device);
6060
std::unique_ptr<DeviceInterface> createDeviceInterface(
6161
const torch::Device& device);
6262

63+
torch::Tensor rgbAVFrameToTensor(const UniqueAVFrame& avFrame);
64+
6365
} // namespace facebook::torchcodec

0 commit comments

Comments
 (0)