Refactor pybind_ops to only deal with file like context holders

scotts · scotts · commit 2828dc5d29a1 · 2025-09-10T08:45:13.000-07:00
diff --git a/src/torchcodec/_core/AVIOFileLikeContext.h b/src/torchcodec/_core/AVIOFileLikeContext.h
@@ -17,7 +17,8 @@ namespace facebook::torchcodec {
 
-// Enables uers to pass in a Python file-like object. We then forward all read
+// Enables users to pass in a Python file-like object. We then forward all read
 // and seek calls back up to the methods on the Python object.
-class AVIOFileLikeContext : public AVIOContextHolder {
+class __attribute__((visibility("hidden"))) AVIOFileLikeContext
+    : public AVIOContextHolder {
  public:
   explicit AVIOFileLikeContext(const py::object& fileLike, bool isForWriting);
 
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -10,6 +10,7 @@
 #include <string>
 #include "c10/core/SymIntArrayRef.h"
 #include "c10/util/Exception.h"
+#include "src/torchcodec/_core/AVIOFileLikeContext.h"
 #include "src/torchcodec/_core/AVIOTensorContext.h"
 #include "src/torchcodec/_core/Encoder.h"
 #include "src/torchcodec/_core/SingleStreamDecoder.h"
@@ -33,8 +34,12 @@ TORCH_LIBRARY(torchcodec_ns, m) {
       "encode_audio_to_file(Tensor samples, int sample_rate, str filename, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
   m.def(
       "encode_audio_to_tensor(Tensor samples, int sample_rate, str format, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> Tensor");
+  m.def(
+      "_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
   m.def(
       "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
+  m.def(
+      "_create_from_file_like(int file_like_context, str? seek_mode=None) -> Tensor");
   m.def("_convert_to_tensor(int decoder_ptr) -> Tensor");
   m.def(
       "_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
@@ -210,6 +215,32 @@ at::Tensor create_from_tensor(
   return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
 }
 
+// Creates a decoder whose input is read through a file-like context.
+//
+// file_like_context is an AVIOFileLikeContext* passed as an integer (see
+// create_file_like_context in pybind_ops.cpp). This function takes ownership
+// of that pointer: it is freed if decoder creation throws, and is otherwise
+// handed to the SingleStreamDecoder. seek_mode defaults to exact when absent.
+at::Tensor _create_from_file_like(
+    int64_t file_like_context,
+    std::optional<std::string_view> seek_mode) {
+  auto fileLikeContext =
+      reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
+  TORCH_CHECK(
+      fileLikeContext != nullptr, "file_like_context must be a valid pointer");
+  // Take ownership immediately so nothing below can leak the context.
+  std::unique_ptr<AVIOFileLikeContext> contextHolder(fileLikeContext);
+
+  SingleStreamDecoder::SeekMode realSeek = SingleStreamDecoder::SeekMode::exact;
+  if (seek_mode.has_value()) {
+    realSeek = seekModeFromString(seek_mode.value());
+  }
+
+  std::unique_ptr<SingleStreamDecoder> uniqueDecoder =
+      std::make_unique<SingleStreamDecoder>(std::move(contextHolder), realSeek);
+  return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
+}
+
 at::Tensor _convert_to_tensor(int64_t decoder_ptr) {
   auto decoder = reinterpret_cast<SingleStreamDecoder*>(decoder_ptr);
   std::unique_ptr<SingleStreamDecoder> uniqueDecoder(decoder);
@@ -441,6 +464,41 @@ at::Tensor encode_audio_to_tensor(
       .encodeToTensor();
 }
 
+// Encodes samples and writes the result through a file-like context.
+//
+// file_like_context carries an AVIOFileLikeContext* as an integer; ownership
+// is taken here once the pointer has been checked. The optional arguments go
+// through validateOptionalInt64ToInt before landing in the stream options.
+void _encode_audio_to_file_like(
+    const at::Tensor& samples,
+    int64_t sample_rate,
+    std::string_view format,
+    int64_t file_like_context,
+    std::optional<int64_t> bit_rate = std::nullopt,
+    std::optional<int64_t> num_channels = std::nullopt,
+    std::optional<int64_t> desired_sample_rate = std::nullopt) {
+  auto* rawContext = reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
+  TORCH_CHECK(
+      rawContext != nullptr, "file_like_context must be a valid pointer");
+  // Owning the pointer from here on means anything that throws below frees it.
+  std::unique_ptr<AVIOFileLikeContext> ownedContext{rawContext};
+
+  AudioStreamOptions streamOptions;
+  streamOptions.bitRate = validateOptionalInt64ToInt(bit_rate, "bit_rate");
+  streamOptions.numChannels =
+      validateOptionalInt64ToInt(num_channels, "num_channels");
+  streamOptions.sampleRate =
+      validateOptionalInt64ToInt(desired_sample_rate, "desired_sample_rate");
+
+  AudioEncoder encoder(
+      samples,
+      validateInt64ToInt(sample_rate, "sample_rate"),
+      format,
+      std::move(ownedContext),
+      streamOptions);
+  encoder.encode();
+}
+
 // For testing only. We need to implement this operation as a core library
 // function because what we're testing is round-tripping pts values as
 // double-precision floating point numbers from C++ to Python and back to C++.
@@ -694,6 +747,7 @@ void scan_all_streams_to_update_metadata(at::Tensor& decoder) {
 TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) {
   m.impl("create_from_file", &create_from_file);
   m.impl("create_from_tensor", &create_from_tensor);
+  m.impl("_create_from_file_like", &_create_from_file_like);
   m.impl("_convert_to_tensor", &_convert_to_tensor);
   m.impl(
       "_get_json_ffmpeg_library_versions", &_get_json_ffmpeg_library_versions);
@@ -702,6 +756,7 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) {
 TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {
   m.impl("encode_audio_to_file", &encode_audio_to_file);
   m.impl("encode_audio_to_tensor", &encode_audio_to_tensor);
+  m.impl("_encode_audio_to_file_like", &_encode_audio_to_file_like);
   m.impl("seek_to_pts", &seek_to_pts);
   m.impl("add_video_stream", &add_video_stream);
   m.impl("_add_video_stream", &_add_video_stream);
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
@@ -95,9 +95,15 @@ def load_torchcodec_shared_libraries():
 encode_audio_to_tensor = torch._dynamo.disallow_in_graph(
     torch.ops.torchcodec_ns.encode_audio_to_tensor.default
 )
+_encode_audio_to_file_like = torch._dynamo.disallow_in_graph(
+    torch.ops.torchcodec_ns._encode_audio_to_file_like.default
+)
 create_from_tensor = torch._dynamo.disallow_in_graph(
     torch.ops.torchcodec_ns.create_from_tensor.default
 )
+_create_from_file_like = torch._dynamo.disallow_in_graph(
+    torch.ops.torchcodec_ns._create_from_file_like.default
+)
 _convert_to_tensor = torch._dynamo.disallow_in_graph(
     torch.ops.torchcodec_ns._convert_to_tensor.default
 )
@@ -148,7 +154,12 @@ def create_from_file_like(
     file_like: Union[io.RawIOBase, io.BufferedReader], seek_mode: Optional[str] = None
 ) -> torch.Tensor:
     assert _pybind_ops is not None
-    return _convert_to_tensor(_pybind_ops.create_from_file_like(file_like, seek_mode))
+    return _create_from_file_like(
+        _pybind_ops.create_file_like_context(
+            file_like, False  # False means not for writing
+        ),
+        seek_mode,
+    )
 
 
 def encode_audio_to_file_like(
@@ -176,36 +187,16 @@ def encode_audio_to_file_like(
     if samples.dtype != torch.float32:
         raise ValueError(f"samples must have dtype torch.float32, got {samples.dtype}")
 
-    # We're having the same problem as with the decoder's create_from_file_like:
-    # We should be able to pass a tensor directly, but this leads to a pybind
-    # error. In order to work around this, we pass the pointer to the tensor's
-    # data, and its shape, in order to re-construct it in C++. For this to work:
-    # - the tensor must be float32
-    # - the tensor  must be contiguous, which is why we call contiguous().
-    #   In theory we could avoid this restriction by also passing the strides?
-    # - IMPORTANT: the input samples tensor and its underlying data must be
-    #   alive during the call.
-    #
-    # A more elegant solution would be to cast the tensor into a py::object, but
-    # casting the py::object backk to a tensor in C++ seems to lead to the same
-    # pybing error.
-
-    samples = samples.contiguous()
-    _pybind_ops.encode_audio_to_file_like(
-        samples.data_ptr(),
-        list(samples.shape),
+    _encode_audio_to_file_like(
+        samples,
         sample_rate,
         format,
-        file_like,
+        _pybind_ops.create_file_like_context(file_like, True),  # True means for writing
         bit_rate,
         num_channels,
         desired_sample_rate,
     )
 
-    # This check is useless but it's critical to keep it to ensures that samples
-    # is still alive during the call to encode_audio_to_file_like.
-    assert samples.is_contiguous()
-
 
 # ==============================
 # Abstract impl for the operators. Needed by torch.compile.
@@ -215,6 +206,13 @@ def create_from_file_abstract(filename: str, seek_mode: Optional[str]) -> torch.
     return torch.empty([], dtype=torch.long)
 
 
+@register_fake("torchcodec_ns::_create_from_file_like")
+def _create_from_file_like_abstract(
+    file_like_context: int, seek_mode: Optional[str]
+) -> torch.Tensor:
+    return torch.empty([], dtype=torch.long)  # stand-in; no decoder is created
+
+
 @register_fake("torchcodec_ns::encode_audio_to_file")
 def encode_audio_to_file_abstract(
     samples: torch.Tensor,
@@ -239,6 +237,19 @@ def encode_audio_to_tensor_abstract(
     return torch.empty([], dtype=torch.long)
 
 
+@register_fake("torchcodec_ns::_encode_audio_to_file_like")
+def _encode_audio_to_file_like_abstract(
+    samples: torch.Tensor,
+    sample_rate: int,
+    format: str,
+    file_like_context: int,
+    bit_rate: Optional[int] = None,
+    num_channels: Optional[int] = None,
+    desired_sample_rate: Optional[int] = None,
+) -> None:
+    return  # the op's schema returns "()", so the fake must not return a tensor
+
+
 @register_fake("torchcodec_ns::create_from_tensor")
 def create_from_tensor_abstract(
     video_tensor: torch.Tensor, seek_mode: Optional[str]
diff --git a/src/torchcodec/_core/pybind_ops.cpp b/src/torchcodec/_core/pybind_ops.cpp
@@ -7,13 +7,8 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <cstdint>
-#include <string>
 
 #include "src/torchcodec/_core/AVIOFileLikeContext.h"
-#include "src/torchcodec/_core/Encoder.h"
-#include "src/torchcodec/_core/SingleStreamDecoder.h"
-#include "src/torchcodec/_core/StreamOptions.h"
-#include "src/torchcodec/_core/ValidationUtils.h"
 
 namespace py = pybind11;
 
@@ -26,62 +21,18 @@ namespace facebook::torchcodec {
 //
 // So we instead launder the pointer through an int, and then use a conversion
 // function on the custom ops side to launder that int into a tensor.
-int64_t create_from_file_like(
-    py::object file_like,
-    std::optional<std::string_view> seek_mode) {
-  SingleStreamDecoder::SeekMode realSeek = SingleStreamDecoder::SeekMode::exact;
-  if (seek_mode.has_value()) {
-    realSeek = seekModeFromString(seek_mode.value());
-  }
-
-  auto avioContextHolder =
-      std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/false);
-
-  SingleStreamDecoder* decoder =
-      new SingleStreamDecoder(std::move(avioContextHolder), realSeek);
-  return reinterpret_cast<int64_t>(decoder);
-}
-
-void encode_audio_to_file_like(
-    int64_t data_ptr,
-    const std::vector<int64_t>& shape,
-    int64_t sample_rate,
-    std::string_view format,
-    py::object file_like,
-    std::optional<int64_t> bit_rate = std::nullopt,
-    std::optional<int64_t> num_channels = std::nullopt,
-    std::optional<int64_t> desired_sample_rate = std::nullopt) {
-  // We assume float32 *and* contiguity, this must be enforced by the caller.
-  auto tensor_options = torch::TensorOptions().dtype(torch::kFloat32);
-  auto samples = torch::from_blob(
-      reinterpret_cast<void*>(data_ptr), shape, tensor_options);
-
-  AudioStreamOptions audioStreamOptions;
-  audioStreamOptions.bitRate = validateOptionalInt64ToInt(bit_rate, "bit_rate");
-  audioStreamOptions.numChannels =
-      validateOptionalInt64ToInt(num_channels, "num_channels");
-  audioStreamOptions.sampleRate =
-      validateOptionalInt64ToInt(desired_sample_rate, "desired_sample_rate");
-
-  auto avioContextHolder =
-      std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/true);
-
-  AudioEncoder encoder(
-      samples,
-      validateInt64ToInt(sample_rate, "sample_rate"),
-      format,
-      std::move(avioContextHolder),
-      audioStreamOptions);
-  encoder.encode();
+int64_t create_file_like_context(py::object file_like, bool is_for_writing) {
+  // The caller owns the returned context; nothing here frees it.
+  auto* rawContext = new AVIOFileLikeContext(file_like, is_for_writing);
+  return reinterpret_cast<int64_t>(rawContext);
 }
 
 #ifndef PYBIND_OPS_MODULE_NAME
 #error PYBIND_OPS_MODULE_NAME must be defined!
 #endif
 
 PYBIND11_MODULE(PYBIND_OPS_MODULE_NAME, m) {
-  m.def("create_from_file_like", &create_from_file_like);
-  m.def("encode_audio_to_file_like", &encode_audio_to_file_like);
+  m.def("create_file_like_context", &create_file_like_context);
 }
 
 } // namespace facebook::torchcodec