BETA CUDA interface: AV1 support (#920)

NicolasHug · web-flow · commit bcbb88989d8b · 2025-10-04T10:34:37.000+01:00
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -143,8 +143,9 @@ cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
       return cudaVideoCodec_H264;
     case AV_CODEC_ID_HEVC:
       return cudaVideoCodec_HEVC;
+    case AV_CODEC_ID_AV1:
+      return cudaVideoCodec_AV1;
     // TODONVDEC P0: support more codecs
-    // case AV_CODEC_ID_AV1: return cudaVideoCodec_AV1;
     // case AV_CODEC_ID_MPEG4: return cudaVideoCodec_MPEG4;
     // case AV_CODEC_ID_VP8: return cudaVideoCodec_VP8;
     // case AV_CODEC_ID_VP9: return cudaVideoCodec_VP9;
@@ -195,6 +196,7 @@ void BetaCudaDeviceInterface::initialize(
 
   TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
   timeBase_ = avStream->time_base;
+  frameRateAvgFromFFmpeg_ = avStream->r_frame_rate;
 
   const AVCodecParameters* codecPar = avStream->codecpar;
   TORCH_CHECK(codecPar != nullptr, "CodecParameters cannot be null");
@@ -494,14 +496,19 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
   avFrame->format = AV_PIX_FMT_CUDA;
   avFrame->pts = dispInfo.timestamp;
 
-  // TODONVDEC P0: Zero division error!!!
-  // TODONVDEC P0: Move AVRational arithmetic to FFMPEGCommon, and put the
-  // similar SingleStreamDecoder stuff there too.
-  unsigned int frameRateNum = videoFormat_.frame_rate.numerator;
-  unsigned int frameRateDen = videoFormat_.frame_rate.denominator;
-  int64_t duration = static_cast<int64_t>((frameRateDen * timeBase_.den)) /
-      (frameRateNum * timeBase_.num);
-  setDuration(avFrame, duration);
+  // TODONVDEC P2: We compute the duration based on average frame rate info:
+  // either from NVCUVID if it's valid, otherwise from FFmpeg as fallback. But
+  // both of these are based on average frame rate, so if the video has
+  // variable frame rate, the durations may be off. We should try to see if we
+  // can set the duration more accurately. Unfortunately it's not given by
+  // dispInfo. One option would be to set it based on the pts difference between
+  // consecutive frames, if the next frame is already available.
+  int frameRateNum = static_cast<int>(videoFormat_.frame_rate.numerator);
+  int frameRateDen = static_cast<int>(videoFormat_.frame_rate.denominator);
+  AVRational frameRate = (frameRateNum > 0 && frameRateDen > 0)
+      ? AVRational{frameRateNum, frameRateDen}
+      : frameRateAvgFromFFmpeg_;
+  setDuration(avFrame, computeSafeDuration(frameRate, timeBase_));
 
   // We need to assign the frame colorspace. This is crucial for proper color
   // conversion. NVCUVID stores that in the matrix_coefficients field, but
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -84,7 +84,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   // isFlushing_)
   bool isFlushing_ = false;
 
-  AVRational timeBase_ = {0, 0};
+  AVRational timeBase_ = {0, 1};
+  AVRational frameRateAvgFromFFmpeg_ = {0, 1};
 
   UniqueAVBSFContext bitstreamFilter_;
 
diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp
@@ -501,4 +501,26 @@ AVIOContext* avioAllocContext(
       seek);
 }
 
+double ptsToSeconds(int64_t pts, const AVRational& timeBase) {
+  // To perform the multiplication before the division, av_q2d is not used
+  return static_cast<double>(pts) * timeBase.num / timeBase.den;
+}
+
+int64_t secondsToClosestPts(double seconds, const AVRational& timeBase) {
+  return static_cast<int64_t>(
+      std::round(seconds * timeBase.den / timeBase.num));
+}
+
+int64_t computeSafeDuration(
+    const AVRational& frameRate,
+    const AVRational& timeBase) {
+  if (frameRate.num <= 0 || frameRate.den <= 0 || timeBase.num <= 0 ||
+      timeBase.den <= 0) {
+    return 0;
+  } else {
+    return (static_cast<int64_t>(frameRate.den) * timeBase.den) /
+        (static_cast<int64_t>(timeBase.num) * frameRate.num);
+  }
+}
+
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
@@ -232,4 +232,10 @@ AVIOContext* avioAllocContext(
     AVIOWriteFunction write_packet,
     AVIOSeekFunction seek);
 
+double ptsToSeconds(int64_t pts, const AVRational& timeBase);
+int64_t secondsToClosestPts(double seconds, const AVRational& timeBase);
+int64_t computeSafeDuration(
+    const AVRational& frameRate,
+    const AVRational& timeBase);
+
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -17,16 +17,6 @@
 namespace facebook::torchcodec {
 namespace {
 
-double ptsToSeconds(int64_t pts, const AVRational& timeBase) {
-  // To perform the multiplication before the division, av_q2d is not used
-  return static_cast<double>(pts) * timeBase.num / timeBase.den;
-}
-
-int64_t secondsToClosestPts(double seconds, const AVRational& timeBase) {
-  return static_cast<int64_t>(
-      std::round(seconds * timeBase.den / timeBase.num));
-}
-
 // Some videos aren't properly encoded and do not specify pts values for
 // packets, and thus for frames. Unset values correspond to INT64_MIN. When that
 // happens, we fallback to the dts value which hopefully exists and is correct.
diff --git a/test/test_decoders.py b/test/test_decoders.py
@@ -1417,13 +1417,23 @@ def test_get_frames_at_tensor_indices(self):
 
     @needs_cuda
     @pytest.mark.parametrize(
-        "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
+        "asset",
+        (
+            NASA_VIDEO,
+            TEST_SRC_2_720P,
+            BT709_FULL_RANGE,
+            TEST_SRC_2_720P_H265,
+            AV1_VIDEO,
+        ),
     )
     @pytest.mark.parametrize("contiguous_indices", (True, False))
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_get_frame_at(
         self, asset, contiguous_indices, seek_mode
     ):
+        if asset == AV1_VIDEO and seek_mode == "approximate":
+            pytest.skip("AV1 asset doesn't work with approximate mode")
+
         ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
         beta_decoder = VideoDecoder(
             asset.path, device="cuda:0:beta", seek_mode=seek_mode
@@ -1449,13 +1459,23 @@ def test_beta_cuda_interface_get_frame_at(
 
     @needs_cuda
     @pytest.mark.parametrize(
-        "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
+        "asset",
+        (
+            NASA_VIDEO,
+            TEST_SRC_2_720P,
+            BT709_FULL_RANGE,
+            TEST_SRC_2_720P_H265,
+            AV1_VIDEO,
+        ),
     )
     @pytest.mark.parametrize("contiguous_indices", (True, False))
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_get_frames_at(
         self, asset, contiguous_indices, seek_mode
     ):
+        if asset == AV1_VIDEO and seek_mode == "approximate":
+            pytest.skip("AV1 asset doesn't work with approximate mode")
+
         ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
         beta_decoder = VideoDecoder(
             asset.path, device="cuda:0:beta", seek_mode=seek_mode
@@ -1482,10 +1502,20 @@ def test_beta_cuda_interface_get_frames_at(
 
     @needs_cuda
     @pytest.mark.parametrize(
-        "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
+        "asset",
+        (
+            NASA_VIDEO,
+            TEST_SRC_2_720P,
+            BT709_FULL_RANGE,
+            TEST_SRC_2_720P_H265,
+            AV1_VIDEO,
+        ),
     )
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode):
+        if asset == AV1_VIDEO and seek_mode == "approximate":
+            pytest.skip("AV1 asset doesn't work with approximate mode")
+
         ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
         beta_decoder = VideoDecoder(
             asset.path, device="cuda:0:beta", seek_mode=seek_mode
@@ -1499,17 +1529,30 @@ def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode):
         for pts in timestamps:
             ref_frame = ref_decoder.get_frame_played_at(pts)
             beta_frame = beta_decoder.get_frame_played_at(pts)
-            torch.testing.assert_close(beta_frame.data, ref_frame.data, rtol=0, atol=0)
+            if get_ffmpeg_major_version() > 4:  # TODONVDEC P1 see above
+                torch.testing.assert_close(
+                    beta_frame.data, ref_frame.data, rtol=0, atol=0
+                )
 
             assert beta_frame.pts_seconds == ref_frame.pts_seconds
             assert beta_frame.duration_seconds == ref_frame.duration_seconds
 
     @needs_cuda
     @pytest.mark.parametrize(
-        "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
+        "asset",
+        (
+            NASA_VIDEO,
+            TEST_SRC_2_720P,
+            BT709_FULL_RANGE,
+            TEST_SRC_2_720P_H265,
+            AV1_VIDEO,
+        ),
     )
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode):
+        if asset == AV1_VIDEO and seek_mode == "approximate":
+            pytest.skip("AV1 asset doesn't work with approximate mode")
+
         ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
         beta_decoder = VideoDecoder(
             asset.path, device="cuda:0:beta", seek_mode=seek_mode
@@ -1523,18 +1566,30 @@ def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode):
 
         ref_frames = ref_decoder.get_frames_played_at(timestamps)
         beta_frames = beta_decoder.get_frames_played_at(timestamps)
-        torch.testing.assert_close(beta_frames.data, ref_frames.data, rtol=0, atol=0)
+        if get_ffmpeg_major_version() > 4:  # TODONVDEC P1 see above
+            torch.testing.assert_close(
+                beta_frames.data, ref_frames.data, rtol=0, atol=0
+            )
         torch.testing.assert_close(beta_frames.pts_seconds, ref_frames.pts_seconds)
         torch.testing.assert_close(
             beta_frames.duration_seconds, ref_frames.duration_seconds
         )
 
     @needs_cuda
     @pytest.mark.parametrize(
-        "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
+        "asset",
+        (
+            NASA_VIDEO,
+            TEST_SRC_2_720P,
+            BT709_FULL_RANGE,
+            TEST_SRC_2_720P_H265,
+            AV1_VIDEO,
+        ),
     )
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_backwards(self, asset, seek_mode):
+        if asset == AV1_VIDEO and seek_mode == "approximate":
+            pytest.skip("AV1 asset doesn't work with approximate mode")
 
         ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
         beta_decoder = VideoDecoder(
@@ -1543,11 +1598,20 @@ def test_beta_cuda_interface_backwards(self, asset, seek_mode):
 
         assert ref_decoder.metadata == beta_decoder.metadata
 
-        for frame_index in [0, 100, 10, 50, 20, 200, 150, 389]:
+        for frame_index in [0, 1, 2, 1, 0, 100, 10, 50, 20, 200, 150, 150, 150, 389, 2]:
+            # This is ugly, but OK: the indices values above are relevant for
+            # the NASA_VIDEO.  We need to avoid going out of bounds for other
+            # videos so we cap the frame_index. This test still serves its
+            # purpose: no matter what the range of the video, we're still doing
+            # backwards seeks.
             frame_index = min(frame_index, len(ref_decoder) - 1)
+
             ref_frame = ref_decoder.get_frame_at(frame_index)
             beta_frame = beta_decoder.get_frame_at(frame_index)
-            torch.testing.assert_close(beta_frame.data, ref_frame.data, rtol=0, atol=0)
+            if get_ffmpeg_major_version() > 4:  # TODONVDEC P1 see above
+                torch.testing.assert_close(
+                    beta_frame.data, ref_frame.data, rtol=0, atol=0
+                )
 
             assert beta_frame.pts_seconds == ref_frame.pts_seconds
             assert beta_frame.duration_seconds == ref_frame.duration_seconds
@@ -1568,8 +1632,6 @@ def test_beta_cuda_interface_small_h265(self):
 
     @needs_cuda
     def test_beta_cuda_interface_error(self):
-        with pytest.raises(RuntimeError, match="Unsupported codec type: av1"):
-            VideoDecoder(AV1_VIDEO.path, device="cuda:0:beta")
         with pytest.raises(RuntimeError, match="Unsupported device"):
             VideoDecoder(NASA_VIDEO.path, device="cuda:0:bad_variant")