Skip to content

Commit 7494259

Browse files
committed
Merge branch 'main' of github.com:pytorch/torchcodec into aeaenjfjanef
2 parents b5fe9bc + e13e410 commit 7494259

File tree

8 files changed

+293
-32
lines changed

8 files changed

+293
-32
lines changed

.github/workflows/linux_cuda_wheel.yaml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,9 @@ jobs:
6767
# For the actual release we should add that label and change this to
6868
# include more python versions.
6969
python-version: ['3.10']
70-
# We test against 12.6 to avoid having too big of a CI matrix,
70+
# We test against 12.6 and 13.0 to avoid having too big of a CI matrix,
7171
# but for releases we should add 12.8.
72-
# TODO add 13.0!
73-
cuda-version: ['12.6']
72+
cuda-version: ['12.6', '13.0']
7473
# TODO: put back ffmpeg 5 https://github.com/pytorch/torchcodec/issues/325
7574
ffmpeg-version-for-tests: ['4.4.2', '6', '7']
7675

docs/source/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ def __call__(self, filename):
8181
"approximate_mode.py",
8282
"sampling.py",
8383
"parallel_decoding.py",
84+
"custom_frame_mappings.py",
8485
]
8586
else:
8687
assert "examples/encoding" in self.src_dir
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""
8+
====================================
9+
Decoding with custom frame mappings
10+
====================================
11+
12+
In this example, we will describe the ``custom_frame_mappings`` parameter of the
13+
:class:`~torchcodec.decoders.VideoDecoder` class.
14+
This parameter allows you to provide pre-computed frame mapping information to
15+
speed up :class:`~torchcodec.decoders.VideoDecoder` instantiation, while
16+
maintaining the frame seeking accuracy of ``seek_mode="exact"``.
17+
18+
This makes it ideal for workflows where:
19+
20+
1. Frame accuracy is critical, so :doc:`approximate mode <approximate_mode>` cannot be used
21+
2. Videos can be preprocessed once and then decoded many times
22+
"""
23+
24+
# %%
25+
# First, some boilerplate: we'll download a short video from the web, and
26+
# use ffmpeg to create a longer version by repeating it multiple times. We'll end up
27+
# with two videos: a short one of approximately 14 seconds and a long one of about 12 minutes.
28+
# You can ignore this part and skip below to :ref:`frame_mappings_creation`.
29+
30+
import tempfile
31+
from pathlib import Path
32+
import subprocess
33+
import requests
34+
35+
# Video source: https://www.pexels.com/video/dog-eating-854132/
36+
# License: CC0. Author: Coverr.
37+
# Download the short clip; any non-200 answer aborts the example early.
url = "https://videos.pexels.com/video-files/854132/854132-sd_640_360_25fps.mp4"
response = requests.get(url, headers={"User-Agent": ""})
if response.status_code != 200:
    raise RuntimeError(f"Failed to download video. {response.status_code = }.")

temp_dir = tempfile.mkdtemp()
short_video_path = Path(temp_dir) / "short_video.mp4"
# The request is not streamed, so the whole body is already in memory. Write
# it in a single call rather than looping over ``iter_content()``, whose
# default chunk size is one byte (i.e. one ``write`` call per byte).
short_video_path.write_bytes(response.content)
47+
48+
long_video_path = Path(temp_dir) / "long_video.mp4"
49+
ffmpeg_command = [
50+
"ffmpeg",
51+
"-stream_loop", "50", # repeat video 50 times to get a ~12 min video
52+
"-i", f"{short_video_path}",
53+
"-c", "copy",
54+
f"{long_video_path}"
55+
]
56+
subprocess.run(ffmpeg_command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
57+
58+
from torchcodec.decoders import VideoDecoder
59+
print(f"Short video duration: {VideoDecoder(short_video_path).metadata.duration_seconds} seconds")
60+
print(f"Long video duration: {VideoDecoder(long_video_path).metadata.duration_seconds / 60} minutes")
61+
62+
# %%
63+
# .. _frame_mappings_creation:
64+
#
65+
# Creating custom frame mappings with ffprobe
66+
# -------------------------------------------
67+
#
68+
# To generate JSON files containing the required video metadata, we recommend using ffprobe.
69+
# The following frame metadata fields are needed
70+
# (the ``pkt_`` prefix is needed for older versions of FFmpeg):
71+
#
72+
# - ``pts`` / ``pkt_pts``: Presentation timestamps for each frame
73+
# - ``duration`` / ``pkt_duration``: Duration of each frame
74+
# - ``key_frame``: Boolean indicating which frames are key frames
75+
76+
from pathlib import Path
77+
import subprocess
78+
import tempfile
79+
from time import perf_counter_ns
80+
import json
81+
82+
83+
# Let's define a simple function that runs ffprobe on a given stream index of a video, then writes the results to output_json_path.
84+
def generate_frame_mappings(video_path, output_json_path, stream_index):
    """Dump per-frame ffprobe metadata of one stream into a JSON file.

    Runs ``ffprobe`` on ``video_path``, restricted to ``stream_index``, asking
    for the ``pts``, ``duration`` and ``key_frame`` entries of every frame.
    The command line is printed, and ffprobe's JSON output is written
    verbatim to ``output_json_path``.

    Raises:
        subprocess.CalledProcessError: if ffprobe exits with a non-zero status.
    """
    command = [
        "ffprobe",
        "-i", str(video_path),
        "-select_streams", str(stream_index),
        "-show_frames",
        "-show_entries", "frame=pts,duration,key_frame",
        "-of", "json",
    ]
    printable_command = " ".join(command)
    print(f"Running ffprobe:\n{printable_command}\n")

    completed = subprocess.run(command, check=True, capture_output=True, text=True)
    with open(output_json_path, "w") as json_file:
        json_file.write(completed.stdout)
90+
91+
92+
stream_index = 0
93+
long_json_path = Path(temp_dir) / "long_custom_frame_mappings.json"
94+
short_json_path = Path(temp_dir) / "short_custom_frame_mappings.json"
95+
96+
generate_frame_mappings(long_video_path, long_json_path, stream_index)
97+
generate_frame_mappings(short_video_path, short_json_path, stream_index)
98+
with open(short_json_path) as f:
99+
sample_data = json.loads(f.read())
100+
print("Sample of fields in custom frame mappings:")
101+
for frame in sample_data["frames"][:3]:
102+
print(f"{frame['key_frame'] = }, {frame['pts'] = }, {frame['duration'] = }")
103+
104+
# %%
105+
# .. _custom_frame_mappings_perf_creation:
106+
#
107+
# Performance: ``VideoDecoder`` creation
108+
# --------------------------------------
109+
#
110+
# Custom frame mappings affect the **creation** of a :class:`~torchcodec.decoders.VideoDecoder`
111+
# object. As video length or resolution increases, the performance gain compared to exact mode increases.
112+
#
113+
114+
import torch
115+
116+
117+
# Here, we define a benchmarking function, with the option to rewind a file-like ``custom_frame_mappings`` object between runs.
118+
def bench(f, file_like=False, average_over=50, warmup=2, **f_kwargs):
    """Time repeated calls of ``f(**f_kwargs)`` and print the median and std.

    Args:
        f: Callable to benchmark.
        file_like: If True, ``f_kwargs["custom_frame_mappings"]`` is treated
            as a seekable stream and rewound to offset 0 before every call,
            so each call reads it from the beginning regardless of where the
            stream was positioned when ``bench`` was entered.
        average_over: Number of timed calls.
        warmup: Number of untimed calls made before timing starts.
        **f_kwargs: Keyword arguments forwarded to ``f`` on every call.

    Prints the median and standard deviation of the timings in milliseconds.
    """

    def rewind():
        # Only the custom_frame_mappings stream is consumed by a call.
        if file_like:
            f_kwargs["custom_frame_mappings"].seek(0)

    for _ in range(warmup):
        rewind()
        f(**f_kwargs)

    times = []
    for _ in range(average_over):
        rewind()  # kept outside the timed region
        start = perf_counter_ns()
        f(**f_kwargs)
        end = perf_counter_ns()
        times.append(end - start)

    # Leave the stream rewound on exit so callers can reuse it right away.
    rewind()

    times = torch.tensor(times) * 1e-6  # ns to ms
    std = times.std().item()
    med = times.median().item()
    print(f"{med = :.2f}ms +- {std:.2f}")
137+
138+
139+
for video_path, json_path in ((short_video_path, short_json_path), (long_video_path, long_json_path)):
140+
print(f"\nRunning benchmarks on {Path(video_path).name}")
141+
142+
print("Creating a VideoDecoder object with custom_frame_mappings:")
143+
with open(json_path, "r") as f:
144+
bench(VideoDecoder, file_like=True, source=video_path, stream_index=stream_index, custom_frame_mappings=f)
145+
146+
# Compare against exact seek_mode
147+
print("Creating a VideoDecoder object with seek_mode='exact':")
148+
bench(VideoDecoder, source=video_path, stream_index=stream_index, seek_mode="exact")
149+
150+
# %%
151+
# Performance: Frame decoding with custom frame mappings
152+
# ------------------------------------------------------
153+
#
154+
# Although using ``custom_frame_mappings`` only impacts the initialization speed of
155+
# :class:`~torchcodec.decoders.VideoDecoder`, decoding workflows
156+
# involve creating a :class:`~torchcodec.decoders.VideoDecoder` instance,
157+
# so they benefit from the faster instantiation as well.
158+
159+
160+
def decode_frames(video_path, seek_mode="exact", custom_frame_mappings=None):
    """Build a ``VideoDecoder`` for ``video_path`` and decode a short frame range.

    Args:
        video_path: Passed to ``VideoDecoder`` as ``source``.
        seek_mode: Forwarded to ``VideoDecoder``.
        custom_frame_mappings: Forwarded to ``VideoDecoder``.

    The frames returned by ``get_frames_in_range(start=0, stop=10)`` are
    discarded; the function exists only to be timed.
    """
    decoder_kwargs = {
        "source": video_path,
        "seek_mode": seek_mode,
        "custom_frame_mappings": custom_frame_mappings,
    }
    VideoDecoder(**decoder_kwargs).get_frames_in_range(start=0, stop=10)
167+
168+
169+
for video_path, json_path in ((short_video_path, short_json_path), (long_video_path, long_json_path)):
170+
print(f"\nRunning benchmarks on {Path(video_path).name}")
171+
print("Decoding frames with custom_frame_mappings:")
172+
with open(json_path, "r") as f:
173+
bench(decode_frames, file_like=True, video_path=video_path, custom_frame_mappings=f)
174+
175+
print("Decoding frames with seek_mode='exact':")
176+
bench(decode_frames, video_path=video_path, seek_mode="exact")
177+
178+
# %%
179+
# Accuracy: Metadata and frame retrieval
180+
# --------------------------------------
181+
#
182+
# In addition to the instantiation speed up compared to ``seek_mode="exact"``, using custom frame mappings
183+
# also retains the benefit of exact metadata and frame seeking.
184+
#
185+
186+
print("Metadata of short video with custom_frame_mappings:")
187+
with open(short_json_path, "r") as f:
188+
print(VideoDecoder(short_video_path, custom_frame_mappings=f).metadata)
189+
print("Metadata of short video with seek_mode='exact':")
190+
print(VideoDecoder(short_video_path, seek_mode="exact").metadata)
191+
192+
with open(short_json_path, "r") as f:
193+
custom_frame_mappings_decoder = VideoDecoder(short_video_path, custom_frame_mappings=f)
194+
exact_decoder = VideoDecoder(short_video_path, seek_mode="exact")
195+
for i in range(len(exact_decoder)):
196+
torch.testing.assert_close(
197+
exact_decoder.get_frame_at(i).data,
198+
custom_frame_mappings_decoder.get_frame_at(i).data,
199+
atol=0, rtol=0,
200+
)
201+
print("Frame seeking is the same for this video!")
202+
203+
# %%
204+
# How do custom_frame_mappings help?
205+
# ----------------------------------
206+
#
207+
# Custom frame mappings contain the same frame index information
208+
# that would normally be computed during the :term:`scan` operation in exact mode.
209+
# By providing this information to the :class:`~torchcodec.decoders.VideoDecoder`
210+
# as a JSON, it eliminates the need for the expensive scan while preserving the
211+
# accuracy benefits.
212+
#
213+
# Which mode should I use?
214+
# ------------------------
215+
#
216+
# - For fastest decoding when speed is more important than exact seeking accuracy,
217+
# "approximate" mode is recommended.
218+
#
219+
# - For exact frame seeking, custom frame mappings will benefit workflows where the
220+
# same videos are decoded repeatedly, and some preprocessing work can be done.
221+
#
222+
# - For exact frame seeking without preprocessing, use "exact" mode.
223+
224+
# %%

src/torchcodec/_core/SingleStreamDecoder.cpp

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -322,19 +322,35 @@ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() {
322322
void SingleStreamDecoder::readCustomFrameMappingsUpdateMetadataAndIndex(
323323
int streamIndex,
324324
FrameMappings customFrameMappings) {
325-
auto& all_frames = customFrameMappings.all_frames;
326-
auto& is_key_frame = customFrameMappings.is_key_frame;
327-
auto& duration = customFrameMappings.duration;
325+
TORCH_CHECK(
326+
customFrameMappings.all_frames.dtype() == torch::kLong &&
327+
customFrameMappings.is_key_frame.dtype() == torch::kBool &&
328+
customFrameMappings.duration.dtype() == torch::kLong,
329+
"all_frames and duration tensors must be int64 dtype, and is_key_frame tensor must be a bool dtype.");
330+
const torch::Tensor& all_frames =
331+
customFrameMappings.all_frames.to(torch::kLong);
332+
const torch::Tensor& is_key_frame =
333+
customFrameMappings.is_key_frame.to(torch::kBool);
334+
const torch::Tensor& duration = customFrameMappings.duration.to(torch::kLong);
328335
TORCH_CHECK(
329336
all_frames.size(0) == is_key_frame.size(0) &&
330337
is_key_frame.size(0) == duration.size(0),
331338
"all_frames, is_key_frame, and duration from custom_frame_mappings were not same size.");
332339

340+
// Allocate vectors using num frames to reduce reallocations
341+
int64_t numFrames = all_frames.size(0);
342+
streamInfos_[streamIndex].allFrames.reserve(numFrames);
343+
streamInfos_[streamIndex].keyFrames.reserve(numFrames);
344+
// Use accessor to efficiently access tensor elements
345+
auto pts_data = all_frames.accessor<int64_t, 1>();
346+
auto is_key_frame_data = is_key_frame.accessor<bool, 1>();
347+
auto duration_data = duration.accessor<int64_t, 1>();
348+
333349
auto& streamMetadata = containerMetadata_.allStreamMetadata[streamIndex];
334350

335-
streamMetadata.beginStreamPtsFromContent = all_frames[0].item<int64_t>();
351+
streamMetadata.beginStreamPtsFromContent = pts_data[0];
336352
streamMetadata.endStreamPtsFromContent =
337-
all_frames[-1].item<int64_t>() + duration[-1].item<int64_t>();
353+
pts_data[numFrames - 1] + duration_data[numFrames - 1];
338354

339355
auto avStream = formatContext_->streams[streamIndex];
340356
streamMetadata.beginStreamPtsSecondsFromContent = ptsToSeconds(
@@ -343,17 +359,16 @@ void SingleStreamDecoder::readCustomFrameMappingsUpdateMetadataAndIndex(
343359
streamMetadata.endStreamPtsSecondsFromContent = ptsToSeconds(
344360
*streamMetadata.endStreamPtsFromContent, avStream->time_base);
345361

346-
streamMetadata.numFramesFromContent = all_frames.size(0);
347-
for (int64_t i = 0; i < all_frames.size(0); ++i) {
362+
streamMetadata.numFramesFromContent = numFrames;
363+
for (int64_t i = 0; i < numFrames; ++i) {
348364
FrameInfo frameInfo;
349-
frameInfo.pts = all_frames[i].item<int64_t>();
350-
frameInfo.isKeyFrame = is_key_frame[i].item<bool>();
365+
frameInfo.pts = pts_data[i];
366+
frameInfo.isKeyFrame = is_key_frame_data[i];
351367
streamInfos_[streamIndex].allFrames.push_back(frameInfo);
352368
if (frameInfo.isKeyFrame) {
353369
streamInfos_[streamIndex].keyFrames.push_back(frameInfo);
354370
}
355371
}
356-
// Sort all frames by their pts
357372
sortAllFrames();
358373
}
359374

@@ -505,7 +520,7 @@ void SingleStreamDecoder::addVideoStream(
505520
customFrameMappings.has_value(),
506521
"Missing frame mappings when custom_frame_mappings seek mode is set.");
507522
readCustomFrameMappingsUpdateMetadataAndIndex(
508-
streamIndex, customFrameMappings.value());
523+
activeStreamIndex_, customFrameMappings.value());
509524
}
510525
}
511526

src/torchcodec/decoders/_video_decoder.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ class VideoDecoder:
8282
}
8383
8484
Alternative field names "pkt_pts" and "pkt_duration" are also supported.
85+
Read more about this parameter in:
86+
:ref:`sphx_glr_generated_examples_decoding_custom_frame_mappings.py`
8587
8688
Attributes:
8789
metadata (VideoStreamMetadata): Metadata of the video stream.
@@ -494,11 +496,15 @@ def _read_custom_frame_mappings(
494496
"Invalid custom frame mappings. The 'pts'/'pkt_pts', 'duration'/'pkt_duration', and 'key_frame' keys are required in the frame metadata."
495497
)
496498

497-
frame_data = [
498-
(float(frame[pts_key]), frame["key_frame"], float(frame[duration_key]))
499-
for frame in input_data["frames"]
500-
]
501-
all_frames, is_key_frame, duration = map(torch.tensor, zip(*frame_data))
499+
all_frames = torch.tensor(
500+
[int(frame[pts_key]) for frame in input_data["frames"]], dtype=torch.int64
501+
)
502+
is_key_frame = torch.tensor(
503+
[int(frame["key_frame"]) for frame in input_data["frames"]], dtype=torch.bool
504+
)
505+
duration = torch.tensor(
506+
[int(frame[duration_key]) for frame in input_data["frames"]], dtype=torch.int64
507+
)
502508
if not (len(all_frames) == len(is_key_frame) == len(duration)):
503509
raise ValueError("Mismatched lengths in frame index data")
504510
return all_frames, is_key_frame, duration

test/test_decoders.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1224,7 +1224,9 @@ def test_full_and_studio_range_bt709_video(self, asset):
12241224
gpu_frame = decoder_gpu.get_frame_at(frame_index).data.cpu()
12251225
cpu_frame = decoder_cpu.get_frame_at(frame_index).data
12261226

1227-
if cuda_version_used_for_building_torch() >= (12, 9):
1227+
if cuda_version_used_for_building_torch() >= (13, 0):
1228+
torch.testing.assert_close(gpu_frame, cpu_frame, rtol=0, atol=3)
1229+
elif cuda_version_used_for_building_torch() >= (12, 9):
12281230
torch.testing.assert_close(gpu_frame, cpu_frame, rtol=0, atol=2)
12291231
elif cuda_version_used_for_building_torch() == (12, 8):
12301232
assert psnr(gpu_frame, cpu_frame) > 20

0 commit comments

Comments
 (0)