|
| 1 | +# Copyright (c) Meta Platforms, Inc. and affiliates. |
| 2 | +# All rights reserved. |
| 3 | +# |
| 4 | +# This source code is licensed under the BSD-style license found in the |
| 5 | +# LICENSE file in the root directory of this source tree. |
| 6 | + |
| 7 | +""" |
| 8 | +==================================== |
| 9 | +Decoding with custom frame mappings |
| 10 | +==================================== |
| 11 | +
|
| 12 | +In this example, we will describe the ``custom_frame_mappings`` parameter of the |
| 13 | +:class:`~torchcodec.decoders.VideoDecoder` class. |
| 14 | +This parameter allows you to provide pre-computed frame mapping information to |
| 15 | +speed up :class:`~torchcodec.decoders.VideoDecoder` instantiation, while |
| 16 | +maintaining the frame seeking accuracy of ``seek_mode="exact"``. |
| 17 | +
|
| 18 | +This makes it ideal for workflows where: |
| 19 | +
|
| 20 | + 1. Frame accuracy is critical, so :doc:`approximate mode <approximate_mode>` cannot be used |
| 21 | + 2. Videos can be preprocessed once and then decoded many times |
| 22 | +""" |
| 23 | + |
| 24 | +# %% |
| 25 | +# First, some boilerplate: we'll download a short video from the web, and |
| 26 | +# use ffmpeg to create a longer version by repeating it multiple times. We'll end up |
| 27 | +# with two videos: a short one of approximately 14 seconds and a long one of about 12 minutes. |
| 28 | +# You can ignore this part and skip below to :ref:`frame_mappings_creation`. |
| 29 | + |
| 30 | +import tempfile |
| 31 | +from pathlib import Path |
| 32 | +import subprocess |
| 33 | +import requests |
| 34 | + |
| 35 | +# Video source: https://www.pexels.com/video/dog-eating-854132/ |
| 36 | +# License: CC0. Author: Coverr. |
url = "https://videos.pexels.com/video-files/854132/854132-sd_640_360_25fps.mp4"
response = requests.get(url, headers={"User-Agent": ""})
if response.status_code != 200:
    raise RuntimeError(f"Failed to download video. {response.status_code = }.")

temp_dir = tempfile.mkdtemp()
short_video_path = Path(temp_dir) / "short_video.mp4"
with open(short_video_path, 'wb') as f:
    # The request above is not streamed, so the full body is already in
    # memory. Write it in a single call: iterating ``iter_content()`` with its
    # default ``chunk_size=1`` would issue one write per byte.
    f.write(response.content)

# Build the long video by looping the short one; streams are copied as-is
# ("-c copy"), so no re-encoding takes place.
long_video_path = Path(temp_dir) / "long_video.mp4"
ffmpeg_command = [
    "ffmpeg",
    "-stream_loop", "50",  # repeat video 50 times to get a ~12 min video
    "-i", f"{short_video_path}",
    "-c", "copy",
    f"{long_video_path}",
]
# check=True raises CalledProcessError if ffmpeg fails; its output is captured
# rather than printed.
subprocess.run(ffmpeg_command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

from torchcodec.decoders import VideoDecoder
print(f"Short video duration: {VideoDecoder(short_video_path).metadata.duration_seconds} seconds")
print(f"Long video duration: {VideoDecoder(long_video_path).metadata.duration_seconds / 60} minutes")
| 61 | + |
| 62 | +# %% |
| 63 | +# .. _frame_mappings_creation: |
| 64 | +# |
| 65 | +# Creating custom frame mappings with ffprobe |
| 66 | +# ------------------------------------------- |
| 67 | +# |
| 68 | +# To generate JSON files containing the required video metadata, we recommend using ffprobe. |
| 69 | +# The following frame metadata fields are needed |
| 70 | +# (the ``pkt_`` prefix is needed for older versions of FFmpeg): |
| 71 | +# |
| 72 | +# - ``pts`` / ``pkt_pts``: Presentation timestamps for each frame |
| 73 | +# - ``duration`` / ``pkt_duration``: Duration of each frame |
| 74 | +# - ``key_frame``: Boolean indicating which frames are key frames |
| 75 | + |
| 76 | +from pathlib import Path |
| 77 | +import subprocess |
| 78 | +import tempfile |
| 79 | +from time import perf_counter_ns |
| 80 | +import json |
| 81 | + |
| 82 | + |
| 83 | +# Let's define a simple function that runs ffprobe on the given stream index of a video, then writes the results to output_json_path. |
def generate_frame_mappings(video_path, output_json_path, stream_index):
    """Dump per-frame ffprobe metadata for one stream into a JSON file.

    Runs ``ffprobe`` on ``video_path`` for the stream selected by
    ``stream_index``, requesting the ``pts``, ``duration`` and ``key_frame``
    entry of every frame in JSON format. The command line is printed before
    it runs, and ffprobe's standard output is written to ``output_json_path``.

    Raises:
        subprocess.CalledProcessError: if ffprobe exits with a non-zero
            status (the command is run with ``check=True``).
    """
    command = [
        "ffprobe",
        "-i", str(video_path),
        "-select_streams", str(stream_index),
        "-show_frames",
        "-show_entries", "frame=pts,duration,key_frame",
        "-of", "json",
    ]
    command_line = " ".join(command)
    print(f"Running ffprobe:\n{command_line}\n")

    completed = subprocess.run(command, check=True, capture_output=True, text=True)
    with open(output_json_path, "w") as json_file:
        json_file.write(completed.stdout)
| 90 | + |
| 91 | + |
stream_index = 0
long_json_path = Path(temp_dir) / "long_custom_frame_mappings.json"
short_json_path = Path(temp_dir) / "short_custom_frame_mappings.json"

# Write one frame-mappings JSON file per video (long video first).
for source_video, target_json in (
    (long_video_path, long_json_path),
    (short_video_path, short_json_path),
):
    generate_frame_mappings(source_video, target_json, stream_index)

# Load the short video's mappings back to show what the first entries look like.
with open(short_json_path) as f:
    sample_data = json.load(f)
print("Sample of fields in custom frame mappings:")
for frame in sample_data["frames"][:3]:
    print(f"{frame['key_frame'] = }, {frame['pts'] = }, {frame['duration'] = }")
| 103 | + |
| 104 | +# %% |
| 105 | +# .. _custom_frame_mappings_perf_creation: |
| 106 | +# |
| 107 | +# Performance: ``VideoDecoder`` creation |
| 108 | +# -------------------------------------- |
| 109 | +# |
| 110 | +# Custom frame mappings affect the **creation** of a :class:`~torchcodec.decoders.VideoDecoder` |
| 111 | +# object. As video length or resolution increases, the performance gain compared to exact mode increases. |
| 112 | +# |
| 113 | + |
| 114 | +import torch |
| 115 | + |
| 116 | + |
| 117 | +# Here, we define a benchmarking function, with the option to rewind a file-like ``custom_frame_mappings`` argument to its start after each call. |
def bench(f, file_like=False, average_over=50, warmup=2, **f_kwargs):
    """Time repeated calls of ``f(**f_kwargs)`` and print median and std in ms.

    Args:
        f: Callable to benchmark; it is always invoked as ``f(**f_kwargs)``.
        file_like: When true, ``f_kwargs["custom_frame_mappings"]`` is rewound
            with ``seek(0)`` after every call (warm-up and timed alike).
        average_over: Number of timed calls.
        warmup: Number of untimed calls made before timing starts.
        **f_kwargs: Keyword arguments forwarded to ``f``.

    Returns:
        None. The result is printed as ``med = <median>ms +- <std>``.
    """

    def rewind():
        # Reset the file-like object so the next call reads it from the start.
        if file_like:
            f_kwargs["custom_frame_mappings"].seek(0)

    # Warm-up calls are executed but not measured.
    for _ in range(warmup):
        f(**f_kwargs)
        rewind()

    elapsed_ns = []
    for _ in range(average_over):
        tic = perf_counter_ns()
        f(**f_kwargs)
        toc = perf_counter_ns()
        elapsed_ns.append(toc - tic)
        rewind()  # done after the clock is stopped, so it is not timed

    times_ms = torch.tensor(elapsed_ns) * 1e-6  # ns to ms
    med = times_ms.median().item()
    std = times_ms.std().item()
    print(f"{med = :.2f}ms +- {std:.2f}")
| 137 | + |
| 138 | + |
creation_benchmark_inputs = (
    (short_video_path, short_json_path),
    (long_video_path, long_json_path),
)
for video_path, json_path in creation_benchmark_inputs:
    print(f"\nRunning benchmarks on {Path(video_path).name}")

    # Instantiation with pre-computed frame mappings read from an open JSON file.
    print("Creating a VideoDecoder object with custom_frame_mappings:")
    with open(json_path, "r") as f:
        bench(
            VideoDecoder,
            file_like=True,
            source=video_path,
            stream_index=stream_index,
            custom_frame_mappings=f,
        )

    # Compare against exact seek_mode
    print("Creating a VideoDecoder object with seek_mode='exact':")
    bench(
        VideoDecoder,
        source=video_path,
        stream_index=stream_index,
        seek_mode="exact",
    )
| 149 | + |
| 150 | +# %% |
| 151 | +# Performance: Frame decoding with custom frame mappings |
| 152 | +# ------------------------------------------------------ |
| 153 | +# |
| 154 | +# Although using ``custom_frame_mappings`` only impacts the initialization speed of |
| 155 | +# :class:`~torchcodec.decoders.VideoDecoder`, decoding workflows |
| 156 | +# involve creating a :class:`~torchcodec.decoders.VideoDecoder` instance, |
| 157 | +# so they also benefit from the faster initialization. |
| 158 | + |
| 159 | + |
def decode_frames(video_path, seek_mode="exact", custom_frame_mappings=None):
    """Create a decoder for ``video_path`` and decode frames ``0`` to ``10``.

    A new ``VideoDecoder`` is built on every call from ``video_path``,
    ``seek_mode`` and ``custom_frame_mappings``, then
    ``get_frames_in_range(start=0, stop=10)`` is called on it. The decoded
    frames are discarded and nothing is returned.
    """
    decoder_kwargs = {
        "source": video_path,
        "seek_mode": seek_mode,
        "custom_frame_mappings": custom_frame_mappings,
    }
    VideoDecoder(**decoder_kwargs).get_frames_in_range(start=0, stop=10)
| 167 | + |
| 168 | + |
decoding_benchmark_inputs = (
    (short_video_path, short_json_path),
    (long_video_path, long_json_path),
)
for video_path, json_path in decoding_benchmark_inputs:
    print(f"\nRunning benchmarks on {Path(video_path).name}")

    # Decoder creation plus decoding, with mappings read from an open JSON file.
    print("Decoding frames with custom_frame_mappings:")
    with open(json_path, "r") as f:
        bench(
            decode_frames,
            file_like=True,
            video_path=video_path,
            custom_frame_mappings=f,
        )

    # Same workload, with the decoder created in exact seek mode.
    print("Decoding frames with seek_mode='exact':")
    bench(decode_frames, video_path=video_path, seek_mode="exact")
| 177 | + |
| 178 | +# %% |
| 179 | +# Accuracy: Metadata and frame retrieval |
| 180 | +# -------------------------------------- |
| 181 | +# |
| 182 | +# In addition to the instantiation speed up compared to ``seek_mode="exact"``, using custom frame mappings |
| 183 | +# also retains the benefit of exact metadata and frame seeking. |
| 184 | +# |
| 185 | + |
# Print the metadata reported by each kind of decoder for the short video.
print("Metadata of short video with custom_frame_mappings:")
with open(short_json_path, "r") as f:
    mapped_metadata = VideoDecoder(short_video_path, custom_frame_mappings=f).metadata
    print(mapped_metadata)
print("Metadata of short video with seek_mode='exact':")
exact_metadata = VideoDecoder(short_video_path, seek_mode="exact").metadata
print(exact_metadata)

# Build one decoder of each kind and check that every frame matches exactly.
with open(short_json_path, "r") as f:
    custom_frame_mappings_decoder = VideoDecoder(short_video_path, custom_frame_mappings=f)
exact_decoder = VideoDecoder(short_video_path, seek_mode="exact")
for i in range(len(exact_decoder)):
    exact_frame = exact_decoder.get_frame_at(i).data
    mapped_frame = custom_frame_mappings_decoder.get_frame_at(i).data
    # atol=0 and rtol=0: no tolerance is allowed in the comparison.
    torch.testing.assert_close(exact_frame, mapped_frame, atol=0, rtol=0)
print("Frame seeking is the same for this video!")
| 202 | + |
| 203 | +# %% |
| 204 | +# How do custom_frame_mappings help? |
| 205 | +# ---------------------------------- |
| 206 | +# |
| 207 | +# Custom frame mappings contain the same frame index information |
| 208 | +# that would normally be computed during the :term:`scan` operation in exact mode. |
| 209 | +# Providing this information to the :class:`~torchcodec.decoders.VideoDecoder` |
| 210 | +# as JSON eliminates the need for the expensive scan while preserving the |
| 211 | +# accuracy benefits. |
| 212 | +# |
| 213 | +# Which mode should I use? |
| 214 | +# ------------------------ |
| 215 | +# |
| 216 | +# - For fastest decoding when speed is more important than exact seeking accuracy, |
| 217 | +# "approximate" mode is recommended. |
| 218 | +# |
| 219 | +# - For exact frame seeking, custom frame mappings will benefit workflows where the |
| 220 | +# same videos are decoded repeatedly, and some preprocessing work can be done. |
| 221 | +# |
| 222 | +# - For exact frame seeking without preprocessing, use "exact" mode. |
| 223 | + |
| 224 | +# %% |
0 commit comments