diff --git a/.github/scripts/setup-env.sh b/.github/scripts/setup-env.sh
index e1c5855f31c..64b4aa7fa90 100755
--- a/.github/scripts/setup-env.sh
+++ b/.github/scripts/setup-env.sh
@@ -23,15 +23,13 @@ case $(uname) in
 esac
 
 echo '::group::Create build environment'
-# See https://github.com/pytorch/vision/issues/7296 for ffmpeg
 conda create \
   --name ci \
   --quiet --yes \
   python="${PYTHON_VERSION}" pip \
   ninja cmake \
   libpng \
-  libwebp \
-  'ffmpeg<4.3'
+  libwebp
 conda activate ci
 conda install --quiet --yes libjpeg-turbo -c pytorch
 pip install --progress-bar=off --upgrade setuptools==72.1.0
diff --git a/.github/scripts/unittest.sh b/.github/scripts/unittest.sh
index 43968762a8b..e5ec6eedacd 100755
--- a/.github/scripts/unittest.sh
+++ b/.github/scripts/unittest.sh
@@ -14,5 +14,4 @@ echo '::endgroup::'
 
 python test/smoke_test.py
 
-# We explicitly ignore the video tests until we resolve https://github.com/pytorch/vision/issues/8162
-pytest --ignore-glob="*test_video*" --ignore-glob="*test_onnx*" --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 -k "not TestFxFeatureExtraction"
+pytest --ignore-glob="*test_onnx*" --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 -k "not TestFxFeatureExtraction"
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 8b341622181..1b246cc01a6 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -34,12 +34,6 @@ jobs:
           CONDA_PATH=$(which conda)
           eval "$(${CONDA_PATH} shell.bash hook)"
           conda activate ci
 
-          # FIXME: not sure why we need this. `ldd torchvision/video_reader.so` shows that it
-          # already links against the one pulled from conda. However, at runtime it pulls from
-          # /lib64
-          # Should we maybe always do this in `./.github/scripts/setup-env.sh` so that we don't
-          # have to pay attention in all other workflows?
-          export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}"
 
           cd docs
diff --git a/docs/source/io.rst b/docs/source/io.rst
index 478321a4e6d..72a6edd965b 100644
--- a/docs/source/io.rst
+++ b/docs/source/io.rst
@@ -1,10 +1,10 @@
-Decoding / Encoding images and videos
-=====================================
+Decoding / Encoding images
+==========================
 
 .. currentmodule:: torchvision.io
 
 The :mod:`torchvision.io` module provides utilities for decoding and encoding
-images and videos.
+images.
 
 Image Decoding
 --------------
@@ -92,7 +92,7 @@ Video - DEPRECATED
 .. warning::
 
     DEPRECATED: All the video decoding and encoding capabilities of torchvision
-    are deprecated from version 0.22 and will be removed in version 0.24. We
+    are deprecated from version 0.22 and will be removed in version 0.25. We
     recommend that you migrate to
     `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll
     consolidate the future decoding/encoding capabilities of PyTorch
@@ -101,19 +101,4 @@ Video - DEPRECATED
     :toctree: generated/
     :template: function.rst
 
-    read_video
-    read_video_timestamps
     write_video
-
-
-**Fine-grained video API**
-
-In addition to the :mod:`read_video` function, we provide a high-performance
-lower-level API for more fine-grained control compared to the :mod:`read_video` function.
-It does all this whilst fully supporting torchscript.
-
-.. autosummary::
-    :toctree: generated/
-    :template: class.rst
-
-    VideoReader
diff --git a/mypy.ini b/mypy.ini
index e25212a169d..a68e48f27ef 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -41,10 +41,6 @@ ignore_errors = True
 
 ignore_errors = True
 
-[mypy-torchvision.io.video_reader]
-
-ignore_errors = True
-
 [mypy-torchvision.models.*]
 
 ignore_errors=True
diff --git a/packaging/pre_build_script.sh b/packaging/pre_build_script.sh
index fcacf4bf8a4..b9046aa81c5 100644
--- a/packaging/pre_build_script.sh
+++ b/packaging/pre_build_script.sh
@@ -17,7 +17,6 @@ if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then
    # Installing webp also installs a non-turbo jpeg, so we uninstall jpeg stuff
    # before re-installing them
    conda uninstall libjpeg-turbo libjpeg -y
-    conda install -y ffmpeg=4.2 -c pytorch
    conda install -y libjpeg-turbo -c pytorch
 
    # Copy binaries to be included in the wheel distribution
@@ -30,7 +29,7 @@ else
 
    if [[ "$ARCH" == "aarch64" ]]; then
        conda install libpng -y
-        conda install -y ffmpeg=4.2 libjpeg-turbo -c pytorch-nightly
+        conda install -y libjpeg-turbo -c pytorch-nightly
    fi
 
    conda install libwebp -y
diff --git a/packaging/wheel/relocate.py b/packaging/wheel/relocate.py
index 4587f3798da..3b9e00c3b59 100644
--- a/packaging/wheel/relocate.py
+++ b/packaging/wheel/relocate.py
@@ -316,8 +316,9 @@ def patch_linux():
     output_dir = osp.join(PACKAGE_ROOT, "dist", ".wheel-process")
 
     image_binary = "image.so"
-    video_binary = "video_reader.so"
-    torchvision_binaries = [image_binary, video_binary]
+    torchvision_binaries = [
+        image_binary,
+    ]
     for wheel in wheels:
         if osp.exists(output_dir):
             shutil.rmtree(output_dir)
@@ -352,8 +353,9 @@ def patch_win():
     output_dir = osp.join(PACKAGE_ROOT, "dist", ".wheel-process")
 
     image_binary = "image.pyd"
-    video_binary = "video_reader.pyd"
-    torchvision_binaries = [image_binary, video_binary]
+    torchvision_binaries = [
+        image_binary,
+    ]
     for wheel in wheels:
         if osp.exists(output_dir):
             shutil.rmtree(output_dir)
diff --git a/setup.py b/setup.py
index 5e69fa50f52..5e1cb9dfba6 100644
--- a/setup.py
+++ b/setup.py
@@ -22,14 +22,6 @@
 USE_WEBP = os.getenv("TORCHVISION_USE_WEBP", "1") == "1"
 USE_NVJPEG = os.getenv("TORCHVISION_USE_NVJPEG", "1") == "1"
 NVCC_FLAGS = os.getenv("NVCC_FLAGS", None)
-# Note: the GPU video decoding stuff used to be called "video codec", which
-# isn't an accurate or descriptive name considering there are at least 2 other
-# video decoding backends in torchvision. I'm renaming this to "gpu video
-# decoder" where possible, keeping user facing names (like the env var below) to
-# the old scheme for BC.
-USE_GPU_VIDEO_DECODER = os.getenv("TORCHVISION_USE_VIDEO_CODEC", "1") == "1"
-# Same here: "use ffmpeg" was used to denote "use cpu video decoder".
-USE_CPU_VIDEO_DECODER = os.getenv("TORCHVISION_USE_FFMPEG", "1") == "1"
 TORCHVISION_INCLUDE = os.environ.get("TORCHVISION_INCLUDE", "")
 TORCHVISION_LIBRARY = os.environ.get("TORCHVISION_LIBRARY", "")
 
@@ -52,8 +44,6 @@
 print(f"{USE_WEBP = }")
 print(f"{USE_NVJPEG = }")
 print(f"{NVCC_FLAGS = }")
-print(f"{USE_CPU_VIDEO_DECODER = }")
-print(f"{USE_GPU_VIDEO_DECODER = }")
 print(f"{TORCHVISION_INCLUDE = }")
 print(f"{TORCHVISION_LIBRARY = }")
 print(f"{IS_ROCM = }")
@@ -371,158 +361,6 @@ def make_image_extension():
     )
 
 
-def make_video_decoders_extensions():
-    print("Building video decoder extensions")
-
-    build_without_extensions_msg = "Building without video decoders extensions."
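
# The build logic deleted here produced the CPU/GPU video decoders that backed
# torchvision's read_video() and VideoReader. The docs change above points users
# at TorchCodec instead; a minimal migration sketch, assuming TorchCodec's
# documented VideoDecoder API ("video.mp4" is a placeholder path, and nothing
# in this snippet is part of the patch itself):
from torchcodec.decoders import VideoDecoder

decoder = VideoDecoder("video.mp4")         # accepts a path, raw bytes, or a tensor
frame = decoder[0]                          # uint8 tensor of shape (C, H, W)
fps = decoder.metadata.average_fps          # stands in for info["video_fps"]
clip = decoder.get_frames_played_in_range(  # stands in for read_video(start, end)
    start_seconds=1.0,
    stop_seconds=2.0,
).data
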
- if sys.platform != "linux" or (sys.version_info.major == 3 and sys.version_info.minor == 9): - # FIXME: Building torchvision with ffmpeg on MacOS or with Python 3.9 - # FIXME: causes crash. See the following GitHub issues for more details. - # FIXME: https://github.com/pytorch/pytorch/issues/65000 - # FIXME: https://github.com/pytorch/vision/issues/3367 - print("Can only build video decoder extensions on linux and Python != 3.9") - return [] - - ffmpeg_exe = shutil.which("ffmpeg") - if ffmpeg_exe is None: - print(f"{build_without_extensions_msg} Couldn't find ffmpeg binary.") - return [] - - def find_ffmpeg_libraries(): - ffmpeg_libraries = {"libavcodec", "libavformat", "libavutil", "libswresample", "libswscale"} - - ffmpeg_bin = os.path.dirname(ffmpeg_exe) - ffmpeg_root = os.path.dirname(ffmpeg_bin) - ffmpeg_include_dir = os.path.join(ffmpeg_root, "include") - ffmpeg_library_dir = os.path.join(ffmpeg_root, "lib") - - gcc = os.environ.get("CC", shutil.which("gcc")) - platform_tag = subprocess.run([gcc, "-print-multiarch"], stdout=subprocess.PIPE) - platform_tag = platform_tag.stdout.strip().decode("utf-8") - - if platform_tag: - # Most probably a Debian-based distribution - ffmpeg_include_dir = [ffmpeg_include_dir, os.path.join(ffmpeg_include_dir, platform_tag)] - ffmpeg_library_dir = [ffmpeg_library_dir, os.path.join(ffmpeg_library_dir, platform_tag)] - else: - ffmpeg_include_dir = [ffmpeg_include_dir] - ffmpeg_library_dir = [ffmpeg_library_dir] - - for library in ffmpeg_libraries: - library_found = False - for search_path in ffmpeg_include_dir + TORCHVISION_INCLUDE: - full_path = os.path.join(search_path, library, "*.h") - library_found |= len(glob.glob(full_path)) > 0 - - if not library_found: - print(f"{build_without_extensions_msg}") - print(f"{library} header files were not found.") - return None, None - - return ffmpeg_include_dir, ffmpeg_library_dir - - ffmpeg_include_dir, ffmpeg_library_dir = find_ffmpeg_libraries() - if ffmpeg_include_dir is None or ffmpeg_library_dir is None: - return [] - - print("Found ffmpeg:") - print(f" ffmpeg include path: {ffmpeg_include_dir}") - print(f" ffmpeg library_dir: {ffmpeg_library_dir}") - - extensions = [] - if USE_CPU_VIDEO_DECODER: - print("Building with CPU video decoder support") - - # TorchVision base decoder + video reader - video_reader_src_dir = os.path.join(ROOT_DIR, "torchvision", "csrc", "io", "video_reader") - video_reader_src = glob.glob(os.path.join(video_reader_src_dir, "*.cpp")) - base_decoder_src_dir = os.path.join(ROOT_DIR, "torchvision", "csrc", "io", "decoder") - base_decoder_src = glob.glob(os.path.join(base_decoder_src_dir, "*.cpp")) - # Torchvision video API - videoapi_src_dir = os.path.join(ROOT_DIR, "torchvision", "csrc", "io", "video") - videoapi_src = glob.glob(os.path.join(videoapi_src_dir, "*.cpp")) - # exclude tests - base_decoder_src = [x for x in base_decoder_src if "_test.cpp" not in x] - - combined_src = video_reader_src + base_decoder_src + videoapi_src - - extensions.append( - CppExtension( - # This is an awful name. It should be "cpu_video_decoder". Keeping for BC. 
- "torchvision.video_reader", - combined_src, - include_dirs=[ - base_decoder_src_dir, - video_reader_src_dir, - videoapi_src_dir, - str(CSRS_DIR), - *ffmpeg_include_dir, - *TORCHVISION_INCLUDE, - ], - library_dirs=ffmpeg_library_dir + TORCHVISION_LIBRARY, - libraries=[ - "avcodec", - "avformat", - "avutil", - "swresample", - "swscale", - ], - extra_compile_args=["-std=c++17"] if os.name != "nt" else ["/std:c++17", "/MP"], - extra_link_args=["-std=c++17" if os.name != "nt" else "/std:c++17"], - ) - ) - - if USE_GPU_VIDEO_DECODER: - # Locating GPU video decoder headers and libraries - # CUDA_HOME should be set to the cuda root directory. - # TORCHVISION_INCLUDE and TORCHVISION_LIBRARY should include the locations - # to the headers and libraries below - if not ( - BUILD_CUDA_SOURCES - and CUDA_HOME is not None - and any([os.path.exists(os.path.join(folder, "cuviddec.h")) for folder in TORCHVISION_INCLUDE]) - and any([os.path.exists(os.path.join(folder, "nvcuvid.h")) for folder in TORCHVISION_INCLUDE]) - and any([os.path.exists(os.path.join(folder, "libnvcuvid.so")) for folder in TORCHVISION_LIBRARY]) - and any([os.path.exists(os.path.join(folder, "libavcodec", "bsf.h")) for folder in ffmpeg_include_dir]) - ): - print("Could not find necessary dependencies. Refer the setup.py to check which ones are needed.") - print("Building without GPU video decoder support") - return extensions - print("Building torchvision with GPU video decoder support") - - gpu_decoder_path = os.path.join(CSRS_DIR, "io", "decoder", "gpu") - gpu_decoder_src = glob.glob(os.path.join(gpu_decoder_path, "*.cpp")) - cuda_libs = os.path.join(CUDA_HOME, "lib64") - cuda_inc = os.path.join(CUDA_HOME, "include") - - _, extra_compile_args = get_macros_and_flags() - extensions.append( - CUDAExtension( - "torchvision.gpu_decoder", - gpu_decoder_src, - include_dirs=[CSRS_DIR] + TORCHVISION_INCLUDE + [gpu_decoder_path] + [cuda_inc] + ffmpeg_include_dir, - library_dirs=ffmpeg_library_dir + TORCHVISION_LIBRARY + [cuda_libs], - libraries=[ - "avcodec", - "avformat", - "avutil", - "swresample", - "swscale", - "nvcuvid", - "cuda", - "cudart", - "z", - "pthread", - "dl", - "nppicc", - ], - extra_compile_args=extra_compile_args, - ) - ) - - return extensions - - class clean(distutils.command.clean.clean): def run(self): with open(".gitignore") as f: @@ -550,7 +388,6 @@ def run(self): extensions = [ make_C_extension(), make_image_extension(), - *make_video_decoders_extensions(), ] setup( diff --git a/test/test_datasets_video_utils_opt.py b/test/test_datasets_video_utils_opt.py deleted file mode 100644 index 5e6b19bfb95..00000000000 --- a/test/test_datasets_video_utils_opt.py +++ /dev/null @@ -1,12 +0,0 @@ -import unittest - -import test_datasets_video_utils -from torchvision import set_video_backend # noqa: 401 - -# Disabling the video backend switching temporarily -# set_video_backend('video_reader') - - -if __name__ == "__main__": - suite = unittest.TestLoader().loadTestsFromModule(test_datasets_video_utils) - unittest.TextTestRunner(verbosity=1).run(suite) diff --git a/test/test_io.py b/test/test_io.py deleted file mode 100644 index d2950ac9595..00000000000 --- a/test/test_io.py +++ /dev/null @@ -1,292 +0,0 @@ -import contextlib -import os -import sys -import tempfile - -import pytest -import torch -import torchvision.io as io -from common_utils import assert_equal, cpu_and_cuda -from torchvision import get_video_backend - - -try: - import av - - # Do a version test too - io.video._check_av_available() -except ImportError: - av = None - - 
-VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos") - - -def _create_video_frames(num_frames, height, width): - y, x = torch.meshgrid(torch.linspace(-2, 2, height), torch.linspace(-2, 2, width), indexing="ij") - data = [] - for i in range(num_frames): - xc = float(i) / num_frames - yc = 1 - float(i) / (2 * num_frames) - d = torch.exp(-((x - xc) ** 2 + (y - yc) ** 2) / 2) * 255 - data.append(d.unsqueeze(2).repeat(1, 1, 3).byte()) - - return torch.stack(data, 0) - - -@contextlib.contextmanager -def temp_video(num_frames, height, width, fps, lossless=False, video_codec=None, options=None): - if lossless: - if video_codec is not None: - raise ValueError("video_codec can't be specified together with lossless") - if options is not None: - raise ValueError("options can't be specified together with lossless") - video_codec = "libx264rgb" - options = {"crf": "0"} - - if video_codec is None: - if get_video_backend() == "pyav": - video_codec = "libx264" - else: - # when video_codec is not set, we assume it is libx264rgb which accepts - # RGB pixel formats as input instead of YUV - video_codec = "libx264rgb" - if options is None: - options = {} - - data = _create_video_frames(num_frames, height, width) - with tempfile.NamedTemporaryFile(suffix=".mp4") as f: - f.close() - io.write_video(f.name, data, fps=fps, video_codec=video_codec, options=options) - yield f.name, data - os.unlink(f.name) - - -@pytest.mark.skipif( - get_video_backend() != "pyav" and not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend not available" -) -@pytest.mark.skipif(av is None, reason="PyAV unavailable") -class TestVideo: - # compression adds artifacts, thus we add a tolerance of - # 6 in 0-255 range - TOLERANCE = 6 - - def test_write_read_video(self): - with temp_video(10, 300, 300, 5, lossless=True) as (f_name, data): - lv, _, info = io.read_video(f_name) - assert_equal(data, lv) - assert info["video_fps"] == 5 - - @pytest.mark.skipif(not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend is not chosen") - def test_probe_video_from_file(self): - with temp_video(10, 300, 300, 5) as (f_name, data): - video_info = io._probe_video_from_file(f_name) - assert pytest.approx(2, rel=0.0, abs=0.1) == video_info.video_duration - assert pytest.approx(5, rel=0.0, abs=0.1) == video_info.video_fps - - @pytest.mark.skipif(not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend is not chosen") - def test_probe_video_from_memory(self): - with temp_video(10, 300, 300, 5) as (f_name, data): - with open(f_name, "rb") as fp: - filebuffer = fp.read() - video_info = io._probe_video_from_memory(filebuffer) - assert pytest.approx(2, rel=0.0, abs=0.1) == video_info.video_duration - assert pytest.approx(5, rel=0.0, abs=0.1) == video_info.video_fps - - def test_read_timestamps(self): - with temp_video(10, 300, 300, 5) as (f_name, data): - pts, _ = io.read_video_timestamps(f_name) - # note: not all formats/codecs provide accurate information for computing the - # timestamps. 
For the format that we use here, this information is available, - # so we use it as a baseline - with av.open(f_name) as container: - stream = container.streams[0] - pts_step = int(round(float(1 / (stream.average_rate * stream.time_base)))) - num_frames = int(round(float(stream.average_rate * stream.time_base * stream.duration))) - expected_pts = [i * pts_step for i in range(num_frames)] - - assert pts == expected_pts - - @pytest.mark.parametrize("start", range(5)) - @pytest.mark.parametrize("offset", range(1, 4)) - def test_read_partial_video(self, start, offset): - with temp_video(10, 300, 300, 5, lossless=True) as (f_name, data): - pts, _ = io.read_video_timestamps(f_name) - - lv, _, _ = io.read_video(f_name, pts[start], pts[start + offset - 1]) - s_data = data[start : (start + offset)] - assert len(lv) == offset - assert_equal(s_data, lv) - - if get_video_backend() == "pyav": - # for "video_reader" backend, we don't decode the closest early frame - # when the given start pts is not matching any frame pts - lv, _, _ = io.read_video(f_name, pts[4] + 1, pts[7]) - assert len(lv) == 4 - assert_equal(data[4:8], lv) - - @pytest.mark.parametrize("start", range(0, 80, 20)) - @pytest.mark.parametrize("offset", range(1, 4)) - def test_read_partial_video_bframes(self, start, offset): - # do not use lossless encoding, to test the presence of B-frames - options = {"bframes": "16", "keyint": "10", "min-keyint": "4"} - with temp_video(100, 300, 300, 5, options=options) as (f_name, data): - pts, _ = io.read_video_timestamps(f_name) - - lv, _, _ = io.read_video(f_name, pts[start], pts[start + offset - 1]) - s_data = data[start : (start + offset)] - assert len(lv) == offset - assert_equal(s_data, lv, rtol=0.0, atol=self.TOLERANCE) - - lv, _, _ = io.read_video(f_name, pts[4] + 1, pts[7]) - # TODO fix this - if get_video_backend() == "pyav": - assert len(lv) == 4 - assert_equal(data[4:8], lv, rtol=0.0, atol=self.TOLERANCE) - else: - assert len(lv) == 3 - assert_equal(data[5:8], lv, rtol=0.0, atol=self.TOLERANCE) - - def test_read_packed_b_frames_divx_file(self): - name = "hmdb51_Turnk_r_Pippi_Michel_cartwheel_f_cm_np2_le_med_6.avi" - f_name = os.path.join(VIDEO_DIR, name) - pts, fps = io.read_video_timestamps(f_name) - - assert pts == sorted(pts) - assert fps == 30 - - def test_read_timestamps_from_packet(self): - with temp_video(10, 300, 300, 5, video_codec="mpeg4") as (f_name, data): - pts, _ = io.read_video_timestamps(f_name) - # note: not all formats/codecs provide accurate information for computing the - # timestamps. 
For the format that we use here, this information is available, - # so we use it as a baseline - with av.open(f_name) as container: - stream = container.streams[0] - # make sure we went through the optimized codepath - assert b"Lavc" in stream.codec_context.extradata - pts_step = int(round(float(1 / (stream.average_rate * stream.time_base)))) - num_frames = int(round(float(stream.average_rate * stream.time_base * stream.duration))) - expected_pts = [i * pts_step for i in range(num_frames)] - - assert pts == expected_pts - - def test_read_video_pts_unit_sec(self): - with temp_video(10, 300, 300, 5, lossless=True) as (f_name, data): - lv, _, info = io.read_video(f_name, pts_unit="sec") - - assert_equal(data, lv) - assert info["video_fps"] == 5 - assert info == {"video_fps": 5} - - def test_read_timestamps_pts_unit_sec(self): - with temp_video(10, 300, 300, 5) as (f_name, data): - pts, _ = io.read_video_timestamps(f_name, pts_unit="sec") - - with av.open(f_name) as container: - stream = container.streams[0] - pts_step = int(round(float(1 / (stream.average_rate * stream.time_base)))) - num_frames = int(round(float(stream.average_rate * stream.time_base * stream.duration))) - expected_pts = [i * pts_step * stream.time_base for i in range(num_frames)] - - assert pts == expected_pts - - @pytest.mark.parametrize("start", range(5)) - @pytest.mark.parametrize("offset", range(1, 4)) - def test_read_partial_video_pts_unit_sec(self, start, offset): - with temp_video(10, 300, 300, 5, lossless=True) as (f_name, data): - pts, _ = io.read_video_timestamps(f_name, pts_unit="sec") - - lv, _, _ = io.read_video(f_name, pts[start], pts[start + offset - 1], pts_unit="sec") - s_data = data[start : (start + offset)] - assert len(lv) == offset - assert_equal(s_data, lv) - - with av.open(f_name) as container: - stream = container.streams[0] - lv, _, _ = io.read_video( - f_name, int(pts[4] * (1.0 / stream.time_base) + 1) * stream.time_base, pts[7], pts_unit="sec" - ) - if get_video_backend() == "pyav": - # for "video_reader" backend, we don't decode the closest early frame - # when the given start pts is not matching any frame pts - assert len(lv) == 4 - assert_equal(data[4:8], lv) - - def test_read_video_corrupted_file(self): - with tempfile.NamedTemporaryFile(suffix=".mp4") as f: - f.write(b"This is not an mpg4 file") - video, audio, info = io.read_video(f.name) - assert isinstance(video, torch.Tensor) - assert isinstance(audio, torch.Tensor) - assert video.numel() == 0 - assert audio.numel() == 0 - assert info == {} - - def test_read_video_timestamps_corrupted_file(self): - with tempfile.NamedTemporaryFile(suffix=".mp4") as f: - f.write(b"This is not an mpg4 file") - video_pts, video_fps = io.read_video_timestamps(f.name) - assert video_pts == [] - assert video_fps is None - - @pytest.mark.skip(reason="Temporarily disabled due to new pyav") - def test_read_video_partially_corrupted_file(self): - with temp_video(5, 4, 4, 5, lossless=True) as (f_name, data): - with open(f_name, "r+b") as f: - size = os.path.getsize(f_name) - bytes_to_overwrite = size // 10 - # seek to the middle of the file - f.seek(5 * bytes_to_overwrite) - # corrupt 10% of the file from the middle - f.write(b"\xff" * bytes_to_overwrite) - # this exercises the container.decode assertion check - video, audio, info = io.read_video(f.name, pts_unit="sec") - # check that size is not equal to 5, but 3 - # TODO fix this - if get_video_backend() == "pyav": - assert len(video) == 3 - else: - assert len(video) == 4 - # but the valid decoded content is still 
correct - assert_equal(video[:3], data[:3]) - # and the last few frames are wrong - with pytest.raises(AssertionError): - assert_equal(video, data) - - @pytest.mark.skipif(sys.platform == "win32", reason="temporarily disabled on Windows") - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_write_video_with_audio(self, device, tmpdir): - f_name = os.path.join(VIDEO_DIR, "R6llTwEh07w.mp4") - video_tensor, audio_tensor, info = io.read_video(f_name, pts_unit="sec") - - out_f_name = os.path.join(tmpdir, "testing.mp4") - io.video.write_video( - out_f_name, - video_tensor.to(device), - round(info["video_fps"]), - video_codec="libx264rgb", - options={"crf": "0"}, - audio_array=audio_tensor.to(device), - audio_fps=info["audio_fps"], - audio_codec="aac", - ) - - out_video_tensor, out_audio_tensor, out_info = io.read_video(out_f_name, pts_unit="sec") - - assert info["video_fps"] == out_info["video_fps"] - assert_equal(video_tensor, out_video_tensor) - - audio_stream = av.open(f_name).streams.audio[0] - out_audio_stream = av.open(out_f_name).streams.audio[0] - - assert info["audio_fps"] == out_info["audio_fps"] - assert audio_stream.rate == out_audio_stream.rate - assert pytest.approx(out_audio_stream.frames, rel=0.0, abs=1) == audio_stream.frames - assert audio_stream.frame_size == out_audio_stream.frame_size - - # TODO add tests for audio - - -if __name__ == "__main__": - pytest.main(__file__) diff --git a/test/test_io_opt.py b/test/test_io_opt.py deleted file mode 100644 index f4e3d305295..00000000000 --- a/test/test_io_opt.py +++ /dev/null @@ -1,13 +0,0 @@ -import unittest - -import test_io -from torchvision import set_video_backend # noqa: 401 - - -# Disabling the video backend switching temporarily -# set_video_backend('video_reader') - - -if __name__ == "__main__": - suite = unittest.TestLoader().loadTestsFromModule(test_io) - unittest.TextTestRunner(verbosity=1).run(suite) diff --git a/test/test_video_gpu_decoder.py b/test/test_video_gpu_decoder.py deleted file mode 100644 index aa6d0aee9e0..00000000000 --- a/test/test_video_gpu_decoder.py +++ /dev/null @@ -1,97 +0,0 @@ -import math -import os - -import pytest -import torch -import torchvision -from torchvision.io import _HAS_GPU_VIDEO_DECODER, VideoReader - -try: - import av -except ImportError: - av = None - -VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos") - - -@pytest.mark.skipif(_HAS_GPU_VIDEO_DECODER is False, reason="Didn't compile with support for gpu decoder") -class TestVideoGPUDecoder: - @pytest.mark.skipif(av is None, reason="PyAV unavailable") - @pytest.mark.parametrize( - "video_file", - [ - "RATRACE_wave_f_nm_np1_fr_goo_37.avi", - "TrumanShow_wave_f_nm_np1_fr_med_26.avi", - "v_SoccerJuggling_g23_c01.avi", - "v_SoccerJuggling_g24_c01.avi", - "R6llTwEh07w.mp4", - "SOX5yA1l24A.mp4", - "WUzgd7C1pWA.mp4", - ], - ) - def test_frame_reading(self, video_file): - torchvision.set_video_backend("cuda") - full_path = os.path.join(VIDEO_DIR, video_file) - decoder = VideoReader(full_path) - with av.open(full_path) as container: - for av_frame in container.decode(container.streams.video[0]): - av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray()) - vision_frames = next(decoder)["data"] - mean_delta = torch.mean(torch.abs(av_frames.float() - vision_frames.cpu().float())) - assert mean_delta < 0.75 - - @pytest.mark.skipif(av is None, reason="PyAV unavailable") - @pytest.mark.parametrize("keyframes", [True, False]) - @pytest.mark.parametrize( - "full_path, duration", - 
[ - (os.path.join(VIDEO_DIR, x), y) - for x, y in [ - ("v_SoccerJuggling_g23_c01.avi", 8.0), - ("v_SoccerJuggling_g24_c01.avi", 8.0), - ("R6llTwEh07w.mp4", 10.0), - ("SOX5yA1l24A.mp4", 11.0), - ("WUzgd7C1pWA.mp4", 11.0), - ] - ], - ) - def test_seek_reading(self, keyframes, full_path, duration): - torchvision.set_video_backend("cuda") - decoder = VideoReader(full_path) - time = duration / 2 - decoder.seek(time, keyframes_only=keyframes) - with av.open(full_path) as container: - container.seek(int(time * 1000000), any_frame=not keyframes, backward=False) - for av_frame in container.decode(container.streams.video[0]): - av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray()) - vision_frames = next(decoder)["data"] - mean_delta = torch.mean(torch.abs(av_frames.float() - vision_frames.cpu().float())) - assert mean_delta < 0.75 - - @pytest.mark.skipif(av is None, reason="PyAV unavailable") - @pytest.mark.parametrize( - "video_file", - [ - "RATRACE_wave_f_nm_np1_fr_goo_37.avi", - "TrumanShow_wave_f_nm_np1_fr_med_26.avi", - "v_SoccerJuggling_g23_c01.avi", - "v_SoccerJuggling_g24_c01.avi", - "R6llTwEh07w.mp4", - "SOX5yA1l24A.mp4", - "WUzgd7C1pWA.mp4", - ], - ) - def test_metadata(self, video_file): - torchvision.set_video_backend("cuda") - full_path = os.path.join(VIDEO_DIR, video_file) - decoder = VideoReader(full_path) - video_metadata = decoder.get_metadata()["video"] - with av.open(full_path) as container: - video = container.streams.video[0] - av_duration = float(video.duration * video.time_base) - assert math.isclose(video_metadata["duration"], av_duration, rel_tol=1e-2) - assert math.isclose(video_metadata["fps"], video.base_rate, rel_tol=1e-2) - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/test/test_video_reader.py b/test/test_video_reader.py deleted file mode 100644 index 10995424982..00000000000 --- a/test/test_video_reader.py +++ /dev/null @@ -1,1254 +0,0 @@ -import collections -import math -import os -from fractions import Fraction - -import numpy as np -import pytest -import torch -import torchvision.io as io -from common_utils import assert_equal -from numpy.random import randint -from pytest import approx -from torchvision import set_video_backend -from torchvision.io import _HAS_CPU_VIDEO_DECODER - - -try: - import av - - # Do a version test too - io.video._check_av_available() -except ImportError: - av = None - - -VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos") - -CheckerConfig = [ - "duration", - "video_fps", - "audio_sample_rate", - # We find for some videos (e.g. HMDB51 videos), the decoded audio frames and pts are - # slightly different between TorchVision decoder and PyAv decoder. 
So omit it during check - "check_aframes", - "check_aframe_pts", -] -GroundTruth = collections.namedtuple("GroundTruth", " ".join(CheckerConfig)) - -all_check_config = GroundTruth( - duration=0, - video_fps=0, - audio_sample_rate=0, - check_aframes=True, - check_aframe_pts=True, -) - -test_videos = { - "RATRACE_wave_f_nm_np1_fr_goo_37.avi": GroundTruth( - duration=2.0, - video_fps=30.0, - audio_sample_rate=None, - check_aframes=True, - check_aframe_pts=True, - ), - "SchoolRulesHowTheyHelpUs_wave_f_nm_np1_ba_med_0.avi": GroundTruth( - duration=2.0, - video_fps=30.0, - audio_sample_rate=None, - check_aframes=True, - check_aframe_pts=True, - ), - "TrumanShow_wave_f_nm_np1_fr_med_26.avi": GroundTruth( - duration=2.0, - video_fps=30.0, - audio_sample_rate=None, - check_aframes=True, - check_aframe_pts=True, - ), - "v_SoccerJuggling_g23_c01.avi": GroundTruth( - duration=8.0, - video_fps=29.97, - audio_sample_rate=None, - check_aframes=True, - check_aframe_pts=True, - ), - "v_SoccerJuggling_g24_c01.avi": GroundTruth( - duration=8.0, - video_fps=29.97, - audio_sample_rate=None, - check_aframes=True, - check_aframe_pts=True, - ), - "R6llTwEh07w.mp4": GroundTruth( - duration=10.0, - video_fps=30.0, - audio_sample_rate=44100, - # PyAv miss one audio frame at the beginning (pts=0) - check_aframes=False, - check_aframe_pts=False, - ), - "SOX5yA1l24A.mp4": GroundTruth( - duration=11.0, - video_fps=29.97, - audio_sample_rate=48000, - # PyAv miss one audio frame at the beginning (pts=0) - check_aframes=False, - check_aframe_pts=False, - ), - "WUzgd7C1pWA.mp4": GroundTruth( - duration=11.0, - video_fps=29.97, - audio_sample_rate=48000, - # PyAv miss one audio frame at the beginning (pts=0) - check_aframes=False, - check_aframe_pts=False, - ), -} - - -DecoderResult = collections.namedtuple("DecoderResult", "vframes vframe_pts vtimebase aframes aframe_pts atimebase") - -# av_seek_frame is imprecise so seek to a timestamp earlier by a margin -# The unit of margin is second -SEEK_FRAME_MARGIN = 0.25 - - -def _read_from_stream(container, start_pts, end_pts, stream, stream_name, buffer_size=4): - """ - Args: - container: pyav container - start_pts/end_pts: the starting/ending Presentation TimeStamp where - frames are read - stream: pyav stream - stream_name: a dictionary of streams. For example, {"video": 0} means - video stream at stream index 0 - buffer_size: pts of frames decoded by PyAv is not guaranteed to be in - ascending order. We need to decode more frames even when we meet end - pts - """ - # seeking in the stream is imprecise. 
Thus, seek to an earlier PTS by a margin - margin = 1 - seek_offset = max(start_pts - margin, 0) - - container.seek(seek_offset, any_frame=False, backward=True, stream=stream) - frames = {} - buffer_count = 0 - for frame in container.decode(**stream_name): - if frame.pts < start_pts: - continue - if frame.pts <= end_pts: - frames[frame.pts] = frame - else: - buffer_count += 1 - if buffer_count >= buffer_size: - break - result = [frames[pts] for pts in sorted(frames)] - - return result - - -def _get_timebase_by_av_module(full_path): - container = av.open(full_path) - video_time_base = container.streams.video[0].time_base - if container.streams.audio: - audio_time_base = container.streams.audio[0].time_base - else: - audio_time_base = None - return video_time_base, audio_time_base - - -def _fraction_to_tensor(fraction): - ret = torch.zeros([2], dtype=torch.int32) - ret[0] = fraction.numerator - ret[1] = fraction.denominator - return ret - - -def _decode_frames_by_av_module( - full_path, - video_start_pts=0, - video_end_pts=None, - audio_start_pts=0, - audio_end_pts=None, -): - """ - Use PyAv to decode video frames. This provides a reference for our decoder - to compare the decoding results. - Input arguments: - full_path: video file path - video_start_pts/video_end_pts: the starting/ending Presentation TimeStamp where - frames are read - """ - if video_end_pts is None: - video_end_pts = float("inf") - if audio_end_pts is None: - audio_end_pts = float("inf") - container = av.open(full_path) - - video_frames = [] - vtimebase = torch.zeros([0], dtype=torch.int32) - if container.streams.video: - video_frames = _read_from_stream( - container, - video_start_pts, - video_end_pts, - container.streams.video[0], - {"video": 0}, - ) - # container.streams.video[0].average_rate is not a reliable estimator of - # frame rate. It can be wrong for certain codec, such as VP80 - # So we do not return video fps here - vtimebase = _fraction_to_tensor(container.streams.video[0].time_base) - - audio_frames = [] - atimebase = torch.zeros([0], dtype=torch.int32) - if container.streams.audio: - audio_frames = _read_from_stream( - container, - audio_start_pts, - audio_end_pts, - container.streams.audio[0], - {"audio": 0}, - ) - atimebase = _fraction_to_tensor(container.streams.audio[0].time_base) - - container.close() - vframes = [frame.to_rgb().to_ndarray() for frame in video_frames] - vframes = torch.as_tensor(np.stack(vframes)) - - vframe_pts = torch.tensor([frame.pts for frame in video_frames], dtype=torch.int64) - - aframes = [frame.to_ndarray() for frame in audio_frames] - if aframes: - aframes = np.transpose(np.concatenate(aframes, axis=1)) - aframes = torch.as_tensor(aframes) - else: - aframes = torch.empty((1, 0), dtype=torch.float32) - - aframe_pts = torch.tensor([audio_frame.pts for audio_frame in audio_frames], dtype=torch.int64) - - return DecoderResult( - vframes=vframes, - vframe_pts=vframe_pts, - vtimebase=vtimebase, - aframes=aframes, - aframe_pts=aframe_pts, - atimebase=atimebase, - ) - - -def _pts_convert(pts, timebase_from, timebase_to, round_func=math.floor): - """convert pts between different time bases - Args: - pts: presentation timestamp, float - timebase_from: original timebase. Fraction - timebase_to: new timebase. Fraction - round_func: rounding function. 
- """ - new_pts = Fraction(pts, 1) * timebase_from / timebase_to - return int(round_func(new_pts)) - - -def _get_video_tensor(video_dir, video_file): - """open a video file, and represent the video data by a PT tensor""" - full_path = os.path.join(video_dir, video_file) - - assert os.path.exists(full_path), "File not found: %s" % full_path - - with open(full_path, "rb") as fp: - video_tensor = torch.frombuffer(fp.read(), dtype=torch.uint8) - - return full_path, video_tensor - - -@pytest.mark.skipif(av is None, reason="PyAV unavailable") -@pytest.mark.skipif(_HAS_CPU_VIDEO_DECODER is False, reason="Didn't compile with ffmpeg") -class TestVideoReader: - def check_separate_decoding_result(self, tv_result, config): - """check the decoding results from TorchVision decoder""" - ( - vframes, - vframe_pts, - vtimebase, - vfps, - vduration, - aframes, - aframe_pts, - atimebase, - asample_rate, - aduration, - ) = tv_result - - video_duration = vduration.item() * Fraction(vtimebase[0].item(), vtimebase[1].item()) - assert video_duration == approx(config.duration, abs=0.5) - - assert vfps.item() == approx(config.video_fps, abs=0.5) - - if asample_rate.numel() > 0: - assert asample_rate.item() == config.audio_sample_rate - audio_duration = aduration.item() * Fraction(atimebase[0].item(), atimebase[1].item()) - assert audio_duration == approx(config.duration, abs=0.5) - - # check if pts of video frames are sorted in ascending order - for i in range(len(vframe_pts) - 1): - assert vframe_pts[i] < vframe_pts[i + 1] - - if len(aframe_pts) > 1: - # check if pts of audio frames are sorted in ascending order - for i in range(len(aframe_pts) - 1): - assert aframe_pts[i] < aframe_pts[i + 1] - - def check_probe_result(self, result, config): - vtimebase, vfps, vduration, atimebase, asample_rate, aduration = result - video_duration = vduration.item() * Fraction(vtimebase[0].item(), vtimebase[1].item()) - assert video_duration == approx(config.duration, abs=0.5) - assert vfps.item() == approx(config.video_fps, abs=0.5) - if asample_rate.numel() > 0: - assert asample_rate.item() == config.audio_sample_rate - audio_duration = aduration.item() * Fraction(atimebase[0].item(), atimebase[1].item()) - assert audio_duration == approx(config.duration, abs=0.5) - - def check_meta_result(self, result, config): - assert result.video_duration == approx(config.duration, abs=0.5) - assert result.video_fps == approx(config.video_fps, abs=0.5) - if result.has_audio > 0: - assert result.audio_sample_rate == config.audio_sample_rate - assert result.audio_duration == approx(config.duration, abs=0.5) - - def compare_decoding_result(self, tv_result, ref_result, config=all_check_config): - """ - Compare decoding results from two sources. 
- Args: - tv_result: decoding results from TorchVision decoder - ref_result: reference decoding results which can be from either PyAv - decoder or TorchVision decoder with getPtsOnly = 1 - config: config of decoding results checker - """ - ( - vframes, - vframe_pts, - vtimebase, - _vfps, - _vduration, - aframes, - aframe_pts, - atimebase, - _asample_rate, - _aduration, - ) = tv_result - if isinstance(ref_result, list): - # the ref_result is from new video_reader decoder - ref_result = DecoderResult( - vframes=ref_result[0], - vframe_pts=ref_result[1], - vtimebase=ref_result[2], - aframes=ref_result[5], - aframe_pts=ref_result[6], - atimebase=ref_result[7], - ) - - if vframes.numel() > 0 and ref_result.vframes.numel() > 0: - mean_delta = torch.mean(torch.abs(vframes.float() - ref_result.vframes.float())) - assert mean_delta == approx(0.0, abs=8.0) - - mean_delta = torch.mean(torch.abs(vframe_pts.float() - ref_result.vframe_pts.float())) - assert mean_delta == approx(0.0, abs=1.0) - - assert_equal(vtimebase, ref_result.vtimebase) - - if config.check_aframes and aframes.numel() > 0 and ref_result.aframes.numel() > 0: - """Audio stream is available and audio frame is required to return - from decoder""" - assert_equal(aframes, ref_result.aframes) - - if config.check_aframe_pts and aframe_pts.numel() > 0 and ref_result.aframe_pts.numel() > 0: - """Audio stream is available""" - assert_equal(aframe_pts, ref_result.aframe_pts) - - assert_equal(atimebase, ref_result.atimebase) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_stress_test_read_video_from_file(self, test_video): - pytest.skip( - "This stress test will iteratively decode the same set of videos." - "It helps to detect memory leak but it takes lots of time to run." - "By default, it is disabled" - ) - num_iter = 10000 - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - for _i in range(num_iter): - full_path = os.path.join(VIDEO_DIR, test_video) - - # pass 1: decode all frames using new decoder - torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_read_video_from_file(self, test_video, config): - """ - Test the case when decoder starts with a video file to decode frames. 
- """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - # pass 1: decode all frames using new decoder - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - # pass 2: decode all frames using av - pyav_result = _decode_frames_by_av_module(full_path) - # check results from TorchVision decoder - self.check_separate_decoding_result(tv_result, config) - # compare decoding results - self.compare_decoding_result(tv_result, pyav_result, config) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - @pytest.mark.parametrize("read_video_stream,read_audio_stream", [(1, 0), (0, 1)]) - def test_read_video_from_file_read_single_stream_only( - self, test_video, config, read_video_stream, read_audio_stream - ): - """ - Test the case when decoder starts with a video file to decode frames, and - only reads video stream and ignores audio stream - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - # decode all frames using new decoder - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - read_video_stream, - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - read_audio_stream, - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - ( - vframes, - vframe_pts, - vtimebase, - vfps, - vduration, - aframes, - aframe_pts, - atimebase, - asample_rate, - aduration, - ) = tv_result - - assert (vframes.numel() > 0) is bool(read_video_stream) - assert (vframe_pts.numel() > 0) is bool(read_video_stream) - assert (vtimebase.numel() > 0) is bool(read_video_stream) - assert (vfps.numel() > 0) is bool(read_video_stream) - - expect_audio_data = read_audio_stream == 1 and config.audio_sample_rate is not None - assert (aframes.numel() > 0) is bool(expect_audio_data) - assert (aframe_pts.numel() > 0) is bool(expect_audio_data) - assert (atimebase.numel() > 0) is bool(expect_audio_data) - assert (asample_rate.numel() > 0) is bool(expect_audio_data) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_min_dimension(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - video min dimension between height and width is set. 
- """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 128, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2)) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_max_dimension(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - video min dimension between height and width is set. - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 85 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2)) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_both_min_max_dimension(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - video min dimension between height and width is set. - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 64, 85 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2)) - assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2)) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_width(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - video width is set. 
- """ - # video related - width, height, min_dimension, max_dimension = 256, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert tv_result[0].size(2) == width - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_height(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - video height is set. - """ - # video related - width, height, min_dimension, max_dimension = 0, 224, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert tv_result[0].size(1) == height - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_width_and_height(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - both video height and width are set. 
- """ - # video related - width, height, min_dimension, max_dimension = 320, 240, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert tv_result[0].size(1) == height - assert tv_result[0].size(2) == width - - @pytest.mark.parametrize("test_video", test_videos.keys()) - @pytest.mark.parametrize("samples", [9600, 96000]) - def test_read_video_from_file_audio_resampling(self, test_video, samples): - """ - Test the case when decoder starts with a video file to decode frames, and - audio waveform are resampled - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - channels = 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - ( - vframes, - vframe_pts, - vtimebase, - vfps, - vduration, - aframes, - aframe_pts, - atimebase, - asample_rate, - aduration, - ) = tv_result - if aframes.numel() > 0: - assert samples == asample_rate.item() - assert 1 == aframes.size(1) - # when audio stream is found - duration = float(aframe_pts[-1]) * float(atimebase[0]) / float(atimebase[1]) - assert aframes.size(0) == approx(int(duration * asample_rate.item()), abs=0.1 * asample_rate.item()) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_compare_read_video_from_memory_and_file(self, test_video, config): - """ - Test the case when video is already in memory, and decoder reads data in memory - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - - # pass 1: decode all frames using cpp decoder - tv_result_memory = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - self.check_separate_decoding_result(tv_result_memory, config) - # pass 2: decode all frames from file - tv_result_file = 
torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - self.check_separate_decoding_result(tv_result_file, config) - # finally, compare results decoded from memory and file - self.compare_decoding_result(tv_result_memory, tv_result_file) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_read_video_from_memory(self, test_video, config): - """ - Test the case when video is already in memory, and decoder reads data in memory - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - - # pass 1: decode all frames using cpp decoder - tv_result = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - # pass 2: decode all frames using av - pyav_result = _decode_frames_by_av_module(full_path) - - self.check_separate_decoding_result(tv_result, config) - self.compare_decoding_result(tv_result, pyav_result, config) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_read_video_from_memory_get_pts_only(self, test_video, config): - """ - Test the case when video is already in memory, and decoder reads data in memory. 
- Compare frame pts between decoding for pts only and full decoding - for both pts and frame data - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - - # pass 1: decode all frames using cpp decoder - tv_result = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert abs(config.video_fps - tv_result[3].item()) < 0.01 - - # pass 2: decode all frames to get PTS only using cpp decoder - tv_result_pts_only = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 1, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - assert not tv_result_pts_only[0].numel() - assert not tv_result_pts_only[5].numel() - self.compare_decoding_result(tv_result, tv_result_pts_only) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - @pytest.mark.parametrize("num_frames", [4, 8, 16, 32, 64, 128]) - def test_read_video_in_range_from_memory(self, test_video, config, num_frames): - """ - Test the case when video is already in memory, and decoder reads data in memory. 
- In addition, decoder takes meaningful start- and end PTS as input, and decode - frames within that interval - """ - full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - # pass 1: decode all frames using new decoder - tv_result = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - ( - vframes, - vframe_pts, - vtimebase, - vfps, - vduration, - aframes, - aframe_pts, - atimebase, - asample_rate, - aduration, - ) = tv_result - assert abs(config.video_fps - vfps.item()) < 0.01 - - start_pts_ind_max = vframe_pts.size(0) - num_frames - if start_pts_ind_max <= 0: - return - # randomly pick start pts - start_pts_ind = randint(0, start_pts_ind_max) - end_pts_ind = start_pts_ind + num_frames - 1 - video_start_pts = vframe_pts[start_pts_ind] - video_end_pts = vframe_pts[end_pts_ind] - - video_timebase_num, video_timebase_den = vtimebase[0], vtimebase[1] - if len(atimebase) > 0: - # when audio stream is available - audio_timebase_num, audio_timebase_den = atimebase[0], atimebase[1] - audio_start_pts = _pts_convert( - video_start_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(audio_timebase_num.item(), audio_timebase_den.item()), - math.floor, - ) - audio_end_pts = _pts_convert( - video_end_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(audio_timebase_num.item(), audio_timebase_den.item()), - math.ceil, - ) - - # pass 2: decode frames in the randomly generated range - tv_result = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - # pass 3: decode frames in range using PyAv - video_timebase_av, audio_timebase_av = _get_timebase_by_av_module(full_path) - - video_start_pts_av = _pts_convert( - video_start_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(video_timebase_av.numerator, video_timebase_av.denominator), - math.floor, - ) - video_end_pts_av = _pts_convert( - video_end_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(video_timebase_av.numerator, video_timebase_av.denominator), - math.ceil, - ) - if audio_timebase_av: - audio_start_pts = _pts_convert( - video_start_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator), - math.floor, - ) - audio_end_pts = _pts_convert( - video_end_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator), - math.ceil, - ) - - pyav_result = 
_decode_frames_by_av_module( - full_path, - video_start_pts_av, - video_end_pts_av, - audio_start_pts, - audio_end_pts, - ) - - assert tv_result[0].size(0) == num_frames - if pyav_result.vframes.size(0) == num_frames: - # if PyAv decodes a different number of video frames, skip - # comparing the decoding results between Torchvision video reader - # and PyAv - self.compare_decoding_result(tv_result, pyav_result, config) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_probe_video_from_file(self, test_video, config): - """ - Test the case when decoder probes a video file - """ - full_path = os.path.join(VIDEO_DIR, test_video) - probe_result = torch.ops.video_reader.probe_video_from_file(full_path) - self.check_probe_result(probe_result, config) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_probe_video_from_memory(self, test_video, config): - """ - Test the case when decoder probes a video in memory - """ - _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - probe_result = torch.ops.video_reader.probe_video_from_memory(video_tensor) - self.check_probe_result(probe_result, config) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_probe_video_from_memory_script(self, test_video, config): - scripted_fun = torch.jit.script(io._probe_video_from_memory) - assert scripted_fun is not None - - _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - probe_result = scripted_fun(video_tensor) - self.check_meta_result(probe_result, config) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_memory_scripted(self, test_video): - """ - Test the case when video is already in memory, and decoder reads data in memory - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - scripted_fun = torch.jit.script(io._read_video_from_memory) - assert scripted_fun is not None - - _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - - # decode all frames using cpp decoder - scripted_fun( - video_tensor, - SEEK_FRAME_MARGIN, - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - [video_start_pts, video_end_pts], - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - [audio_start_pts, audio_end_pts], - audio_timebase_num, - audio_timebase_den, - ) - # FUTURE: check value of video / audio frames - - def test_invalid_file(self): - set_video_backend("video_reader") - with pytest.raises(RuntimeError): - io.read_video("foo.mp4") - - set_video_backend("pyav") - with pytest.raises(RuntimeError): - io.read_video("foo.mp4") - - @pytest.mark.parametrize("test_video", test_videos.keys()) - @pytest.mark.parametrize("backend", ["video_reader", "pyav"]) - @pytest.mark.parametrize("start_offset", [0, 500]) - @pytest.mark.parametrize("end_offset", [3000, None]) - def test_audio_present_pts(self, test_video, backend, start_offset, end_offset): - """Test if audio frames are returned with pts unit.""" - full_path = os.path.join(VIDEO_DIR, test_video) - container = av.open(full_path) - if container.streams.audio: - set_video_backend(backend) - _, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="pts") - assert all([dimension > 0 for dimension in 
audio.shape[:2]]) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - @pytest.mark.parametrize("backend", ["video_reader", "pyav"]) - @pytest.mark.parametrize("start_offset", [0, 0.1]) - @pytest.mark.parametrize("end_offset", [0.3, None]) - def test_audio_present_sec(self, test_video, backend, start_offset, end_offset): - """Test if audio frames are returned with sec unit.""" - full_path = os.path.join(VIDEO_DIR, test_video) - container = av.open(full_path) - if container.streams.audio: - set_video_backend(backend) - _, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="sec") - assert all([dimension > 0 for dimension in audio.shape[:2]]) - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/test/test_videoapi.py b/test/test_videoapi.py deleted file mode 100644 index aabcf6407f7..00000000000 --- a/test/test_videoapi.py +++ /dev/null @@ -1,312 +0,0 @@ -import collections -import os -import urllib - -import pytest -import torch -import torchvision -from pytest import approx -from torchvision.datasets.utils import download_url -from torchvision.io import _HAS_CPU_VIDEO_DECODER, VideoReader - - -# WARNING: these tests have been skipped forever on the CI because the video ops -# are never properly available. This is bad, but things have been in a terrible -# state for a long time already as we write this comment, and we'll hopefully be -# able to get rid of this all soon. - - -try: - import av - - # Do a version test too - torchvision.io.video._check_av_available() -except ImportError: - av = None - - -VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos") - -CheckerConfig = ["duration", "video_fps", "audio_sample_rate"] -GroundTruth = collections.namedtuple("GroundTruth", " ".join(CheckerConfig)) - - -def backends(): - backends_ = ["video_reader"] - if av is not None: - backends_.append("pyav") - return backends_ - - -def fate(name, path="."): - """Download and return a path to a sample from the FFmpeg test suite. 
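-    For example, ``fate("sub/MovText_capability_tester.mp4", VIDEO_DIR)`` (as used
-    in test_fate_suite below) downloads that sample into VIDEO_DIR and returns its
-    local path.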
-    See the `FFmpeg Automated Test Environment <https://www.ffmpeg.org/fate.html>`_
-    """
-
-    file_name = name.split("/")[1]
-    download_url("http://fate.ffmpeg.org/fate-suite/" + name, path, file_name)
-    return os.path.join(path, file_name)
-
-
-test_videos = {
-    "RATRACE_wave_f_nm_np1_fr_goo_37.avi": GroundTruth(duration=2.0, video_fps=30.0, audio_sample_rate=None),
-    "SchoolRulesHowTheyHelpUs_wave_f_nm_np1_ba_med_0.avi": GroundTruth(
-        duration=2.0, video_fps=30.0, audio_sample_rate=None
-    ),
-    "TrumanShow_wave_f_nm_np1_fr_med_26.avi": GroundTruth(duration=2.0, video_fps=30.0, audio_sample_rate=None),
-    "v_SoccerJuggling_g23_c01.avi": GroundTruth(duration=8.0, video_fps=29.97, audio_sample_rate=None),
-    "v_SoccerJuggling_g24_c01.avi": GroundTruth(duration=8.0, video_fps=29.97, audio_sample_rate=None),
-    "R6llTwEh07w.mp4": GroundTruth(duration=10.0, video_fps=30.0, audio_sample_rate=44100),
-    "SOX5yA1l24A.mp4": GroundTruth(duration=11.0, video_fps=29.97, audio_sample_rate=48000),
-    "WUzgd7C1pWA.mp4": GroundTruth(duration=11.0, video_fps=29.97, audio_sample_rate=48000),
-}
-
-
-@pytest.mark.skipif(_HAS_CPU_VIDEO_DECODER is False, reason="Didn't compile with ffmpeg")
-class TestVideoApi:
-    @pytest.mark.skipif(av is None, reason="PyAV unavailable")
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    @pytest.mark.parametrize("backend", backends())
-    def test_frame_reading(self, test_video, backend):
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-        with av.open(full_path) as av_reader:
-            if av_reader.streams.video:
-                av_frames, vr_frames = [], []
-                av_pts, vr_pts = [], []
-                # get av frames
-                for av_frame in av_reader.decode(av_reader.streams.video[0]):
-                    av_frames.append(torch.tensor(av_frame.to_rgb().to_ndarray()).permute(2, 0, 1))
-                    av_pts.append(av_frame.pts * av_frame.time_base)
-
-                # get vr frames
-                video_reader = VideoReader(full_path, "video")
-                for vr_frame in video_reader:
-                    vr_frames.append(vr_frame["data"])
-                    vr_pts.append(vr_frame["pts"])
-
-                # same number of frames
-                assert len(vr_frames) == len(av_frames)
-                assert len(vr_pts) == len(av_pts)
-
-                # compare the frames and pts
-                for i in range(len(vr_frames)):
-                    assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1)
-
-                    mean_delta = torch.mean(torch.abs(av_frames[i].float() - vr_frames[i].float()))
-                    # on average the difference is very small and caused
-                    # by decoding (around 1%)
-                    # TODO: assess empirically how to set this? atm it's 1%
-                    # averaged over all frames
-                    assert mean_delta.item() < 2.55
-
-                del vr_frames, av_frames, vr_pts, av_pts
-
-        # test audio reading compared to PyAV
-        with av.open(full_path) as av_reader:
-            if av_reader.streams.audio:
-                av_frames, vr_frames = [], []
-                av_pts, vr_pts = [], []
-                # get av frames
-                for av_frame in av_reader.decode(av_reader.streams.audio[0]):
-                    av_frames.append(torch.tensor(av_frame.to_ndarray()).permute(1, 0))
-                    av_pts.append(av_frame.pts * av_frame.time_base)
-                av_reader.close()
-
-                # get vr frames
-                video_reader = VideoReader(full_path, "audio")
-                for vr_frame in video_reader:
-                    vr_frames.append(vr_frame["data"])
-                    vr_pts.append(vr_frame["pts"])
-
-                # same number of frames
-                assert len(vr_frames) == len(av_frames)
-                assert len(vr_pts) == len(av_pts)
-
-                # compare the frames and pts
-                for i in range(len(vr_frames)):
-                    assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1)
-                    max_delta = torch.max(torch.abs(av_frames[i].float() - vr_frames[i].float()))
-                    # ensure the decoded signals never differ by more than 1e-3
-                    assert max_delta.item() < 0.001
-
-    @pytest.mark.parametrize("stream", ["video", "audio"])
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    @pytest.mark.parametrize("backend", backends())
-    def test_frame_reading_mem_vs_file(self, test_video, stream, backend):
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-
-        reader = VideoReader(full_path)
-        reader_md = reader.get_metadata()
-
-        if stream in reader_md:
-            # Test video reading from file vs from memory
-            vr_frames, vr_frames_mem = [], []
-            vr_pts, vr_pts_mem = [], []
-            # get vr frames
-            video_reader = VideoReader(full_path, stream)
-            for vr_frame in video_reader:
-                vr_frames.append(vr_frame["data"])
-                vr_pts.append(vr_frame["pts"])
-
-            # get vr frames again, this time reading from memory
-            f = open(full_path, "rb")
-            fbytes = f.read()
-            f.close()
-            video_reader_from_mem = VideoReader(fbytes, stream)
-
-            for vr_frame_from_mem in video_reader_from_mem:
-                vr_frames_mem.append(vr_frame_from_mem["data"])
-                vr_pts_mem.append(vr_frame_from_mem["pts"])
-
-            # same number of frames
-            assert len(vr_frames) == len(vr_frames_mem)
-            assert len(vr_pts) == len(vr_pts_mem)
-
-            # compare the frames and pts
-            for i in range(len(vr_frames)):
-                assert vr_pts[i] == vr_pts_mem[i]
-                mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float()))
-                # on average the difference is very small and caused
-                # by decoding (around 1%)
-                # TODO: assess empirically how to set this? atm it's 1%
-                # averaged over all frames
-                assert mean_delta.item() < 2.55
-
-            del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem
-        else:
-            del reader, reader_md
-
-    @pytest.mark.parametrize("test_video,config", test_videos.items())
-    @pytest.mark.parametrize("backend", backends())
-    def test_metadata(self, test_video, config, backend):
-        """
-        Test that the metadata returned via pyav corresponds to the one returned
-        by the new video decoder API
-        """
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-        reader = VideoReader(full_path, "video")
-        reader_md = reader.get_metadata()
-        assert config.video_fps == approx(reader_md["video"]["fps"][0], abs=0.0001)
-        assert config.duration == approx(reader_md["video"]["duration"][0], abs=0.5)
-
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    @pytest.mark.parametrize("backend", backends())
-    def test_seek_start(self, test_video, backend):
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-        video_reader = VideoReader(full_path, "video")
-        num_frames = 0
-        for _ in video_reader:
-            num_frames += 1
-
-        # now seek the container to 0 and do it again
-        # Seeking back to the start is often imprecise,
-        # and it may not land exactly at 0
-        video_reader.seek(0)
-        start_num_frames = 0
-        for _ in video_reader:
-            start_num_frames += 1
-
-        assert start_num_frames == num_frames
-
-        # now seek the container to < 0 to check for unexpected behaviour
-        video_reader.seek(-1)
-        start_num_frames = 0
-        for _ in video_reader:
-            start_num_frames += 1
-
-        assert start_num_frames == num_frames
-
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    @pytest.mark.parametrize("backend", ["video_reader"])
-    def test_accurateseek_middle(self, test_video, backend):
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-        stream = "video"
-        video_reader = VideoReader(full_path, stream)
-        md = video_reader.get_metadata()
-        duration = md[stream]["duration"][0]
-        if duration is not None:
-            num_frames = 0
-            for _ in video_reader:
-                num_frames += 1
-
-            video_reader.seek(duration / 2)
-            middle_num_frames = 0
-            for _ in video_reader:
-                middle_num_frames += 1
-
-            assert middle_num_frames < num_frames
-            assert middle_num_frames == approx(num_frames // 2, abs=1)
-
-            video_reader.seek(duration / 2)
-            frame = next(video_reader)
-            lb = duration / 2 - 1 / md[stream]["fps"][0]
-            ub = duration / 2 + 1 / md[stream]["fps"][0]
-            assert (lb <= frame["pts"]) and (ub >= frame["pts"])
-
-    def test_fate_suite(self):
-        # TODO: remove the try-except statement once the connectivity issues are resolved
-        try:
-            video_path = fate("sub/MovText_capability_tester.mp4", VIDEO_DIR)
-        except (urllib.error.URLError, ConnectionError) as error:
-            pytest.skip(f"Skipping due to connectivity issues: {error}")
-        vr = VideoReader(video_path)
-        metadata = vr.get_metadata()
-
-        assert metadata["subtitles"]["duration"] is not None
-        os.remove(video_path)
-
-    @pytest.mark.skipif(av is None, reason="PyAV unavailable")
-    @pytest.mark.parametrize("test_video,config", test_videos.items())
-    @pytest.mark.parametrize("backend", backends())
-    def test_keyframe_reading(self, test_video, config, backend):
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-
-        av_reader = av.open(full_path)
-        # reduce streams to only keyframes
-        av_stream = av_reader.streams.video[0]
-        av_stream.codec_context.skip_frame = "NONKEY"
-
-        av_keyframes = []
-        vr_keyframes = []
-        if
av_reader.streams.video: - - # get all keyframes using pyav. Then, seek randomly into video reader - # and assert that all the returned values are in AV_KEYFRAMES - - for av_frame in av_reader.decode(av_stream): - av_keyframes.append(float(av_frame.pts * av_frame.time_base)) - - if len(av_keyframes) > 1: - video_reader = VideoReader(full_path, "video") - for i in range(1, len(av_keyframes)): - seek_val = (av_keyframes[i] + av_keyframes[i - 1]) / 2 - data = next(video_reader.seek(seek_val, True)) - vr_keyframes.append(data["pts"]) - - data = next(video_reader.seek(config.duration, True)) - vr_keyframes.append(data["pts"]) - - assert len(av_keyframes) == len(vr_keyframes) - # NOTE: this video gets different keyframe with different - # loaders (0.333 pyav, 0.666 for us) - if test_video != "TrumanShow_wave_f_nm_np1_fr_med_26.avi": - for i in range(len(av_keyframes)): - assert av_keyframes[i] == approx(vr_keyframes[i], rel=0.001) - - def test_src(self): - with pytest.raises(ValueError, match="src cannot be empty"): - VideoReader(src="") - with pytest.raises(ValueError, match="src must be either string"): - VideoReader(src=2) - with pytest.raises(TypeError, match="unexpected keyword argument"): - VideoReader(path="path") - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/torchvision/__init__.py b/torchvision/__init__.py index 5d06156c25f..26f51f856d4 100644 --- a/torchvision/__init__.py +++ b/torchvision/__init__.py @@ -28,8 +28,6 @@ _image_backend = "PIL" -_video_backend = "pyav" - def set_image_backend(backend): """ @@ -53,48 +51,6 @@ def get_image_backend(): return _image_backend -def set_video_backend(backend): - """ - Specifies the package used to decode videos. - - Args: - backend (string): Name of the video backend. one of {'pyav', 'video_reader'}. - The :mod:`pyav` package uses the 3rd party PyAv library. It is a Pythonic - binding for the FFmpeg libraries. - The :mod:`video_reader` package includes a native C++ implementation on - top of FFMPEG libraries, and a python API of TorchScript custom operator. - It generally decodes faster than :mod:`pyav`, but is perhaps less robust. - - .. note:: - Building with FFMPEG is disabled by default in the latest `main`. If you want to use the 'video_reader' - backend, please compile torchvision from source. - """ - global _video_backend - if backend not in ["pyav", "video_reader", "cuda"]: - raise ValueError("Invalid video backend '%s'. Options are 'pyav', 'video_reader' and 'cuda'" % backend) - if backend == "video_reader" and not io._HAS_CPU_VIDEO_DECODER: - # TODO: better messages - message = "video_reader video backend is not available. Please compile torchvision from source and try again" - raise RuntimeError(message) - elif backend == "cuda" and not io._HAS_GPU_VIDEO_DECODER: - # TODO: better messages - message = "cuda video backend is not available." - raise RuntimeError(message) - else: - _video_backend = backend - - -def get_video_backend(): - """ - Returns the currently active video backend used to decode videos. - - Returns: - str: Name of the video backend. one of {'pyav', 'video_reader'}. 
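-
-    Example (illustrative)::
-
-        torchvision.set_video_backend("pyav")
-        assert torchvision.get_video_backend() == "pyav"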
- """ - - return _video_backend - - def _is_tracing(): return torch._C._get_tracing_state() diff --git a/torchvision/csrc/io/decoder/audio_sampler.cpp b/torchvision/csrc/io/decoder/audio_sampler.cpp deleted file mode 100644 index d46b93ddc69..00000000000 --- a/torchvision/csrc/io/decoder/audio_sampler.cpp +++ /dev/null @@ -1,251 +0,0 @@ -#include "audio_sampler.h" -#include -#include "util.h" - -#define AVRESAMPLE_MAX_CHANNELS 32 - -// www.ffmpeg.org/doxygen/1.1/doc_2examples_2resampling_audio_8c-example.html#a24 -namespace ffmpeg { - -namespace { -int preparePlanes( - const AudioFormat& fmt, - const uint8_t* buffer, - int numSamples, - uint8_t** planes) { - int result; - if ((result = av_samples_fill_arrays( - planes, - nullptr, // linesize is not needed - buffer, - fmt.channels, - numSamples, - (AVSampleFormat)fmt.format, - 1)) < 0) { - LOG(ERROR) << "av_samples_fill_arrays failed, err: " - << Util::generateErrorDesc(result) - << ", numSamples: " << numSamples << ", fmt: " << fmt.format; - } - return result; -} -} // namespace - -AudioSampler::AudioSampler(void* logCtx) : logCtx_(logCtx) {} - -AudioSampler::~AudioSampler() { - cleanUp(); -} - -void AudioSampler::shutdown() { - cleanUp(); -} - -bool AudioSampler::init(const SamplerParameters& params) { - cleanUp(); - - if (params.type != MediaType::TYPE_AUDIO) { - LOG(ERROR) << "Invalid media type, expected MediaType::TYPE_AUDIO"; - return false; - } - -#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) - SwrContext* swrContext_ = NULL; - AVChannelLayout channel_out; - AVChannelLayout channel_in; - av_channel_layout_default(&channel_out, params.out.audio.channels); - av_channel_layout_default(&channel_in, params.in.audio.channels); - int ret = swr_alloc_set_opts2( - &swrContext_, - &channel_out, - (AVSampleFormat)params.out.audio.format, - params.out.audio.samples, - &channel_in, - (AVSampleFormat)params.in.audio.format, - params.in.audio.samples, - 0, - logCtx_); -#else - swrContext_ = swr_alloc_set_opts( - nullptr, - av_get_default_channel_layout(params.out.audio.channels), - (AVSampleFormat)params.out.audio.format, - params.out.audio.samples, - av_get_default_channel_layout(params.in.audio.channels), - (AVSampleFormat)params.in.audio.format, - params.in.audio.samples, - 0, - logCtx_); -#endif - if (swrContext_ == nullptr) { - LOG(ERROR) << "Cannot allocate SwrContext"; - return false; - } - - int result; - if ((result = swr_init(swrContext_)) < 0) { - LOG(ERROR) << "swr_init failed, err: " << Util::generateErrorDesc(result) - << ", in -> format: " << params.in.audio.format - << ", channels: " << params.in.audio.channels - << ", samples: " << params.in.audio.samples - << ", out -> format: " << params.out.audio.format - << ", channels: " << params.out.audio.channels - << ", samples: " << params.out.audio.samples; - return false; - } - - // set formats - params_ = params; - return true; -} - -int AudioSampler::numOutputSamples(int inSamples) const { - return swr_get_out_samples(swrContext_, inSamples); -} - -int AudioSampler::sample( - const uint8_t* inPlanes[], - int inNumSamples, - ByteStorage* out, - int outNumSamples) { - int result; - int outBufferBytes = av_samples_get_buffer_size( - nullptr, - params_.out.audio.channels, - outNumSamples, - (AVSampleFormat)params_.out.audio.format, - 1); - - if (out) { - out->ensure(outBufferBytes); - - uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr}; - - if ((result = preparePlanes( - params_.out.audio, - out->writableTail(), - outNumSamples, - outPlanes)) < 0) { - return result; - } 
- - if ((result = swr_convert( - swrContext_, - &outPlanes[0], - outNumSamples, - inPlanes, - inNumSamples)) < 0) { - LOG(ERROR) << "swr_convert failed, err: " - << Util::generateErrorDesc(result); - return result; - } - - TORCH_CHECK_LE(result, outNumSamples); - - if (result) { - if ((result = av_samples_get_buffer_size( - nullptr, - params_.out.audio.channels, - result, - (AVSampleFormat)params_.out.audio.format, - 1)) >= 0) { - out->append(result); - } else { - LOG(ERROR) << "av_samples_get_buffer_size failed, err: " - << Util::generateErrorDesc(result); - } - } - } else { - // allocate a temporary buffer - auto* tmpBuffer = static_cast(av_malloc(outBufferBytes)); - if (!tmpBuffer) { - LOG(ERROR) << "av_alloc failed, for size: " << outBufferBytes; - return -1; - } - - uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr}; - - if ((result = preparePlanes( - params_.out.audio, tmpBuffer, outNumSamples, outPlanes)) < 0) { - av_free(tmpBuffer); - return result; - } - - if ((result = swr_convert( - swrContext_, - &outPlanes[0], - outNumSamples, - inPlanes, - inNumSamples)) < 0) { - LOG(ERROR) << "swr_convert failed, err: " - << Util::generateErrorDesc(result); - av_free(tmpBuffer); - return result; - } - - av_free(tmpBuffer); - - TORCH_CHECK_LE(result, outNumSamples); - - if (result) { - result = av_samples_get_buffer_size( - nullptr, - params_.out.audio.channels, - result, - (AVSampleFormat)params_.out.audio.format, - 1); - } - } - - return result; -} - -int AudioSampler::sample(AVFrame* frame, ByteStorage* out) { - const auto outNumSamples = numOutputSamples(frame ? frame->nb_samples : 0); - - if (!outNumSamples) { - return 0; - } - - return sample( - frame ? (const uint8_t**)&frame->data[0] : nullptr, - frame ? frame->nb_samples : 0, - out, - outNumSamples); -} - -int AudioSampler::sample(const ByteStorage* in, ByteStorage* out) { - const auto inSampleSize = - av_get_bytes_per_sample((AVSampleFormat)params_.in.audio.format); - - const auto inNumSamples = - !in ? 0 : in->length() / inSampleSize / params_.in.audio.channels; - - const auto outNumSamples = numOutputSamples(inNumSamples); - - if (!outNumSamples) { - return 0; - } - - uint8_t* inPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr}; - int result; - if (in && - (result = preparePlanes( - params_.in.audio, in->data(), inNumSamples, inPlanes)) < 0) { - return result; - } - - return sample( - in ? (const uint8_t**)inPlanes : nullptr, - inNumSamples, - out, - outNumSamples); -} - -void AudioSampler::cleanUp() { - if (swrContext_) { - swr_free(&swrContext_); - swrContext_ = nullptr; - } -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/audio_sampler.h b/torchvision/csrc/io/decoder/audio_sampler.h deleted file mode 100644 index e105bbe4de2..00000000000 --- a/torchvision/csrc/io/decoder/audio_sampler.h +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * Class transcode audio frames from one format into another - */ - -class AudioSampler : public MediaSampler { - public: - explicit AudioSampler(void* logCtx); - ~AudioSampler() override; - - // MediaSampler overrides - bool init(const SamplerParameters& params) override; - int sample(const ByteStorage* in, ByteStorage* out) override; - void shutdown() override; - - int sample(AVFrame* frame, ByteStorage* out); - - private: - // close resources - void cleanUp(); - // helper functions for rescaling, cropping, etc. 
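-  // e.g. (hypothetical figures): converting 1024 input samples from 48 kHz to
-  // 16 kHz gives numOutputSamples(1024) of roughly 1024 * 16000 / 48000 ≈ 341,
-  // plus whatever the resampler has buffered internally.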
- int numOutputSamples(int inSamples) const; - int sample( - const uint8_t* inPlanes[], - int inNumSamples, - ByteStorage* out, - int outNumSamples); - - private: - SwrContext* swrContext_{nullptr}; - void* logCtx_{nullptr}; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/audio_stream.cpp b/torchvision/csrc/io/decoder/audio_stream.cpp deleted file mode 100644 index c3a003434b8..00000000000 --- a/torchvision/csrc/io/decoder/audio_stream.cpp +++ /dev/null @@ -1,119 +0,0 @@ -#include "audio_stream.h" -#include -#include "util.h" - -namespace ffmpeg { - -namespace { -static int get_nb_channels(const AVFrame* frame, const AVCodecContext* codec) { -#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) - return frame ? frame->ch_layout.nb_channels : codec->ch_layout.nb_channels; -#else - return frame ? frame->channels : codec->channels; -#endif -} - -bool operator==(const AudioFormat& x, const AVFrame& y) { - return x.samples == static_cast(y.sample_rate) && - x.channels == static_cast(get_nb_channels(&y, nullptr)) && - x.format == y.format; -} - -bool operator==(const AudioFormat& x, const AVCodecContext& y) { - return x.samples == static_cast(y.sample_rate) && - x.channels == static_cast(get_nb_channels(nullptr, &y)) && - x.format == y.sample_fmt; -} - -AudioFormat& toAudioFormat(AudioFormat& x, const AVFrame& y) { - x.samples = y.sample_rate; - x.channels = get_nb_channels(&y, nullptr); - x.format = y.format; - return x; -} - -AudioFormat& toAudioFormat(AudioFormat& x, const AVCodecContext& y) { - x.samples = y.sample_rate; - x.channels = get_nb_channels(nullptr, &y); - x.format = y.sample_fmt; - return x; -} -} // namespace - -AudioStream::AudioStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const AudioFormat& format) - : Stream( - inputCtx, - MediaFormat::makeMediaFormat(format, index), - convertPtsToWallTime, - 0) {} - -AudioStream::~AudioStream() { - if (sampler_) { - sampler_->shutdown(); - sampler_.reset(); - } -} - -int AudioStream::initFormat() { - // set output format - if (format_.format.audio.samples == 0) { - format_.format.audio.samples = codecCtx_->sample_rate; - } -#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) - if (format_.format.audio.channels == 0) { - format_.format.audio.channels = codecCtx_->ch_layout.nb_channels; - } -#else - if (format_.format.audio.channels == 0) { - format_.format.audio.channels = codecCtx_->channels; - } -#endif - if (format_.format.audio.format == AV_SAMPLE_FMT_NONE) { - format_.format.audio.format = codecCtx_->sample_fmt; - } - - return format_.format.audio.samples != 0 && - format_.format.audio.channels != 0 && - format_.format.audio.format != AV_SAMPLE_FMT_NONE - ? 0 - : -1; -} - -// copies audio sample bytes via swr_convert call in audio_sampler.cpp -int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) { - if (!sampler_) { - sampler_ = std::make_unique(codecCtx_); - } - // check if input format gets changed - if (flush ? !(sampler_->getInputFormat().audio == *codecCtx_) - : !(sampler_->getInputFormat().audio == *frame_)) { - // - reinit sampler - SamplerParameters params; - params.type = format_.type; - params.out = format_.format; - params.in = FormatUnion(); - flush ? 
toAudioFormat(params.in.audio, *codecCtx_) - : toAudioFormat(params.in.audio, *frame_); - if (!sampler_->init(params)) { - return -1; - } - - VLOG(1) << "Set input audio sampler format" - << ", samples: " << params.in.audio.samples - << ", channels: " << params.in.audio.channels - << ", format: " << params.in.audio.format - << " : output audio sampler format" - << ", samples: " << format_.format.audio.samples - << ", channels: " << format_.format.audio.channels - << ", format: " << format_.format.audio.format; - } - // calls to a sampler that converts the audio samples and copies them to the - // out buffer via ffmpeg::swr_convert - return sampler_->sample(flush ? nullptr : frame_, out); -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/audio_stream.h b/torchvision/csrc/io/decoder/audio_stream.h deleted file mode 100644 index 2d6457b68f5..00000000000 --- a/torchvision/csrc/io/decoder/audio_stream.h +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once - -#include "audio_sampler.h" -#include "stream.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode one audio stream. - */ - -class AudioStream : public Stream { - public: - AudioStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const AudioFormat& format); - ~AudioStream() override; - - private: - int initFormat() override; - int copyFrameBytes(ByteStorage* out, bool flush) override; - - private: - std::unique_ptr sampler_; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/cc_stream.cpp b/torchvision/csrc/io/decoder/cc_stream.cpp deleted file mode 100644 index 89174c396fd..00000000000 --- a/torchvision/csrc/io/decoder/cc_stream.cpp +++ /dev/null @@ -1,24 +0,0 @@ -#include "cc_stream.h" - -namespace ffmpeg { - -CCStream::CCStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const SubtitleFormat& format) - : SubtitleStream(inputCtx, index, convertPtsToWallTime, format) { - format_.type = TYPE_CC; -} - -AVCodec* CCStream::findCodec(AVCodecParameters* params) { - if (params->codec_id == AV_CODEC_ID_BIN_DATA && - params->codec_type == AVMEDIA_TYPE_DATA) { - // obtain subtitles codec - params->codec_id = AV_CODEC_ID_MOV_TEXT; - params->codec_type = AVMEDIA_TYPE_SUBTITLE; - } - return Stream::findCodec(params); -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/cc_stream.h b/torchvision/csrc/io/decoder/cc_stream.h deleted file mode 100644 index 3a1d169f014..00000000000 --- a/torchvision/csrc/io/decoder/cc_stream.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once - -#include "subtitle_stream.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode one closed captions stream. 
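- * Closed captions are typically muxed as AV_CODEC_ID_BIN_DATA data packets;
- * findCodec() below remaps them to the MOV_TEXT subtitle decoder
- * (see cc_stream.cpp above).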
- */ -class CCStream : public SubtitleStream { - public: - CCStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const SubtitleFormat& format); - - private: - AVCodec* findCodec(AVCodecParameters* params) override; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp deleted file mode 100644 index cfe762bbc6e..00000000000 --- a/torchvision/csrc/io/decoder/decoder.cpp +++ /dev/null @@ -1,763 +0,0 @@ -#include "decoder.h" -#include -#include -#include -#include -#include -#include "audio_stream.h" -#include "cc_stream.h" -#include "subtitle_stream.h" -#include "util.h" -#include "video_stream.h" - -namespace ffmpeg { - -namespace { - -constexpr size_t kIoBufferSize = 96 * 1024; -constexpr size_t kIoPaddingSize = AV_INPUT_BUFFER_PADDING_SIZE; -constexpr size_t kLogBufferSize = 1024; - -bool mapFfmpegType(AVMediaType media, MediaType* type) { - switch (media) { - case AVMEDIA_TYPE_AUDIO: - *type = TYPE_AUDIO; - return true; - case AVMEDIA_TYPE_VIDEO: - *type = TYPE_VIDEO; - return true; - case AVMEDIA_TYPE_SUBTITLE: - *type = TYPE_SUBTITLE; - return true; - case AVMEDIA_TYPE_DATA: - *type = TYPE_CC; - return true; - default: - return false; - } -} - -std::unique_ptr createStream( - MediaType type, - AVFormatContext* ctx, - int idx, - bool convertPtsToWallTime, - const FormatUnion& format, - int64_t loggingUuid) { - switch (type) { - case TYPE_AUDIO: - return std::make_unique( - ctx, idx, convertPtsToWallTime, format.audio); - case TYPE_VIDEO: - return std::make_unique( - // negative loggingUuid indicates video streams. - ctx, - idx, - convertPtsToWallTime, - format.video, - -loggingUuid); - case TYPE_SUBTITLE: - return std::make_unique( - ctx, idx, convertPtsToWallTime, format.subtitle); - case TYPE_CC: - return std::make_unique( - ctx, idx, convertPtsToWallTime, format.subtitle); - default: - return nullptr; - } -} - -} // Namespace - -/* static */ -void Decoder::logFunction(void* avcl, int level, const char* cfmt, va_list vl) { - if (!avcl) { - // Nothing can be done here - return; - } - - AVClass* avclass = *reinterpret_cast(avcl); - if (!avclass) { - // Nothing can be done here - return; - } - Decoder* decoder = nullptr; - if (strcmp(avclass->class_name, "AVFormatContext") == 0) { - AVFormatContext* context = reinterpret_cast(avcl); - if (context) { - decoder = reinterpret_cast(context->opaque); - } - } else if (strcmp(avclass->class_name, "AVCodecContext") == 0) { - AVCodecContext* context = reinterpret_cast(avcl); - if (context) { - decoder = reinterpret_cast(context->opaque); - } - } else if (strcmp(avclass->class_name, "AVIOContext") == 0) { - AVIOContext* context = reinterpret_cast(avcl); - // only if opaque was assigned to Decoder pointer - if (context && context->read_packet == Decoder::readFunction) { - decoder = reinterpret_cast(context->opaque); - } - } else if (strcmp(avclass->class_name, "SWResampler") == 0) { - // expect AVCodecContext as parent - if (avclass->parent_log_context_offset) { - AVClass** parent = - *(AVClass***)(((uint8_t*)avcl) + avclass->parent_log_context_offset); - AVCodecContext* context = reinterpret_cast(parent); - if (context) { - decoder = reinterpret_cast(context->opaque); - } - } - } else if (strcmp(avclass->class_name, "SWScaler") == 0) { - // cannot find a way to pass context pointer through SwsContext struct - } else { - VLOG(2) << "Unknown context class: " << avclass->class_name; - } - - if (decoder != nullptr && decoder->enableLogLevel(level)) { - 
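-    // At this point avcl has been traced back to its owning Decoder through
-    // the opaque pointers installed in Decoder::init().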
char buf[kLogBufferSize] = {0}; - // Format the line - int* prefix = decoder->getPrintPrefix(); - *prefix = 1; - av_log_format_line(avcl, level, cfmt, vl, buf, sizeof(buf) - 1, prefix); - // pass message to the decoder instance - std::string msg(buf); - decoder->logCallback(level, msg); - } -} - -bool Decoder::enableLogLevel(int level) const { - return ssize_t(level) <= params_.logLevel; -} - -void Decoder::logCallback(int level, const std::string& message) { - LOG(INFO) << "Msg, uuid=" << params_.loggingUuid << " level=" << level - << " msg=" << message; -} - -/* static */ -int Decoder::shutdownFunction(void* ctx) { - Decoder* decoder = (Decoder*)ctx; - if (decoder == nullptr) { - return 1; - } - return decoder->shutdownCallback(); -} - -int Decoder::shutdownCallback() { - return interrupted_ ? 1 : 0; -} - -/* static */ -int Decoder::readFunction(void* opaque, uint8_t* buf, int size) { - Decoder* decoder = reinterpret_cast(opaque); - if (decoder == nullptr) { - return 0; - } - return decoder->readCallback(buf, size); -} - -/* static */ -int64_t Decoder::seekFunction(void* opaque, int64_t offset, int whence) { - Decoder* decoder = reinterpret_cast(opaque); - if (decoder == nullptr) { - return -1; - } - return decoder->seekCallback(offset, whence); -} - -int Decoder::readCallback(uint8_t* buf, int size) { - return seekableBuffer_.read(buf, size, params_.timeoutMs); -} - -int64_t Decoder::seekCallback(int64_t offset, int whence) { - return seekableBuffer_.seek(offset, whence, params_.timeoutMs); -} - -/* static */ -void Decoder::initOnce() { - static std::once_flag flagInit; - std::call_once(flagInit, []() { -#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0 - av_register_all(); - avcodec_register_all(); -#endif - avformat_network_init(); - av_log_set_callback(Decoder::logFunction); - av_log_set_level(AV_LOG_ERROR); - VLOG(1) << "Registered ffmpeg libs"; - }); -} - -Decoder::Decoder() { - initOnce(); -} - -Decoder::~Decoder() { - cleanUp(); -} - -// Initialise the format context that holds information about the container and -// fill it with minimal information about the format (codecs are not opened -// here). Function reads in information about the streams from the container -// into inputCtx and then passes it to decoder::openStreams. Finally, if seek is -// specified within the decoder parameters, it seeks into the correct frame -// (note, the seek defined here is "precise" seek). -bool Decoder::init( - const DecoderParameters& params, - DecoderInCallback&& in, - std::vector* metadata) { - cleanUp(); - - if ((params.uri.empty() || in) && (!params.uri.empty() || !in)) { - LOG(ERROR) - << "uuid=" << params_.loggingUuid - << " either external URI gets provided or explicit input callback"; - return false; - } - - // set callback and params - params_ = params; - - if (!(inputCtx_ = avformat_alloc_context())) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " cannot allocate format context"; - return false; - } - - AVInputFormat* fmt = nullptr; - int result = 0; - if (in) { - ImageType type = ImageType::UNKNOWN; - if ((result = seekableBuffer_.init( - std::forward(in), - params_.timeoutMs, - params_.maxSeekableBytes, - params_.isImage ? 
&type : nullptr)) < 0) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " can't initiate seekable buffer"; - cleanUp(); - return false; - } - - if (params_.isImage) { - const char* fmtName = "image2"; - switch (type) { - case ImageType::JPEG: - fmtName = "jpeg_pipe"; - break; - case ImageType::PNG: - fmtName = "png_pipe"; - break; - case ImageType::TIFF: - fmtName = "tiff_pipe"; - break; - default: - break; - } - - fmt = (AVInputFormat*)av_find_input_format(fmtName); - } - - const size_t avioCtxBufferSize = kIoBufferSize; - uint8_t* avioCtxBuffer = - (uint8_t*)av_malloc(avioCtxBufferSize + kIoPaddingSize); - if (!avioCtxBuffer) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " av_malloc cannot allocate " << avioCtxBufferSize - << " bytes"; - cleanUp(); - return false; - } - - if (!(avioCtx_ = avio_alloc_context( - avioCtxBuffer, - avioCtxBufferSize, - 0, - reinterpret_cast(this), - &Decoder::readFunction, - nullptr, - result == 1 ? &Decoder::seekFunction : nullptr))) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " avio_alloc_context failed"; - av_free(avioCtxBuffer); - cleanUp(); - return false; - } - - avioCtx_->max_packet_size = params.maxEncodedBufferSize; - - inputCtx_->pb = avioCtx_; - inputCtx_->flags |= AVFMT_FLAG_CUSTOM_IO; - } - - inputCtx_->opaque = reinterpret_cast(this); - inputCtx_->interrupt_callback.callback = Decoder::shutdownFunction; - inputCtx_->interrupt_callback.opaque = reinterpret_cast(this); - - // add network timeout - inputCtx_->flags |= AVFMT_FLAG_NONBLOCK; - - AVDictionary* options = nullptr; - if (params_.listen) { - av_dict_set_int(&options, "listen", 1, 0); - } - if (params_.timeoutMs > 0) { - av_dict_set_int(&options, "analyzeduration", params_.timeoutMs * 1000, 0); - av_dict_set_int(&options, "stimeout", params_.timeoutMs * 1000, 0); - av_dict_set_int(&options, "rw_timeout", params_.timeoutMs * 1000, 0); - if (!params_.tlsCertFile.empty()) { - av_dict_set(&options, "cert_file", params_.tlsCertFile.data(), 0); - } - if (!params_.tlsKeyFile.empty()) { - av_dict_set(&options, "key_file", params_.tlsKeyFile.data(), 0); - } - } - - av_dict_set_int(&options, "probesize", params_.probeSize, 0); - - interrupted_ = false; - - // ffmpeg avformat_open_input call can hang if media source doesn't respond - // set a guard for handle such situations, if requested - std::promise p; - std::future f = p.get_future(); - std::unique_ptr guard; - if (params_.preventStaleness) { - guard = std::make_unique([&f, this]() { - auto timeout = std::chrono::milliseconds(params_.timeoutMs); - if (std::future_status::timeout == f.wait_for(timeout)) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " cannot open stream within " << params_.timeoutMs - << " ms"; - interrupted_ = true; - } - }); - } - - if (fmt) { - result = avformat_open_input(&inputCtx_, nullptr, fmt, &options); - } else { - result = - avformat_open_input(&inputCtx_, params_.uri.c_str(), nullptr, &options); - } - - av_dict_free(&options); - - if (guard) { - p.set_value(true); - guard->join(); - guard.reset(); - } - - if (result < 0 || interrupted_) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " avformat_open_input failed, error=" - << Util::generateErrorDesc(result); - cleanUp(); - return false; - } - - result = avformat_find_stream_info(inputCtx_, nullptr); - - if (result < 0) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " avformat_find_stream_info failed, error=" - << Util::generateErrorDesc(result); - cleanUp(); - return false; - } - - if (!openStreams(metadata)) { - LOG(ERROR) << 
"uuid=" << params_.loggingUuid << " cannot activate streams"; - cleanUp(); - return false; - } - // SyncDecoder inherits Decoder which would override onInit. - onInit(); - - if (params.startOffset != 0) { - auto offset = params.startOffset <= params.seekAccuracy - ? 0 - : params.startOffset - params.seekAccuracy; - - av_seek_frame(inputCtx_, -1, offset, AVSEEK_FLAG_BACKWARD); - } - - for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) { - if ( -#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0 - inputCtx_->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO -#else // FFMPEG 4.0+ - inputCtx_->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO -#endif - && inputCtx_->streams[i]->duration > 0) { - // There is at least two 1/r_frame_rates from the frame before the last - // one until the video duration, let's prefer to set duration after the - // frame before the last one, but as early as possible - double correction = 2 * inputCtx_->streams[i]->r_frame_rate.den / - (double)inputCtx_->streams[i]->r_frame_rate.num - - 1 / (double)AV_TIME_BASE; - videoDurationMs_ = 1000 * inputCtx_->streams[i]->duration * - inputCtx_->streams[i]->time_base.num / - (double)inputCtx_->streams[i]->time_base.den - - 1000 * correction; - break; - } - } - - VLOG(1) << "Decoder initialized, log level: " << params_.logLevel; - VLOG(1) << "Video duration: " << videoDurationMs_; - return true; -} - -// open appropriate CODEC for every type of stream and move it to the class -// variable `streams_` and make sure it is in range for decoding -bool Decoder::openStreams(std::vector* metadata) { - for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) { - // - find the corespondent format at params_.formats set - MediaFormat format; -#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0 - const auto media = inputCtx_->streams[i]->codec->codec_type; -#else // FFMPEG 4.0+ - const auto media = inputCtx_->streams[i]->codecpar->codec_type; -#endif - if (!mapFfmpegType(media, &format.type)) { - VLOG(1) << "Stream media: " << media << " at index " << i - << " gets ignored, unknown type"; - - continue; // unsupported type - } - - // check format - auto it = params_.formats.find(format); - if (it == params_.formats.end()) { - VLOG(1) << "Stream type: " << format.type << " at index: " << i - << " gets ignored, caller is not interested"; - continue; // clients don't care about this media format - } - - // do we have stream of this type? - auto stream = findByType(format); - - // should we process this stream? - - if (it->stream == -2 || // all streams of this type are welcome - (!stream && (it->stream == -1 || it->stream == i))) { // new stream - VLOG(1) << "Stream type: " << format.type << " found, at index: " << i; - auto stream_2 = createStream( - format.type, - inputCtx_, - i, - params_.convertPtsToWallTime, - it->format, - params_.loggingUuid); - CHECK(stream_2); - if (stream_2->openCodec(metadata, params_.numThreads) < 0) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " open codec failed, stream_idx=" << i; - return false; - } - streams_.emplace(i, std::move(stream_2)); - inRange_.set(i, true); - } - } - - return true; -} - -void Decoder::shutdown() { - cleanUp(); -} - -void Decoder::interrupt() { - interrupted_ = true; -} - -void Decoder::cleanUp() { - if (!interrupted_) { - interrupted_ = true; - } - - if (inputCtx_) { - for (auto& stream : streams_) { - // Drain stream buffers. 
- DecoderOutputMessage msg; - while (msg.payload = nullptr, stream.second->flush(&msg, true) > 0) { - } - stream.second.reset(); - } - streams_.clear(); - avformat_close_input(&inputCtx_); - } - if (avioCtx_) { - av_freep(&avioCtx_->buffer); - av_freep(&avioCtx_); - } - - // reset callback - seekableBuffer_.shutdown(); -} - -// function does actual work, derived class calls it in working thread -// periodically. On success method returns 0, ENODATA on EOF, ETIMEDOUT if -// no frames got decoded in the specified timeout time, AVERROR_BUFFER_TOO_SMALL -// when unable to allocate packet and error on unrecoverable error -int Decoder::getFrame(size_t workingTimeInMs) { - if (inRange_.none()) { - return ENODATA; - } - // decode frames until cache is full and leave thread - // once decode() method gets called and grab some bytes - // run this method again - // init package - // update 03/22: moving memory management to ffmpeg - AVPacket* avPacket; - avPacket = av_packet_alloc(); - if (avPacket == nullptr) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " decoder as not able to allocate the packet."; - return AVERROR_BUFFER_TOO_SMALL; - } - avPacket->data = nullptr; - avPacket->size = 0; - - auto end = std::chrono::steady_clock::now() + - std::chrono::milliseconds(workingTimeInMs); - // return true if elapsed time less than timeout - auto watcher = [end]() -> bool { - return std::chrono::steady_clock::now() <= end; - }; - - int result = 0; - size_t decodingErrors = 0; - bool decodedFrame = false; - while (!interrupted_ && inRange_.any() && !decodedFrame) { - if (watcher() == false) { - LOG(ERROR) << "uuid=" << params_.loggingUuid << " hit ETIMEDOUT"; - result = ETIMEDOUT; - break; - } - result = av_read_frame(inputCtx_, avPacket); - if (result == AVERROR(EAGAIN)) { - VLOG(4) << "Decoder is busy..."; - std::this_thread::yield(); - result = 0; // reset error, EAGAIN is not an error at all - // reset the packet to default settings - av_packet_unref(avPacket); - continue; - } else if (result == AVERROR_EOF) { - flushStreams(); - VLOG(1) << "End of stream"; - result = ENODATA; - break; - } else if ( - result == AVERROR(EPERM) && params_.skipOperationNotPermittedPackets) { - // reset error, lets skip packets with EPERM - result = 0; - // reset the packet to default settings - av_packet_unref(avPacket); - continue; - } else if (result < 0) { - flushStreams(); - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " error detected: " << Util::generateErrorDesc(result); - break; - } - - // get stream; if stream cannot be found reset the packet to - // default settings - auto stream = findByIndex(avPacket->stream_index); - if (stream == nullptr || !inRange_.test(stream->getIndex())) { - av_packet_unref(avPacket); - continue; - } - - size_t numConsecutiveNoBytes = 0; - // it can be only partial decoding of the package bytes - do { - // decode package - bool gotFrame = false; - bool hasMsg = false; - // packet either got consumed completely or not at all - if ((result = processPacket( - stream, avPacket, &gotFrame, &hasMsg, params_.fastSeek)) < 0) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " processPacket failed with code: " << result; - break; - } - - if (!gotFrame && params_.maxProcessNoBytes != 0 && - ++numConsecutiveNoBytes > params_.maxProcessNoBytes) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " exceeding max amount of consecutive no bytes"; - break; - } - if (result > 0) { - numConsecutiveNoBytes = 0; - } - - decodedFrame |= hasMsg; - } while (result == 0); - - // post loop 
check - if (result < 0) { - if (params_.maxPackageErrors != 0 && // check errors - ++decodingErrors >= params_.maxPackageErrors) { // reached the limit - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " exceeding max amount of consecutive package errors"; - break; - } - } else { - decodingErrors = 0; // reset on success - } - - result = 0; - - av_packet_unref(avPacket); - - if (params_.uniformSampling > 1) { - if (doSeek_) { - double duration = - videoDurationMs_ > 0 ? videoDurationMs_ : params_.expectedDuration; - double step = - (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1)); - avformat_seek_file( - inputCtx_, - -1, - static_cast(step * kFramesDecoded_) + 1, - static_cast(step * (kFramesDecoded_ + 1)), - static_cast(step * (kFramesDecoded_ + 1)), - 0); - ++kFramesDecoded_; - doSeek_ = false; - } - } - } - - av_packet_free(&avPacket); - VLOG(2) << "Interrupted loop" << ", interrupted_ " << interrupted_ - << ", inRange_.any() " << inRange_.any() << ", decodedFrame " - << decodedFrame << ", result " << result; - - // loop can be terminated, either by: - // 1. explicitly interrupted - // 3. unrecoverable error or ENODATA (end of stream) or ETIMEDOUT (timeout) - // 4. decoded frames pts are out of the specified range - // 5. success decoded frame - if (interrupted_) { - return EINTR; - } - if (result != 0) { - return result; - } - if (inRange_.none()) { - return ENODATA; - } - return 0; -} - -// find stream by stream index -Stream* Decoder::findByIndex(int streamIndex) const { - auto it = streams_.find(streamIndex); - return it != streams_.end() ? it->second.get() : nullptr; -} - -// find stream by type; note finds only the first stream of a given type -Stream* Decoder::findByType(const MediaFormat& format) const { - for (auto& stream : streams_) { - if (stream.second->getMediaFormat().type == format.type) { - return stream.second.get(); - } - } - return nullptr; -} - -// given the stream and packet, decode the frame buffers into the -// DecoderOutputMessage data structure via stream::decodePacket function. -int Decoder::processPacket( - Stream* stream, - AVPacket* packet, - bool* gotFrame, - bool* hasMsg, - bool fastSeek) { - // decode package - int result; - DecoderOutputMessage msg; - msg.payload = params_.headerOnly ? nullptr : createByteStorage(0); - *hasMsg = false; - if ((result = stream->decodePacket( - packet, &msg, params_.headerOnly, gotFrame)) >= 0 && - *gotFrame) { - // check end offset - bool endInRange = - params_.endOffset <= 0 || msg.header.pts <= params_.endOffset; - inRange_.set(stream->getIndex(), endInRange); - // if fastseek is enabled, we're returning the first - // frame that we decode after (potential) seek. - // By default, we perform accurate seek to the closest - // following frame - bool startCondition = true; - if (!fastSeek) { - startCondition = msg.header.pts >= params_.startOffset; - } - if (endInRange && startCondition) { - *hasMsg = pushMsg(std::move(msg)); - } - } - return result; -} - -bool Decoder::pushMsg(DecoderOutputMessage&& msg) { - pastDecodedPTS_ = currentDecodedPTS_; - currentDecodedPTS_ = msg.header.pts; - - if (params_.uniformSampling <= 1) { - push(std::move(msg)); - return true; - } - - double duration = - videoDurationMs_ > 0 ? 
videoDurationMs_ : params_.expectedDuration;
-  double step =
-      (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1));
-  if (pastDecodedPTS_ < step * kFramesDecoded_ &&
-      step * kFramesDecoded_ <= currentDecodedPTS_) {
-    push(std::move(msg));
-    doSeek_ = true;
-    return true;
-  }
-
-  return false;
-}
-
-void Decoder::flushStreams() {
-  VLOG(1) << "Flushing streams...";
-  for (auto& stream : streams_) {
-    DecoderOutputMessage msg;
-    while (msg.payload = (params_.headerOnly ? nullptr : createByteStorage(0)),
-           stream.second->flush(&msg, params_.headerOnly) > 0) {
-      // check end offset
-      bool endInRange =
-          params_.endOffset <= 0 || msg.header.pts <= params_.endOffset;
-      inRange_.set(stream.second->getIndex(), endInRange);
-      if (endInRange && msg.header.pts >= params_.startOffset) {
-        pushMsg(std::move(msg));
-      } else {
-        msg.payload.reset();
-      }
-    }
-  }
-}
-
-int Decoder::decode_all(const DecoderOutCallback& callback) {
-  int result;
-  do {
-    DecoderOutputMessage out;
-    if (0 == (result = decode(&out, params_.timeoutMs))) {
-      callback(std::move(out));
-    }
-  } while (result == 0);
-  return result;
-}
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/decoder.h b/torchvision/csrc/io/decoder/decoder.h
deleted file mode 100644
index 172a011f93e..00000000000
--- a/torchvision/csrc/io/decoder/decoder.h
+++ /dev/null
@@ -1,100 +0,0 @@
-#pragma once
-
-#include <bitset>
-#include <unordered_map>
-#include "seekable_buffer.h"
-#include "stream.h"
-
-#if defined(_MSC_VER)
-#include <BaseTsd.h>
-using ssize_t = SSIZE_T;
-#endif
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode media streams.
- * Media bytes can be explicitly provided through read-callback
- * or fetched internally by FFMPEG library
- */
-class Decoder : public MediaDecoder {
- public:
-  Decoder();
-  ~Decoder() override;
-
-  // MediaDecoder overrides
-  bool init(
-      const DecoderParameters& params,
-      DecoderInCallback&& in,
-      std::vector<DecoderMetadata>* metadata) override;
-  int decode_all(const DecoderOutCallback& callback) override;
-  void shutdown() override;
-  void interrupt() override;
-
- protected:
-  // function does the actual work; the derived class calls it periodically
-  // from its working thread. On success the method returns 0, ENODATA on EOF,
-  // ETIMEDOUT if no frames got decoded in the specified timeout time, and an
-  // error code on unrecoverable error.
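-  // A derived class typically drives this from its worker thread, e.g.
-  // (sketch): while (getFrame() == 0) { /* frames arrive via push() */ }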
- int getFrame(size_t workingTimeInMs = 100); - - // Derived class must override method and consume the provided message - virtual void push(DecoderOutputMessage&& buffer) = 0; - - // Fires on init call - virtual void onInit() {} - - public: - // C-style FFMPEG API requires C/static methods for callbacks - static void logFunction(void* avcl, int level, const char* cfmt, va_list vl); - static int shutdownFunction(void* ctx); - static int readFunction(void* opaque, uint8_t* buf, int size); - static int64_t seekFunction(void* opaque, int64_t offset, int whence); - // can be called by any classes or API - static void initOnce(); - - int* getPrintPrefix() { - return &printPrefix; - } - double videoDurationMs_ = -1; - - private: - // mark below function for a proper invocation - bool enableLogLevel(int level) const; - void logCallback(int level, const std::string& message); - int readCallback(uint8_t* buf, int size); - int64_t seekCallback(int64_t offset, int whence); - int shutdownCallback(); - - bool openStreams(std::vector* metadata); - Stream* findByIndex(int streamIndex) const; - Stream* findByType(const MediaFormat& format) const; - int processPacket( - Stream* stream, - AVPacket* packet, - bool* gotFrame, - bool* hasMsg, - bool fastSeek = false); - void flushStreams(); - void cleanUp(); - bool pushMsg(DecoderOutputMessage&& - msg); // returns whether frame is passed to downstream - - protected: - DecoderParameters params_; - - private: - SeekableBuffer seekableBuffer_; - int printPrefix{1}; - - std::atomic interrupted_{false}; - AVFormatContext* inputCtx_{nullptr}; - AVIOContext* avioCtx_{nullptr}; - std::unordered_map> streams_; - std::bitset<64> inRange_; - int kFramesDecoded_{0}; - int64_t pastDecodedPTS_{-1}; - int64_t currentDecodedPTS_{-1}; - bool doSeek_{false}; -}; -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/defs.h b/torchvision/csrc/io/decoder/defs.h deleted file mode 100644 index d2dc5c7935b..00000000000 --- a/torchvision/csrc/io/decoder/defs.h +++ /dev/null @@ -1,415 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -extern "C" { -#include -#include -#include -#include -#include -#include -#include "libswscale/swscale.h" -} - -namespace ffmpeg { - -// bit mask of formats, keep them in form 2^n -enum MediaType : size_t { - TYPE_AUDIO = 1, - TYPE_VIDEO = 2, - TYPE_SUBTITLE = 4, - TYPE_CC = 8, // closed captions from transport streams -}; - -// audio -struct AudioFormat { - // fields are initialized for the auto detection - // caller can specify some/all of field values if specific output is desirable - bool operator==(const AudioFormat& x) const { - return x.format == format && x.samples == samples && x.channels == channels; - } - - size_t samples{0}; // number samples per second (frequency) - size_t channels{0}; // number of channels - long format{-1}; // AVSampleFormat, auto AV_SAMPLE_FMT_NONE - size_t padding[2]; - // -- alignment 40 bytes -}; - -// video -struct VideoFormat { - // fields are initialized for the auto detection - // caller can specify some/all of field values if specific output is desirable - bool operator==(const VideoFormat& x) const { - return x.format == format && x.width == width && x.height == height; - } - /* - When width = 0, height = 0, minDimension = 0, and maxDimension = 0, - keep the original frame resolution - When width = 0, height = 0, minDimension != 0, and maxDimension = 0, - keep the aspect ratio and resize the frame so that shorter edge size is - minDimension - When width = 0, 
height = 0, minDimension = 0, and maxDimension != 0, - keep the aspect ratio and resize the frame so that longer edge size is - maxDimension - When width = 0, height = 0, minDimension != 0, and maxDimension != 0, - resize the frame so that shorter edge size is minDimension, and - longer edge size is maxDimension. The aspect ratio may not be preserved - When width = 0, height != 0, minDimension = 0, and maxDimension = 0, - keep the aspect ratio and resize the frame so that frame height is $height - When width != 0, height = 0, minDimension = 0, and maxDimension = 0, - keep the aspect ratio and resize the frame so that frame width is $width - When width != 0, height != 0, minDimension = 0, and maxDimension = 0, - resize the frame so that frame width and height are set to $width and - $height, - respectively - */ - size_t width{0}; // width in pixels - size_t height{0}; // height in pixels - long format{-1}; // AVPixelFormat, auto AV_PIX_FMT_NONE - size_t minDimension{0}; // choose min dimension and rescale accordingly - size_t maxDimension{0}; // choose max dimension and rescale accordingly - size_t cropImage{0}; // request image crop - // -- alignment 40 bytes -}; - -// subtitle/cc -struct SubtitleFormat { - long type{0}; // AVSubtitleType, auto SUBTITLE_NONE - size_t padding[4]; - // -- alignment 40 bytes -}; - -union FormatUnion { - FormatUnion() : audio() {} - explicit FormatUnion(int) : video() {} - explicit FormatUnion(char) : subtitle() {} - explicit FormatUnion(double) : subtitle() {} - AudioFormat audio; - VideoFormat video; - SubtitleFormat subtitle; - // -- alignment 40 bytes -}; - -/* - MediaFormat data structure serves as input/output parameter. - Caller assigns values for input formats - or leave default values for auto detection - For output formats all fields will be set to the specific values -*/ -struct MediaFormat { - // for using map/set data structures - bool operator<(const MediaFormat& x) const { - return type < x.type; - } - bool operator==(const MediaFormat& x) const { - if (type != x.type) { - return false; - } - switch (type) { - case TYPE_AUDIO: - return format.audio == x.format.audio; - case TYPE_VIDEO: - return format.video == x.format.video; - case TYPE_SUBTITLE: - case TYPE_CC: - return true; - default: - return false; - } - } - - explicit MediaFormat(long s = -1) : type(TYPE_AUDIO), stream(s), format() {} - explicit MediaFormat(int x, long s = -1) - : type(TYPE_VIDEO), stream(s), format(x) {} - explicit MediaFormat(char x, long s = -1) - : type(TYPE_SUBTITLE), stream(s), format(x) {} - explicit MediaFormat(double x, long s = -1) - : type(TYPE_CC), stream(s), format(x) {} - - static MediaFormat makeMediaFormat(AudioFormat format, long stream) { - MediaFormat result(stream); - result.format.audio = format; - return result; - } - - static MediaFormat makeMediaFormat(VideoFormat format, long stream) { - MediaFormat result(0, stream); - result.format.video = format; - return result; - } - - static MediaFormat makeMediaFormat(SubtitleFormat format, long stream) { - MediaFormat result('0', stream); - result.format.subtitle = format; - return result; - } - - // format type - MediaType type; - // stream index: - // set -1 for one stream auto detection, -2 for all streams auto detection, - // >= 0, specified stream, if caller knows the stream index (unlikely) - long stream; - // union keeps one of the possible formats, defined by MediaType - FormatUnion format; -}; - -struct DecoderParameters { - // local file, remote file, http url, rtmp stream uri, etc. 
anything that
-  // ffmpeg can recognize
-  std::string uri{std::string()};
-  // timeout on getting bytes for decoding
-  size_t timeoutMs{1000};
-  // logging level, default AV_LOG_PANIC
-  long logLevel{0};
-  // number of packet errors after which the decoder gives up, 0 means never
-  size_t maxPackageErrors{0};
-  // max allowed consecutive times no bytes are processed. 0 means infinite.
-  size_t maxProcessNoBytes{0};
-  // start offset (us)
-  long startOffset{0};
-  // end offset (us)
-  long endOffset{-1};
-  // logging id
-  int64_t loggingUuid{0};
-  // internal max seekable buffer size
-  size_t maxSeekableBytes{0};
-  // adjust header pts to the epoch time
-  bool convertPtsToWallTime{false};
-  // indicate if input stream is an encoded image
-  bool isImage{false};
-  // listen and wait for new rtmp stream
-  bool listen{false};
-  // don't copy frame body, only header
-  bool headerOnly{false};
-  // enable fast seek (seek only to keyframes)
-  bool fastSeek{false};
-  // interrupt init method on timeout
-  bool preventStaleness{true};
-  // seek tolerated accuracy (us)
-  double seekAccuracy{1000000.0};
-  // Allow multithreaded decoding for numThreads > 1;
-  // numThreads = 0 sets up sensible defaults
-  int numThreads{1};
-  // what media types should be processed, default none
-  std::set<MediaType> formats;
-
-  // can be used for asynchronous decoders
-  size_t cacheSize{8192}; // how many bytes to cache before we stop reading
-  size_t cacheTimeoutMs{1000}; // timeout on bytes writing
-  bool enforceCacheSize{false}; // drop output frames if cache is full
-  bool mergeAudioMessages{false}; // combine collocated audio messages together
-
-  std::string tlsCertFile;
-  std::string tlsKeyFile;
-
-  // Skip packets that fail with EPERM errors and continue decoding.
-  bool skipOperationNotPermittedPackets{false};
-
-  // probing size in bytes, i.e. the size of the data to analyze to get stream
-  // information. A higher value will enable detecting more information in case
-  // it is dispersed into the stream, but will increase latency. Must be an
-  // integer not less than 32. It is 5000000 by default.
-  int64_t probeSize{5000000};
-
-  // Expected duration of the video to be decoded, mainly used with uniform
-  // sampling
-  float expectedDuration{0.0f};
-
-  // Sample N key-frames from the video roughly uniformly across the timeline
-  int uniformSampling{0};
-
-  // with 0, ffmpeg allocates buffers of size 32768 bytes for encoded frames.
-  // Override this with a bigger buffer size if needed.
-  int64_t maxEncodedBufferSize{0};
-};
-
-struct DecoderHeader {
-  // message id, from 0 till ...
-  size_t seqno{0};
-  // decoded timestamp in microseconds from either beginning of the stream or
-  // from epoch time, see DecoderParameters::convertPtsToWallTime
-  long pts{0};
-  // decoded key frame
-  size_t keyFrame{0};
-  // frames per second, valid only for video streams
-  double fps{0};
-  // format specifies what kind of frame is in the payload
-  MediaFormat format;
-};
-
-// Abstract ByteStorage interface
-class ByteStorage {
- public:
-  virtual ~ByteStorage() = default;
-  // makes sure that buffer has at least n bytes available for writing; if not,
-  // storage must reallocate memory.
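-  // A typical producer cycle (a sketch; fillBytes stands in for whatever
-  // hypothetical source supplies the bytes):
-  //   storage->ensure(n);
-  //   size_t written = fillBytes(storage->writableTail(), n);
-  //   storage->append(written);
-  // A consumer mirrors this with data()/length() and trim().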
-  virtual void ensure(size_t n) = 0;
-  // caller must not write more than the available bytes
-  virtual uint8_t* writableTail() = 0;
-  // caller confirms that n bytes were written to the writable tail
-  virtual void append(size_t n) = 0;
-  // caller confirms that n bytes were read from the read buffer
-  virtual void trim(size_t n) = 0;
-  // gives access to the beginning of the read buffer
-  virtual const uint8_t* data() const = 0;
-  // returns the stored size in bytes
-  virtual size_t length() const = 0;
-  // returns available capacity for the writable tail
-  virtual size_t tail() const = 0;
-  // clears content, keeps capacity
-  virtual void clear() = 0;
-};
-
-struct DecoderOutputMessage {
-  DecoderHeader header;
-  std::unique_ptr<ByteStorage> payload;
-};
-
-/*
- * External provider of the encoded bytes; the specific implementation is left
- * for different use cases, like file, memory, external network end-points,
- * etc. Normally the input/output parameter @out is set to a valid, non-null
- * buffer pointer, which indicates a "read" call; however there are "seek"
- * modes as well.
-
- * @out != nullptr => read from the current offset, @whence is ignored,
- * @size bytes to read => returns the number of bytes read, 0 if no more bytes
- * are available, < 0 on error.
-
- * @out == nullptr, @timeoutMs == 0 => does the provider support "seek"
- * capability in the first place? @size & @whence are ignored; returns 0 on
- * success, < 0 if "seek" mode is not supported.
-
- * @out == nullptr, @timeoutMs != 0 => normal seek call
- * offset == @size, i.e. @whence = [SEEK_SET, SEEK_CUR, SEEK_END, AVSEEK_SIZE)
- * returns < 0 on error, the position if @whence = [SEEK_SET, SEEK_CUR,
- * SEEK_END], the length of the buffer if @whence = [AVSEEK_SIZE].
- */
-using DecoderInCallback =
-    std::function<int(uint8_t* out, int size, int whence, uint64_t timeoutMs)>;
-
-using DecoderOutCallback = std::function<void(DecoderOutputMessage&&)>;
-
-struct DecoderMetadata {
-  // time base numerator
-  long num{0};
-  // time base denominator
-  long den{1};
-  // duration of the stream, in microseconds, if available
-  long duration{-1};
-  // frames per second, valid only for video streams
-  double fps{0};
-  // format specifies what kind of frame is in the payload
-  MediaFormat format;
-};
-/**
- * Abstract class for decoding media bytes.
- * It has two different modes: internal media-bytes retrieval for a given uri,
- * and an external media-bytes provider in the case of memory streams.
- */
-class MediaDecoder {
- public:
-  virtual ~MediaDecoder() = default;
-
-  /**
-   * Initializes the media decoder with parameters and
-   * calls the callback when media bytes are available.
-   * Media bytes are fetched internally from the provided URI,
-   * or by invoking the provided input callback.
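-   * (For in-memory sources, MemoryBuffer::getCallback in memory_buffer.h
-   * builds a callback with these semantics.)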
-   * The input callback must be empty for the internal media provider.
-   * The caller can provide a non-null metadata container to obtain the
-   * streams' metadata (optional).
-   */
-  virtual bool init(
-      const DecoderParameters& params,
-      DecoderInCallback&& in,
-      std::vector<DecoderMetadata>* metadata) = 0;
-
-  /**
-   * Polls one decoded frame from the decoder.
-   * Returns an error code; 0 means success.
-   */
-  virtual int decode(DecoderOutputMessage* out, uint64_t timeoutMs) = 0;
-
-  /**
-   * Polls available decoded bytes from the decoder, until EOF or error.
-   */
-  virtual int decode_all(const DecoderOutCallback& callback) = 0;
-
-  /**
-   * Stops calling the callback, releases resources.
-   */
-  virtual void shutdown() = 0;
-
-  /**
-   * Interrupts whatever the decoder is doing, at any time.
-   */
-  virtual void interrupt() = 0;
-
-  /**
-   * Factory to create ByteStorage class instances; the particular
-   * implementation is left to the derived class. The caller provides the
-   * initially allocated size.
-   */
-  virtual std::unique_ptr<ByteStorage> createByteStorage(size_t n) = 0;
-};
-
-struct SamplerParameters {
-  MediaType type{TYPE_AUDIO};
-  FormatUnion in;
-  FormatUnion out;
-  int64_t loggingUuid{0};
-};
-
-/**
- * Abstract class for sampling media bytes
- */
-class MediaSampler {
- public:
-  virtual ~MediaSampler() = default;
-
-  /**
-   * Initializes the media sampler with parameters
-   */
-  virtual bool init(const SamplerParameters& params) = 0;
-
-  /**
-   * Samples media bytes.
-   * Returns an error code < 0, or >= 0 for success, indicating the number of
-   * bytes processed.
-   * Set @in to null to flush data.
-   */
-  virtual int sample(const ByteStorage* in, ByteStorage* out) = 0;
-
-  /**
-   * Releases resources
-   */
-  virtual void shutdown() = 0;
-
-  /*
-   * Returns media type
-   */
-  MediaType getMediaType() const {
-    return params_.type;
-  }
-  /*
-   * Returns formats
-   */
-  FormatUnion getInputFormat() const {
-    return params_.in;
-  }
-  FormatUnion getOutFormat() const {
-    return params_.out;
-  }
-
- protected:
-  SamplerParameters params_;
-};
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/gpu/README.rst b/torchvision/csrc/io/decoder/gpu/README.rst
deleted file mode 100644
index e4573d7fe75..00000000000
--- a/torchvision/csrc/io/decoder/gpu/README.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-GPU Decoder
-===========
-
-The GPU decoder depends on ffmpeg for demuxing, uses the NVDECODE APIs from the nvidia-video-codec sdk, and uses CUDA for processing on the GPU. To use it, follow these steps:
-
-* Download the latest `nvidia-video-codec-sdk `_
-* Extract the zipped file.
-* Set the TORCHVISION_INCLUDE environment variable to the location of the video codec headers (`nvcuvid.h` and `cuviddec.h`), which are under the `Interface` directory.
-* Set the TORCHVISION_LIBRARY environment variable to the location of the video codec library (`libnvcuvid.so`), which is under the `Lib/linux/stubs/x86_64` directory.
-* Install the latest ffmpeg from the `conda-forge` channel.
-
-.. code:: bash
-
-   conda install -c conda-forge ffmpeg
-
-* Set the CUDA_HOME environment variable to the CUDA root directory.
-* Build torchvision from source:
-
-.. code:: bash
-
-   pip install .
-v --no-build-isolation diff --git a/torchvision/csrc/io/decoder/gpu/decoder.cpp b/torchvision/csrc/io/decoder/gpu/decoder.cpp deleted file mode 100644 index f7377ede38b..00000000000 --- a/torchvision/csrc/io/decoder/gpu/decoder.cpp +++ /dev/null @@ -1,405 +0,0 @@ -#include "decoder.h" -#include -#include -#include -#include -#include - -static float chroma_height_factor(cudaVideoSurfaceFormat surface_format) { - return (surface_format == cudaVideoSurfaceFormat_YUV444 || - surface_format == cudaVideoSurfaceFormat_YUV444_16Bit) - ? 1.0 - : 0.5; -} - -static int chroma_plane_count(cudaVideoSurfaceFormat surface_format) { - return (surface_format == cudaVideoSurfaceFormat_YUV444 || - surface_format == cudaVideoSurfaceFormat_YUV444_16Bit) - ? 2 - : 1; -} - -/* Initialise cu_context and video_codec, create context lock and create parser - * object. - */ -void Decoder::init(CUcontext context, cudaVideoCodec codec) { - cu_context = context; - video_codec = codec; - check_for_cuda_errors( - cuvidCtxLockCreate(&ctx_lock, cu_context), __LINE__, __FILE__); - - CUVIDPARSERPARAMS parser_params = {}; - parser_params.CodecType = codec; - parser_params.ulMaxNumDecodeSurfaces = 1; - parser_params.ulClockRate = 1000; - parser_params.ulMaxDisplayDelay = 0u; - parser_params.pUserData = this; - parser_params.pfnSequenceCallback = video_sequence_handler; - parser_params.pfnDecodePicture = picture_decode_handler; - parser_params.pfnDisplayPicture = picture_display_handler; - parser_params.pfnGetOperatingPoint = operating_point_handler; - - check_for_cuda_errors( - cuvidCreateVideoParser(&parser, &parser_params), __LINE__, __FILE__); -} - -/* Destroy parser object and context lock. - */ -Decoder::~Decoder() { - if (parser) { - cuvidDestroyVideoParser(parser); - } - cuvidCtxLockDestroy(ctx_lock); -} - -/* Destroy CUvideodecoder object and free up all the unreturned decoded frames. - */ -void Decoder::release() { - cuCtxPushCurrent(cu_context); - if (decoder) { - cuvidDestroyDecoder(decoder); - } - cuCtxPopCurrent(nullptr); -} - -/* Trigger video decoding. - */ -void Decoder::decode(const uint8_t* data, unsigned long size) { - CUVIDSOURCEDATAPACKET pkt = {}; - pkt.flags = CUVID_PKT_TIMESTAMP; - pkt.payload_size = size; - pkt.payload = data; - pkt.timestamp = 0; - if (!data || size == 0) { - pkt.flags |= CUVID_PKT_ENDOFSTREAM; - } - check_for_cuda_errors(cuvidParseVideoData(parser, &pkt), __LINE__, __FILE__); - cuvidStream = 0; -} - -/* Fetch a decoded frame and remove it from the queue. - */ -torch::Tensor Decoder::fetch_frame() { - if (decoded_frames.empty()) { - auto options = - torch::TensorOptions().dtype(torch::kU8).device(torch::kCUDA); - return torch::zeros({0}, options); - } - torch::Tensor frame = decoded_frames.front(); - decoded_frames.pop(); - return frame; -} - -/* Called when a picture is ready to be decoded. - */ -int Decoder::handle_picture_decode(CUVIDPICPARAMS* pic_params) { - if (!decoder) { - TORCH_CHECK(false, "Uninitialised decoder"); - } - pic_num_in_decode_order[pic_params->CurrPicIdx] = decode_pic_count++; - check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__); - check_for_cuda_errors( - cuvidDecodePicture(decoder, pic_params), __LINE__, __FILE__); - check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); - return 1; -} - -/* Process the decoded data and copy it to a cuda memory location. 
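- * The mapped surface is NV12: a full-resolution luma (Y) plane followed by
- * an interleaved, half-resolution chroma (UV) plane. The second plane starts
- * source_pitch * surface_height bytes in (height rounded up to even), which
- * is what the source_arr offset below computes before NPP converts the two
- * planes to packed RGB.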
- */ -int Decoder::handle_picture_display(CUVIDPARSERDISPINFO* disp_info) { - CUVIDPROCPARAMS proc_params = {}; - proc_params.progressive_frame = disp_info->progressive_frame; - proc_params.second_field = disp_info->repeat_first_field + 1; - proc_params.top_field_first = disp_info->top_field_first; - proc_params.unpaired_field = disp_info->repeat_first_field < 0; - proc_params.output_stream = cuvidStream; - - CUdeviceptr source_frame = 0; - unsigned int source_pitch = 0; - check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__); - check_for_cuda_errors( - cuvidMapVideoFrame( - decoder, - disp_info->picture_index, - &source_frame, - &source_pitch, - &proc_params), - __LINE__, - __FILE__); - - CUVIDGETDECODESTATUS decode_status; - memset(&decode_status, 0, sizeof(decode_status)); - CUresult result = - cuvidGetDecodeStatus(decoder, disp_info->picture_index, &decode_status); - if (result == CUDA_SUCCESS && - (decode_status.decodeStatus == cuvidDecodeStatus_Error || - decode_status.decodeStatus == cuvidDecodeStatus_Error_Concealed)) { - VLOG(1) << "Decode Error occurred for picture " - << pic_num_in_decode_order[disp_info->picture_index]; - } - - auto options = torch::TensorOptions().dtype(torch::kU8).device(torch::kCUDA); - torch::Tensor decoded_frame = torch::empty({get_height(), width, 3}, options); - uint8_t* frame_ptr = decoded_frame.data_ptr(); - const uint8_t* const source_arr[] = { - (const uint8_t* const)source_frame, - (const uint8_t* const)(source_frame + - source_pitch * ((surface_height + 1) & ~1))}; - - auto err = nppiNV12ToRGB_709CSC_8u_P2C3R( - source_arr, - source_pitch, - frame_ptr, - width * 3, - {(int)decoded_frame.size(1), (int)decoded_frame.size(0)}); - - TORCH_CHECK( - err == NPP_NO_ERROR, - "Failed to convert from NV12 to RGB. Error code:", - err); - - check_for_cuda_errors(cuStreamSynchronize(cuvidStream), __LINE__, __FILE__); - decoded_frames.push(decoded_frame); - check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); - - check_for_cuda_errors( - cuvidUnmapVideoFrame(decoder, source_frame), __LINE__, __FILE__); - return 1; -} - -/* Query the capabilities of the underlying hardware video decoder and - * verify if the hardware supports decoding the passed video. 
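- * The checks below compare the stream against the reported decode_caps:
- * codec support, maximum width/height, and the macroblock count, computed as
- * (coded_width / 16) * (coded_height / 16) via the >> 4 shifts; finally the
- * requested output surface format, with NV12/P016/YUV444 fallbacks.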
- */ -void Decoder::query_hardware(CUVIDEOFORMAT* video_format) { - CUVIDDECODECAPS decode_caps = {}; - decode_caps.eCodecType = video_format->codec; - decode_caps.eChromaFormat = video_format->chroma_format; - decode_caps.nBitDepthMinus8 = video_format->bit_depth_luma_minus8; - - check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__); - check_for_cuda_errors(cuvidGetDecoderCaps(&decode_caps), __LINE__, __FILE__); - check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); - - if (!decode_caps.bIsSupported) { - TORCH_CHECK(false, "Codec not supported on this GPU"); - } - if ((video_format->coded_width > decode_caps.nMaxWidth) || - (video_format->coded_height > decode_caps.nMaxHeight)) { - TORCH_CHECK( - false, - "Resolution : ", - video_format->coded_width, - "x", - video_format->coded_height, - "\nMax Supported (wxh) : ", - decode_caps.nMaxWidth, - "x", - decode_caps.nMaxHeight, - "\nResolution not supported on this GPU"); - } - if ((video_format->coded_width >> 4) * (video_format->coded_height >> 4) > - decode_caps.nMaxMBCount) { - TORCH_CHECK( - false, - "MBCount : ", - (video_format->coded_width >> 4) * (video_format->coded_height >> 4), - "\nMax Supported mbcnt : ", - decode_caps.nMaxMBCount, - "\nMBCount not supported on this GPU"); - } - // Check if output format supported. If not, check fallback options - if (!(decode_caps.nOutputFormatMask & (1 << video_output_format))) { - if (decode_caps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12)) { - video_output_format = cudaVideoSurfaceFormat_NV12; - } else if ( - decode_caps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_P016)) { - video_output_format = cudaVideoSurfaceFormat_P016; - } else if ( - decode_caps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444)) { - video_output_format = cudaVideoSurfaceFormat_YUV444; - } else if ( - decode_caps.nOutputFormatMask & - (1 << cudaVideoSurfaceFormat_YUV444_16Bit)) { - video_output_format = cudaVideoSurfaceFormat_YUV444_16Bit; - } else { - TORCH_CHECK(false, "No supported output format found"); - } - } -} - -/* Called before decoding frames and/or whenever there is a configuration - * change. - */ -int Decoder::handle_video_sequence(CUVIDEOFORMAT* video_format) { - // video_codec has been set in init(). Here it's set - // again for potential correction. - video_codec = video_format->codec; - video_chroma_format = video_format->chroma_format; - bit_depth_minus8 = video_format->bit_depth_luma_minus8; - bytes_per_pixel = bit_depth_minus8 > 0 ? 2 : 1; - // Set the output surface format same as chroma format - switch (video_chroma_format) { - case cudaVideoChromaFormat_Monochrome: - case cudaVideoChromaFormat_420: - video_output_format = video_format->bit_depth_luma_minus8 - ? cudaVideoSurfaceFormat_P016 - : cudaVideoSurfaceFormat_NV12; - break; - case cudaVideoChromaFormat_444: - video_output_format = video_format->bit_depth_luma_minus8 - ? cudaVideoSurfaceFormat_YUV444_16Bit - : cudaVideoSurfaceFormat_YUV444; - break; - case cudaVideoChromaFormat_422: - video_output_format = cudaVideoSurfaceFormat_NV12; - } - - query_hardware(video_format); - - if (width && luma_height && chroma_height) { - // cuvidCreateDecoder() has been called before and now there's possible - // config change. 
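-    // reconfigure_decoder() below only handles resolution and display-area
-    // changes; bit-depth or chroma-format changes are rejected outright.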
- return reconfigure_decoder(video_format); - } - - cu_video_format = *video_format; - unsigned long decode_surface = video_format->min_num_decode_surfaces; - cudaVideoDeinterlaceMode deinterlace_mode = cudaVideoDeinterlaceMode_Adaptive; - - if (video_format->progressive_sequence) { - deinterlace_mode = cudaVideoDeinterlaceMode_Weave; - } - - CUVIDDECODECREATEINFO video_decode_create_info = {}; - video_decode_create_info.ulWidth = video_format->coded_width; - video_decode_create_info.ulHeight = video_format->coded_height; - video_decode_create_info.ulNumDecodeSurfaces = decode_surface; - video_decode_create_info.CodecType = video_format->codec; - video_decode_create_info.ChromaFormat = video_format->chroma_format; - // With PreferCUVID, JPEG is still decoded by CUDA while video is decoded - // by NVDEC hardware - video_decode_create_info.ulCreationFlags = cudaVideoCreate_PreferCUVID; - video_decode_create_info.bitDepthMinus8 = video_format->bit_depth_luma_minus8; - video_decode_create_info.OutputFormat = video_output_format; - video_decode_create_info.DeinterlaceMode = deinterlace_mode; - video_decode_create_info.ulNumOutputSurfaces = 2; - video_decode_create_info.vidLock = ctx_lock; - - // AV1 has max width/height of sequence in sequence header - if (video_format->codec == cudaVideoCodec_AV1 && - video_format->seqhdr_data_length > 0) { - CUVIDEOFORMATEX* video_format_ex = (CUVIDEOFORMATEX*)video_format; - max_width = video_format_ex->av1.max_width; - max_height = video_format_ex->av1.max_height; - } - if (max_width < video_format->coded_width) { - max_width = video_format->coded_width; - } - if (max_height < video_format->coded_height) { - max_height = video_format->coded_height; - } - video_decode_create_info.ulMaxWidth = max_width; - video_decode_create_info.ulMaxHeight = max_height; - width = video_format->display_area.right - video_format->display_area.left; - luma_height = - video_format->display_area.bottom - video_format->display_area.top; - video_decode_create_info.ulTargetWidth = video_format->coded_width; - video_decode_create_info.ulTargetHeight = video_format->coded_height; - chroma_height = - (int)(ceil(luma_height * chroma_height_factor(video_output_format))); - num_chroma_planes = chroma_plane_count(video_output_format); - surface_height = video_decode_create_info.ulTargetHeight; - surface_width = video_decode_create_info.ulTargetWidth; - display_rect.bottom = video_decode_create_info.display_area.bottom; - display_rect.top = video_decode_create_info.display_area.top; - display_rect.left = video_decode_create_info.display_area.left; - display_rect.right = video_decode_create_info.display_area.right; - - check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__); - check_for_cuda_errors( - cuvidCreateDecoder(&decoder, &video_decode_create_info), - __LINE__, - __FILE__); - check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); - return decode_surface; -} - -int Decoder::reconfigure_decoder(CUVIDEOFORMAT* video_format) { - if (video_format->bit_depth_luma_minus8 != - cu_video_format.bit_depth_luma_minus8 || - video_format->bit_depth_chroma_minus8 != - cu_video_format.bit_depth_chroma_minus8) { - TORCH_CHECK(false, "Reconfigure not supported for bit depth change"); - } - if (video_format->chroma_format != cu_video_format.chroma_format) { - TORCH_CHECK(false, "Reconfigure not supported for chroma format change"); - } - - bool decode_res_change = - !(video_format->coded_width == cu_video_format.coded_width && - video_format->coded_height == 
cu_video_format.coded_height); - bool display_rect_change = - !(video_format->display_area.bottom == - cu_video_format.display_area.bottom && - video_format->display_area.top == cu_video_format.display_area.top && - video_format->display_area.left == cu_video_format.display_area.left && - video_format->display_area.right == cu_video_format.display_area.right); - - unsigned int decode_surface = video_format->min_num_decode_surfaces; - - if ((video_format->coded_width > max_width) || - (video_format->coded_height > max_height)) { - // For VP9, let driver handle the change if new width/height > - // maxwidth/maxheight - if (video_codec != cudaVideoCodec_VP9) { - TORCH_CHECK( - false, - "Reconfigure not supported when width/height > maxwidth/maxheight"); - } - return 1; - } - - if (!decode_res_change) { - // If the coded_width/coded_height hasn't changed but display resolution has - // changed, then need to update width/height for correct output without - // cropping. Example : 1920x1080 vs 1920x1088. - if (display_rect_change) { - width = - video_format->display_area.right - video_format->display_area.left; - luma_height = - video_format->display_area.bottom - video_format->display_area.top; - chroma_height = - (int)ceil(luma_height * chroma_height_factor(video_output_format)); - num_chroma_planes = chroma_plane_count(video_output_format); - } - return 1; - } - cu_video_format.coded_width = video_format->coded_width; - cu_video_format.coded_height = video_format->coded_height; - CUVIDRECONFIGUREDECODERINFO reconfig_params = {}; - reconfig_params.ulWidth = video_format->coded_width; - reconfig_params.ulHeight = video_format->coded_height; - reconfig_params.ulTargetWidth = surface_width; - reconfig_params.ulTargetHeight = surface_height; - reconfig_params.ulNumDecodeSurfaces = decode_surface; - reconfig_params.display_area.bottom = display_rect.bottom; - reconfig_params.display_area.top = display_rect.top; - reconfig_params.display_area.left = display_rect.left; - reconfig_params.display_area.right = display_rect.right; - - check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__); - check_for_cuda_errors( - cuvidReconfigureDecoder(decoder, &reconfig_params), __LINE__, __FILE__); - check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); - - return decode_surface; -} - -/* Called from AV1 sequence header to get operating point of an AV1 bitstream. - */ -int Decoder::get_operating_point(CUVIDOPERATINGPOINTINFO* oper_point_info) { - return oper_point_info->codec == cudaVideoCodec_AV1 && - oper_point_info->av1.operating_points_cnt > 1 - ? 
0 - : -1; -} diff --git a/torchvision/csrc/io/decoder/gpu/decoder.h b/torchvision/csrc/io/decoder/gpu/decoder.h deleted file mode 100644 index 5ad685ec746..00000000000 --- a/torchvision/csrc/io/decoder/gpu/decoder.h +++ /dev/null @@ -1,89 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -static auto check_for_cuda_errors = - [](CUresult result, int line_num, std::string file_name) { - if (CUDA_SUCCESS != result) { - const char* error_name = nullptr; - - TORCH_CHECK( - CUDA_SUCCESS != cuGetErrorName(result, &error_name), - "CUDA error: ", - error_name, - " in ", - file_name, - " at line ", - line_num) - TORCH_CHECK( - false, "Error: ", result, " in ", file_name, " at line ", line_num); - } - }; - -struct Rect { - int left, top, right, bottom; -}; - -class Decoder { - public: - Decoder() {} - ~Decoder(); - void init(CUcontext, cudaVideoCodec); - void release(); - void decode(const uint8_t*, unsigned long); - torch::Tensor fetch_frame(); - int get_height() const { - return luma_height; - } - - private: - unsigned int width = 0, luma_height = 0, chroma_height = 0; - unsigned int surface_height = 0, surface_width = 0; - unsigned int max_width = 0, max_height = 0; - unsigned int num_chroma_planes = 0; - int bit_depth_minus8 = 0, bytes_per_pixel = 1; - int decode_pic_count = 0, pic_num_in_decode_order[32]; - std::queue decoded_frames; - CUcontext cu_context = NULL; - CUvideoctxlock ctx_lock; - CUvideoparser parser = NULL; - CUvideodecoder decoder = NULL; - CUstream cuvidStream = 0; - cudaVideoCodec video_codec = cudaVideoCodec_NumCodecs; - cudaVideoChromaFormat video_chroma_format = cudaVideoChromaFormat_420; - cudaVideoSurfaceFormat video_output_format = cudaVideoSurfaceFormat_NV12; - CUVIDEOFORMAT cu_video_format = {}; - Rect display_rect = {}; - - static int video_sequence_handler( - void* user_data, - CUVIDEOFORMAT* video_format) { - return ((Decoder*)user_data)->handle_video_sequence(video_format); - } - static int picture_decode_handler( - void* user_data, - CUVIDPICPARAMS* pic_params) { - return ((Decoder*)user_data)->handle_picture_decode(pic_params); - } - static int picture_display_handler( - void* user_data, - CUVIDPARSERDISPINFO* disp_info) { - return ((Decoder*)user_data)->handle_picture_display(disp_info); - } - static int operating_point_handler( - void* user_data, - CUVIDOPERATINGPOINTINFO* operating_info) { - return ((Decoder*)user_data)->get_operating_point(operating_info); - } - - void query_hardware(CUVIDEOFORMAT*); - int reconfigure_decoder(CUVIDEOFORMAT*); - int handle_video_sequence(CUVIDEOFORMAT*); - int handle_picture_decode(CUVIDPICPARAMS*); - int handle_picture_display(CUVIDPARSERDISPINFO*); - int get_operating_point(CUVIDOPERATINGPOINTINFO*); -}; diff --git a/torchvision/csrc/io/decoder/gpu/demuxer.h b/torchvision/csrc/io/decoder/gpu/demuxer.h deleted file mode 100644 index f6e72dceee1..00000000000 --- a/torchvision/csrc/io/decoder/gpu/demuxer.h +++ /dev/null @@ -1,257 +0,0 @@ -extern "C" { -#include -#include -#include -#include -} - -class Demuxer { - private: - AVFormatContext* fmtCtx = NULL; - AVBSFContext* bsfCtx = NULL; - AVPacket pkt, pktFiltered; - AVCodecID eVideoCodec; - uint8_t* dataWithHeader = NULL; - bool bMp4H264, bMp4HEVC, bMp4MPEG4; - unsigned int frameCount = 0; - int iVideoStream; - double timeBase = 0.0; - - public: - Demuxer(const char* filePath) { - avformat_network_init(); - TORCH_CHECK( - 0 <= avformat_open_input(&fmtCtx, filePath, NULL, NULL), - "avformat_open_input() failed at line ", - __LINE__, - " in 
demuxer.h\n"); - if (!fmtCtx) { - TORCH_CHECK( - false, - "Encountered NULL AVFormatContext at line ", - __LINE__, - " in demuxer.h\n"); - } - - TORCH_CHECK( - 0 <= avformat_find_stream_info(fmtCtx, NULL), - "avformat_find_stream_info() failed at line ", - __LINE__, - " in demuxer.h\n"); - iVideoStream = - av_find_best_stream(fmtCtx, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0); - if (iVideoStream < 0) { - TORCH_CHECK( - false, - "av_find_best_stream() failed at line ", - __LINE__, - " in demuxer.h\n"); - } - - eVideoCodec = fmtCtx->streams[iVideoStream]->codecpar->codec_id; - AVRational rTimeBase = fmtCtx->streams[iVideoStream]->time_base; - timeBase = av_q2d(rTimeBase); - - bMp4H264 = eVideoCodec == AV_CODEC_ID_H264 && - (!strcmp(fmtCtx->iformat->long_name, "QuickTime / MOV") || - !strcmp(fmtCtx->iformat->long_name, "FLV (Flash Video)") || - !strcmp(fmtCtx->iformat->long_name, "Matroska / WebM")); - bMp4HEVC = eVideoCodec == AV_CODEC_ID_HEVC && - (!strcmp(fmtCtx->iformat->long_name, "QuickTime / MOV") || - !strcmp(fmtCtx->iformat->long_name, "FLV (Flash Video)") || - !strcmp(fmtCtx->iformat->long_name, "Matroska / WebM")); - bMp4MPEG4 = eVideoCodec == AV_CODEC_ID_MPEG4 && - (!strcmp(fmtCtx->iformat->long_name, "QuickTime / MOV") || - !strcmp(fmtCtx->iformat->long_name, "FLV (Flash Video)") || - !strcmp(fmtCtx->iformat->long_name, "Matroska / WebM")); - - av_init_packet(&pkt); - pkt.data = NULL; - pkt.size = 0; - av_init_packet(&pktFiltered); - pktFiltered.data = NULL; - pktFiltered.size = 0; - - if (bMp4H264) { - const AVBitStreamFilter* bsf = av_bsf_get_by_name("h264_mp4toannexb"); - if (!bsf) { - TORCH_CHECK( - false, - "av_bsf_get_by_name() failed at line ", - __LINE__, - " in demuxer.h\n"); - } - TORCH_CHECK( - 0 <= av_bsf_alloc(bsf, &bsfCtx), - "av_bsf_alloc() failed at line ", - __LINE__, - " in demuxer.h\n"); - avcodec_parameters_copy( - bsfCtx->par_in, fmtCtx->streams[iVideoStream]->codecpar); - TORCH_CHECK( - 0 <= av_bsf_init(bsfCtx), - "av_bsf_init() failed at line ", - __LINE__, - " in demuxer.h\n"); - } - if (bMp4HEVC) { - const AVBitStreamFilter* bsf = av_bsf_get_by_name("hevc_mp4toannexb"); - if (!bsf) { - TORCH_CHECK( - false, - "av_bsf_get_by_name() failed at line ", - __LINE__, - " in demuxer.h\n"); - } - TORCH_CHECK( - 0 <= av_bsf_alloc(bsf, &bsfCtx), - "av_bsf_alloc() failed at line ", - __LINE__, - " in demuxer.h\n"); - avcodec_parameters_copy( - bsfCtx->par_in, fmtCtx->streams[iVideoStream]->codecpar); - TORCH_CHECK( - 0 <= av_bsf_init(bsfCtx), - "av_bsf_init() failed at line ", - __LINE__, - " in demuxer.h\n"); - } - } - - ~Demuxer() { - if (!fmtCtx) { - return; - } - if (pkt.data) { - av_packet_unref(&pkt); - } - if (pktFiltered.data) { - av_packet_unref(&pktFiltered); - } - if (bsfCtx) { - av_bsf_free(&bsfCtx); - } - avformat_close_input(&fmtCtx); - if (dataWithHeader) { - av_free(dataWithHeader); - } - } - - AVCodecID get_video_codec() { - return eVideoCodec; - } - - double get_duration() const { - return (double)fmtCtx->duration / AV_TIME_BASE; - } - - double get_fps() const { - return av_q2d(fmtCtx->streams[iVideoStream]->r_frame_rate); - } - - bool demux(uint8_t** video, unsigned long* videoBytes) { - if (!fmtCtx) { - return false; - } - *videoBytes = 0; - - if (pkt.data) { - av_packet_unref(&pkt); - } - int e = 0; - while ((e = av_read_frame(fmtCtx, &pkt)) >= 0 && - pkt.stream_index != iVideoStream) { - av_packet_unref(&pkt); - } - if (e < 0) { - return false; - } - - if (bMp4H264 || bMp4HEVC) { - if (pktFiltered.data) { - av_packet_unref(&pktFiltered); - } - 
TORCH_CHECK( - 0 <= av_bsf_send_packet(bsfCtx, &pkt), - "av_bsf_send_packet() failed at line ", - __LINE__, - " in demuxer.h\n"); - TORCH_CHECK( - 0 <= av_bsf_receive_packet(bsfCtx, &pktFiltered), - "av_bsf_receive_packet() failed at line ", - __LINE__, - " in demuxer.h\n"); - *video = pktFiltered.data; - *videoBytes = pktFiltered.size; - } else { - if (bMp4MPEG4 && (frameCount == 0)) { - int extraDataSize = - fmtCtx->streams[iVideoStream]->codecpar->extradata_size; - - if (extraDataSize > 0) { - dataWithHeader = (uint8_t*)av_malloc( - extraDataSize + pkt.size - 3 * sizeof(uint8_t)); - if (!dataWithHeader) { - TORCH_CHECK( - false, - "av_malloc() failed at line ", - __LINE__, - " in demuxer.h\n"); - } - memcpy( - dataWithHeader, - fmtCtx->streams[iVideoStream]->codecpar->extradata, - extraDataSize); - memcpy( - dataWithHeader + extraDataSize, - pkt.data + 3, - pkt.size - 3 * sizeof(uint8_t)); - *video = dataWithHeader; - *videoBytes = extraDataSize + pkt.size - 3 * sizeof(uint8_t); - } - } else { - *video = pkt.data; - *videoBytes = pkt.size; - } - } - frameCount++; - return true; - } - - void seek(double timestamp, int flag) { - int64_t time = timestamp * AV_TIME_BASE; - TORCH_CHECK( - 0 <= av_seek_frame(fmtCtx, -1, time, flag), - "av_seek_frame() failed at line ", - __LINE__, - " in demuxer.h\n"); - } -}; - -inline cudaVideoCodec ffmpeg_to_codec(AVCodecID id) { - switch (id) { - case AV_CODEC_ID_MPEG1VIDEO: - return cudaVideoCodec_MPEG1; - case AV_CODEC_ID_MPEG2VIDEO: - return cudaVideoCodec_MPEG2; - case AV_CODEC_ID_MPEG4: - return cudaVideoCodec_MPEG4; - case AV_CODEC_ID_WMV3: - case AV_CODEC_ID_VC1: - return cudaVideoCodec_VC1; - case AV_CODEC_ID_H264: - return cudaVideoCodec_H264; - case AV_CODEC_ID_HEVC: - return cudaVideoCodec_HEVC; - case AV_CODEC_ID_VP8: - return cudaVideoCodec_VP8; - case AV_CODEC_ID_VP9: - return cudaVideoCodec_VP9; - case AV_CODEC_ID_MJPEG: - return cudaVideoCodec_JPEG; - case AV_CODEC_ID_AV1: - return cudaVideoCodec_AV1; - default: - return cudaVideoCodec_NumCodecs; - } -} diff --git a/torchvision/csrc/io/decoder/gpu/gpu_decoder.cpp b/torchvision/csrc/io/decoder/gpu/gpu_decoder.cpp deleted file mode 100644 index 1fe3ec8ab7a..00000000000 --- a/torchvision/csrc/io/decoder/gpu/gpu_decoder.cpp +++ /dev/null @@ -1,65 +0,0 @@ -#include "gpu_decoder.h" -#include - -/* Set cuda device, create cuda context and initialise the demuxer and decoder. - */ -GPUDecoder::GPUDecoder(std::string src_file, torch::Device dev) - : demuxer(src_file.c_str()) { - at::cuda::CUDAGuard device_guard(dev); - device = device_guard.current_device().index(); - check_for_cuda_errors( - cuDevicePrimaryCtxRetain(&ctx, device), __LINE__, __FILE__); - decoder.init(ctx, ffmpeg_to_codec(demuxer.get_video_codec())); - initialised = true; -} - -GPUDecoder::~GPUDecoder() { - at::cuda::CUDAGuard device_guard(device); - decoder.release(); - if (initialised) { - check_for_cuda_errors( - cuDevicePrimaryCtxRelease(device), __LINE__, __FILE__); - } -} - -/* Fetch a decoded frame tensor after demuxing and decoding. - */ -torch::Tensor GPUDecoder::decode() { - torch::Tensor frameTensor; - unsigned long videoBytes = 0; - uint8_t* video = nullptr; - at::cuda::CUDAGuard device_guard(device); - torch::Tensor frame; - do { - demuxer.demux(&video, &videoBytes); - decoder.decode(video, videoBytes); - frame = decoder.fetch_frame(); - } while (frame.numel() == 0 && videoBytes > 0); - return frame; -} - -/* Seek to a passed timestamp. The second argument controls whether to seek to a - * keyframe. 
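- * With AVSEEK_FLAG_ANY the demuxer may land on a non-key frame, which NVDEC
- * can generally only reconstruct once the next keyframe has been decoded.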
- */ -void GPUDecoder::seek(double timestamp, bool keyframes_only) { - int flag = keyframes_only ? 0 : AVSEEK_FLAG_ANY; - demuxer.seek(timestamp, flag); -} - -c10::Dict> GPUDecoder:: - get_metadata() const { - c10::Dict> metadata; - c10::Dict video_metadata; - video_metadata.insert("duration", demuxer.get_duration()); - video_metadata.insert("fps", demuxer.get_fps()); - metadata.insert("video", video_metadata); - return metadata; -} - -TORCH_LIBRARY(torchvision, m) { - m.class_("GPUDecoder") - .def(torch::init()) - .def("seek", &GPUDecoder::seek) - .def("get_metadata", &GPUDecoder::get_metadata) - .def("next", &GPUDecoder::decode); -} diff --git a/torchvision/csrc/io/decoder/gpu/gpu_decoder.h b/torchvision/csrc/io/decoder/gpu/gpu_decoder.h deleted file mode 100644 index 22bf680a982..00000000000 --- a/torchvision/csrc/io/decoder/gpu/gpu_decoder.h +++ /dev/null @@ -1,20 +0,0 @@ -#include -#include -#include "decoder.h" -#include "demuxer.h" - -class GPUDecoder : public torch::CustomClassHolder { - public: - GPUDecoder(std::string, torch::Device); - ~GPUDecoder(); - torch::Tensor decode(); - void seek(double, bool); - c10::Dict> get_metadata() const; - - private: - Demuxer demuxer; - CUcontext ctx; - Decoder decoder; - int64_t device; - bool initialised = false; -}; diff --git a/torchvision/csrc/io/decoder/memory_buffer.cpp b/torchvision/csrc/io/decoder/memory_buffer.cpp deleted file mode 100644 index 4e420c3b3cd..00000000000 --- a/torchvision/csrc/io/decoder/memory_buffer.cpp +++ /dev/null @@ -1,71 +0,0 @@ -#include "memory_buffer.h" -#include - -namespace ffmpeg { - -MemoryBuffer::MemoryBuffer(const uint8_t* buffer, size_t size) - : buffer_(buffer), len_(size) {} - -int MemoryBuffer::read(uint8_t* buf, int size) { - if (pos_ < len_) { - auto available = std::min(int(len_ - pos_), size); - memcpy(buf, buffer_ + pos_, available); - pos_ += available; - return available; - } - - return 0; -} - -int64_t MemoryBuffer::seek(int64_t offset, int whence) { - if (whence & AVSEEK_SIZE) { - return len_; - } - - // remove force flag - whence &= ~AVSEEK_FORCE; - - switch (whence) { - case SEEK_SET: - if (offset >= 0 && offset <= len_) { - pos_ = offset; - } - break; - case SEEK_END: - if (len_ + offset >= 0 && len_ + offset <= len_) { - pos_ = len_ + offset; - } - break; - case SEEK_CUR: - if (pos_ + offset > 0 && pos_ + offset <= len_) { - pos_ += offset; - } - break; - default: - LOG(ERROR) << "Unknown whence flag gets provided: " << whence; - } - return pos_; -} - -/* static */ -DecoderInCallback MemoryBuffer::getCallback( - const uint8_t* buffer, - size_t size) { - MemoryBuffer object(buffer, size); - return - [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable - -> int { - if (out) { // see defs.h file - // read mode - return object.read(out, size); - } - // seek mode - if (!timeoutMs) { - // seek capability, yes - supported - return 0; - } - return object.seek(size, whence); - }; -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/memory_buffer.h b/torchvision/csrc/io/decoder/memory_buffer.h deleted file mode 100644 index 909626d3cae..00000000000 --- a/torchvision/csrc/io/decoder/memory_buffer.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * Class uses external memory buffer and implements a seekable interface. 
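- *
- * A minimal usage sketch, mirroring sync_decoder_test.cpp:
- *
- *   SyncDecoder decoder;
- *   std::vector<DecoderMetadata> metadata;
- *   decoder.init(
- *       params,
- *       MemoryBuffer::getCallback(buffer.data(), buffer.size()),
- *       &metadata);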
- */
-class MemoryBuffer {
- public:
-  explicit MemoryBuffer(const uint8_t* buffer, size_t size);
-  int64_t seek(int64_t offset, int whence);
-  int read(uint8_t* buf, int size);
-
-  // static constructor for decoder callback.
-  static DecoderInCallback getCallback(const uint8_t* buffer, size_t size);
-
- private:
-  const uint8_t* buffer_; // set at construction time
-  long pos_{0}; // current position
-  long len_{0}; // bytes in buffer
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/seekable_buffer.cpp b/torchvision/csrc/io/decoder/seekable_buffer.cpp
deleted file mode 100644
index 41e3e689c7b..00000000000
--- a/torchvision/csrc/io/decoder/seekable_buffer.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-#include "seekable_buffer.h"
-#include
-#include
-#include "memory_buffer.h"
-
-namespace ffmpeg {
-
-int SeekableBuffer::init(
-    DecoderInCallback&& in,
-    uint64_t timeoutMs,
-    size_t maxSeekableBytes,
-    ImageType* type) {
-  shutdown();
-  isSeekable_ = in(nullptr, 0, 0, 0) == 0;
-  if (isSeekable_) { // seekable
-    if (type) {
-      if (!readBytes(in, 8, timeoutMs)) {
-        return -1;
-      }
-      setImageType(type);
-      end_ = 0;
-      eof_ = false;
-      std::vector<uint8_t>().swap(buffer_);
-      // reset callback
-      if (in(nullptr, 0, SEEK_SET, timeoutMs)) {
-        return -1;
-      }
-    }
-    inCallback_ = std::forward<DecoderInCallback>(in);
-    return 1;
-  }
-
-  if (!readBytes(in, maxSeekableBytes + (type ? 8 : 0), timeoutMs)) {
-    return -1;
-  }
-
-  if (type) {
-    setImageType(type);
-  }
-
-  if (eof_) {
-    end_ = 0;
-    eof_ = false;
-    // reuse MemoryBuffer functionality
-    inCallback_ = MemoryBuffer::getCallback(buffer_.data(), buffer_.size());
-    isSeekable_ = true;
-    return 1;
-  }
-  inCallback_ = std::forward<DecoderInCallback>(in);
-  return 0;
-}
-
-bool SeekableBuffer::readBytes(
-    DecoderInCallback& in,
-    size_t maxBytes,
-    uint64_t timeoutMs) {
-  // Resize to the minimum 4K page or less
-  buffer_.resize(std::min(maxBytes, size_t(4 * 1024UL)));
-  end_ = 0;
-  eof_ = false;
-
-  auto end =
-      std::chrono::steady_clock::now() + std::chrono::milliseconds(timeoutMs);
-  auto watcher = [end]() -> bool {
-    return std::chrono::steady_clock::now() <= end;
-  };
-
-  bool hasTime = true;
-  while (!eof_ && end_ < maxBytes && (hasTime = watcher())) {
-    // let's read all bytes into the available buffer
-    auto res = in(buffer_.data() + end_, buffer_.size() - end_, 0, timeoutMs);
-    if (res > 0) {
-      end_ += res;
-      if (end_ == buffer_.size()) {
-        buffer_.resize(std::min(size_t(end_ * 4UL), maxBytes));
-      }
-    } else if (res == 0) {
-      eof_ = true;
-    } else {
-      // error
-      return false;
-    }
-  }
-
-  buffer_.resize(end_);
-
-  return hasTime;
-}
-
-void SeekableBuffer::setImageType(ImageType* type) {
-  if (buffer_.size() > 2 && buffer_[0] == 0xFF && buffer_[1] == 0xD8 &&
-      buffer_[2] == 0xFF) {
-    *type = ImageType::JPEG;
-  } else if (
-      buffer_.size() > 3 && buffer_[1] == 'P' && buffer_[2] == 'N' &&
-      buffer_[3] == 'G') {
-    *type = ImageType::PNG;
-  } else if (
-      buffer_.size() > 1 &&
-      ((buffer_[0] == 0x49 && buffer_[1] == 0x49) ||
-       (buffer_[0] == 0x4D && buffer_[1] == 0x4D))) {
-    *type = ImageType::TIFF;
-  } else {
-    *type = ImageType::UNKNOWN;
-  }
-}
-
-int SeekableBuffer::read(uint8_t* buf, int size, uint64_t timeoutMs) {
-  if (isSeekable_) {
-    return inCallback_(buf, size, 0, timeoutMs);
-  }
-  if (pos_ < end_) {
-    // read cached bytes for non-seekable callback
-    auto available = std::min(int(end_ - pos_), size);
-    memcpy(buf, buffer_.data() + pos_, available);
-    pos_ += available;
-    return available;
-  } else if (!eof_) {
-    // normal sequential read (see defs.h file), i.e. @buf != null
-    auto res = inCallback_(buf, size, 0, timeoutMs); // read through
-    eof_ = res == 0;
-    return res;
-  } else {
-    return 0;
-  }
-}
-
-int64_t SeekableBuffer::seek(int64_t offset, int whence, uint64_t timeoutMs) {
-  return inCallback_(nullptr, offset, whence, timeoutMs);
-}
-
-void SeekableBuffer::shutdown() {
-  pos_ = end_ = 0;
-  eof_ = false;
-  std::vector<uint8_t>().swap(buffer_);
-  inCallback_ = nullptr;
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/seekable_buffer.h b/torchvision/csrc/io/decoder/seekable_buffer.h
deleted file mode 100644
index 9d5729f5306..00000000000
--- a/torchvision/csrc/io/decoder/seekable_buffer.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#pragma once
-
-#include "defs.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses an internal buffer to store the initial bytes as a seekable
- * cache from the media provider, and lets ffmpeg seek and read bytes from the
- * cache and beyond, reading bytes directly from the media provider
- */
-enum class ImageType {
-  UNKNOWN = 0,
-  JPEG = 1,
-  PNG = 2,
-  TIFF = 3,
-};
-
-class SeekableBuffer {
- public:
-  // @type is optional; pass a non-null pointer only if image detection is
-  // required
-  // \returns 1 if the buffer is seekable, 0 if not seekable, < 0 on error
-  int init(
-      DecoderInCallback&& in,
-      uint64_t timeoutMs,
-      size_t maxSeekableBytes,
-      ImageType* type);
-  int read(uint8_t* buf, int size, uint64_t timeoutMs);
-  int64_t seek(int64_t offset, int whence, uint64_t timeoutMs);
-  void shutdown();
-
- private:
-  bool readBytes(DecoderInCallback& in, size_t maxBytes, uint64_t timeoutMs);
-  void setImageType(ImageType* type);
-
- private:
-  DecoderInCallback inCallback_;
-  std::vector<uint8_t> buffer_; // resized at init time
-  long pos_{0}; // current position (SEEK_CUR iff pos_ < end_)
-  long end_{0}; // current buffer size
-  bool eof_{0}; // indicates the EOF
-  bool isSeekable_{false}; // is callback seekable
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/stream.cpp b/torchvision/csrc/io/decoder/stream.cpp
deleted file mode 100644
index 7969741e72c..00000000000
--- a/torchvision/csrc/io/decoder/stream.cpp
+++ /dev/null
@@ -1,288 +0,0 @@
-#include "stream.h"
-#include
-#include
-#include "util.h"
-
-namespace ffmpeg {
-const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE};
-
-Stream::Stream(
-    AVFormatContext* inputCtx,
-    MediaFormat format,
-    bool convertPtsToWallTime,
-    int64_t loggingUuid)
-    : inputCtx_(inputCtx),
-      format_(format),
-      convertPtsToWallTime_(convertPtsToWallTime),
-      loggingUuid_(loggingUuid) {}
-
-Stream::~Stream() {
-  if (frame_) {
-    av_free(frame_);
-  }
-  if (codecCtx_) {
-    avcodec_free_context(&codecCtx_);
-  }
-}
-
-// look up the proper codec by querying avcodec_find_decoder
-AVCodec* Stream::findCodec(AVCodecParameters* params) {
-  return (AVCodec*)avcodec_find_decoder(params->codec_id);
-}
-
-// Allocate memory for the AVCodecContext, which will hold the context for
-// the decode/encode process. Then fill this codec context with CODEC
-// parameters defined in stream parameters.
Open the codec, and allocate the global frame -// defined in the header file -int Stream::openCodec(std::vector* metadata, int num_threads) { - AVStream* steam = inputCtx_->streams[format_.stream]; - - AVCodec* codec = findCodec(steam->codecpar); - if (!codec) { - LOG(ERROR) << "LoggingUuid #" << loggingUuid_ - << ", avcodec_find_decoder failed for codec_id: " - << int(steam->codecpar->codec_id); - return AVERROR(EINVAL); - } - - if (!(codecCtx_ = avcodec_alloc_context3(codec))) { - LOG(ERROR) << "LoggingUuid #" << loggingUuid_ - << ", avcodec_alloc_context3 failed"; - return AVERROR(ENOMEM); - } - // multithreading heuristics - // if user defined, - if (num_threads > max_threads) { - num_threads = max_threads; - } - - if (num_threads > 0) { - // if user defined, respect that - // note that default thread_type will be used - codecCtx_->thread_count = num_threads; - } else { - // otherwise set sensible defaults - codecCtx_->thread_count = 8; - codecCtx_->thread_type = FF_THREAD_SLICE; - } - - int ret; - // Copy codec parameters from input stream to output codec context - if ((ret = avcodec_parameters_to_context(codecCtx_, steam->codecpar)) < 0) { - LOG(ERROR) << "LoggingUuid #" << loggingUuid_ - << ", avcodec_parameters_to_context failed"; - return ret; - } - - // after avcodec_open2, value of codecCtx_->time_base is NOT meaningful - if ((ret = avcodec_open2(codecCtx_, codec, nullptr)) < 0) { - LOG(ERROR) << "LoggingUuid #" << loggingUuid_ - << ", avcodec_open2 failed: " << Util::generateErrorDesc(ret); - avcodec_free_context(&codecCtx_); - codecCtx_ = nullptr; - return ret; - } - - frame_ = av_frame_alloc(); - - switch (format_.type) { - case TYPE_VIDEO: - fps_ = av_q2d(av_guess_frame_rate(inputCtx_, steam, nullptr)); - break; - case TYPE_AUDIO: - fps_ = codecCtx_->sample_rate; - break; - default: - fps_ = 30.0; - } - - if ((ret = initFormat())) { - LOG(ERROR) << "initFormat failed, type: " << format_.type; - } - - if (metadata) { - DecoderMetadata header; - header.format = format_; - header.fps = fps_; - header.num = steam->time_base.num; - header.den = steam->time_base.den; - header.duration = - av_rescale_q(steam->duration, steam->time_base, timeBaseQ); - metadata->push_back(header); - } - - return ret; -} - -// send the raw data packet (compressed frame) to the decoder, through the codec -// context and receive the raw data frame (uncompressed frame) from the -// decoder, through the same codec context -int Stream::analyzePacket(const AVPacket* packet, bool* gotFrame) { - int consumed = 0; - int result = avcodec_send_packet(codecCtx_, packet); - if (result == AVERROR(EAGAIN)) { - *gotFrame = false; // no bytes get consumed, fetch frame - } else if (result == AVERROR_EOF) { - *gotFrame = false; // more than one flush packet - if (packet) { - // got packet after flush, this is an error - return result; - } - } else if (result < 0) { - LOG(ERROR) << "avcodec_send_packet failed, err: " - << Util::generateErrorDesc(result); - return result; // error - } else { - consumed = packet ? 
packet->size : 0; // all bytes get consumed
-  }
-
-  result = avcodec_receive_frame(codecCtx_, frame_);
-
-  if (result >= 0) {
-    *gotFrame = true; // frame is available
-  } else if (result == AVERROR(EAGAIN)) {
-    *gotFrame = false; // no frames at this time, needs more packets
-    if (!consumed) {
-      // precaution, if no packets got consumed and no frames are available
-      return result;
-    }
-  } else if (result == AVERROR_EOF) {
-    *gotFrame = false; // the last frame has been flushed
-    // precaution, if no more frames are available assume we consumed all bytes
-    consumed = 0;
-  } else { // error
-    LOG(ERROR) << "avcodec_receive_frame failed, err: "
-               << Util::generateErrorDesc(result);
-    return result;
-  }
-  return consumed;
-}
-
-// General decoding function:
-// given the packet, analyse the metadata, and write the
-// metadata and the buffer to the DecoderOutputMessage.
-int Stream::decodePacket(
-    const AVPacket* packet,
-    DecoderOutputMessage* out,
-    bool headerOnly,
-    bool* hasMsg) {
-  int consumed;
-  bool gotFrame = false;
-  *hasMsg = false;
-  if ((consumed = analyzePacket(packet, &gotFrame)) >= 0 &&
-      (packet == nullptr || gotFrame)) {
-    int result;
-    if ((result = getMessage(out, !gotFrame, headerOnly)) < 0) {
-      return result; // report error
-    }
-    *hasMsg = result > 0;
-  }
-  return consumed;
-}
-
-int Stream::flush(DecoderOutputMessage* out, bool headerOnly) {
-  bool hasMsg = false;
-  int result = decodePacket(nullptr, out, headerOnly, &hasMsg);
-  if (result < 0) {
-    avcodec_flush_buffers(codecCtx_);
-    return result;
-  }
-  if (!hasMsg) {
-    avcodec_flush_buffers(codecCtx_);
-    return 0;
-  }
-  return 1;
-}
-
-// Sets the header and payload via Stream::setHeader and copyFrameBytes,
-// functions that are defined in the typed Stream subclasses (VideoStream,
-// AudioStream, ...)
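-// Returns a positive value when a message was produced, 0 when there is
-// nothing to emit, and < 0 on error. On flush, audio is drained chunk by
-// chunk until the sampler runs dry; other media types emit at most one
-// message per call.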
-int Stream::getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly) {
-  if (flush) {
-    // only a flush of audio frames makes sense
-    if (format_.type == TYPE_AUDIO) {
-      int processed = 0;
-      size_t total = 0;
-      // grab all audio bytes by chunks
-      do {
-        if ((processed = copyFrameBytes(out->payload.get(), flush)) < 0) {
-          return processed;
-        }
-        total += processed;
-      } while (processed);
-
-      if (total) {
-        // set header if message bytes are available
-        setHeader(&out->header, flush);
-        return 1;
-      }
-    }
-    return 0;
-  } else {
-    if (format_.type == TYPE_AUDIO) {
-      int processed = 0;
-      if ((processed = copyFrameBytes(out->payload.get(), flush)) < 0) {
-        return processed;
-      }
-      if (processed) {
-        // set header if message bytes are available
-        setHeader(&out->header, flush);
-        return 1;
-      }
-      return 0;
-    } else {
-      // set header
-      setHeader(&out->header, flush);
-
-      if (headerOnly) {
-        // Only the header is requested
-        return 1;
-      }
-
-      return copyFrameBytes(out->payload.get(), flush);
-    }
-  }
-}
-
-void Stream::setHeader(DecoderHeader* header, bool flush) {
-  header->seqno = numGenerator_++;
-
-  setFramePts(header, flush);
-
-  if (convertPtsToWallTime_) {
-    keeper_.adjust(header->pts);
-  }
-
-  header->format = format_;
-  header->keyFrame = 0;
-  header->fps = std::numeric_limits<double>::quiet_NaN();
-}
-
-void Stream::setFramePts(DecoderHeader* header, bool flush) {
-  if (flush) {
-    header->pts = nextPts_; // already in us
-  } else {
-    header->pts = frame_->best_effort_timestamp;
-    if (header->pts == AV_NOPTS_VALUE) {
-      header->pts = nextPts_;
-    } else {
-      header->pts = av_rescale_q(
-          header->pts,
-          inputCtx_->streams[format_.stream]->time_base,
-          timeBaseQ);
-    }
-
-    switch (format_.type) {
-      case TYPE_AUDIO:
-        nextPts_ = header->pts + frame_->nb_samples * AV_TIME_BASE / fps_;
-        break;
-      case TYPE_VIDEO:
-        nextPts_ = header->pts + AV_TIME_BASE / fps_;
-        break;
-      default:
-        nextPts_ = header->pts;
-    }
-  }
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/stream.h b/torchvision/csrc/io/decoder/stream.h
deleted file mode 100644
index 6250dd9ecd2..00000000000
--- a/torchvision/csrc/io/decoder/stream.h
+++ /dev/null
@@ -1,80 +0,0 @@
-#pragma once
-
-#include
-#include "defs.h"
-#include "time_keeper.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode one media stream (audio or video).
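- * openCodec() accepts a num_threads hint: positive values are clamped to
- * max_threads (12) and used as the codec thread_count, while 0 falls back to
- * the built-in default of 8 slice threads (see the heuristics in stream.cpp).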
- */ - -class Stream { - public: - Stream( - AVFormatContext* inputCtx, - MediaFormat format, - bool convertPtsToWallTime, - int64_t loggingUuid); - virtual ~Stream(); - - // returns 0 - on success or negative error - // num_threads sets up the codec context for multithreading if needed - // default is set to single thread in order to not break BC - int openCodec(std::vector* metadata, int num_threads = 1); - // returns 1 - if packet got consumed, 0 - if it's not, and < 0 on error - int decodePacket( - const AVPacket* packet, - DecoderOutputMessage* out, - bool headerOnly, - bool* hasMsg); - // returns stream index - int getIndex() const { - return format_.stream; - } - // returns 1 - if message got a payload, 0 - if it's not, and < 0 on error - int flush(DecoderOutputMessage* out, bool headerOnly); - // return media format - MediaFormat getMediaFormat() const { - return format_; - } - - protected: - virtual int initFormat() = 0; - // returns number processed bytes from packet, or negative error - virtual int analyzePacket(const AVPacket* packet, bool* gotFrame); - // returns number processed bytes from packet, or negative error - virtual int copyFrameBytes(ByteStorage* out, bool flush) = 0; - // sets output format - virtual void setHeader(DecoderHeader* header, bool flush); - // set frame pts - virtual void setFramePts(DecoderHeader* header, bool flush); - // finds codec - virtual AVCodec* findCodec(AVCodecParameters* params); - - private: - // returns 1 - if message got a payload, 0 - if it's not, and < 0 on error - int getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly); - - protected: - AVFormatContext* const inputCtx_; - MediaFormat format_; - const bool convertPtsToWallTime_; - int64_t loggingUuid_; - - AVCodecContext* codecCtx_{nullptr}; - AVFrame* frame_{nullptr}; - - std::atomic numGenerator_{0}; - TimeKeeper keeper_; - // estimated next frame pts for flushing the last frame - int64_t nextPts_{0}; - double fps_{30.}; - // this is a dumb conservative limit; ideally we'd use - // int max_threads = at::get_num_threads(); but this would cause - // fb sync to fail as it would add dependency to ATen to the decoder API - const int max_threads = 12; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/subtitle_sampler.cpp b/torchvision/csrc/io/decoder/subtitle_sampler.cpp deleted file mode 100644 index d0df24d3e35..00000000000 --- a/torchvision/csrc/io/decoder/subtitle_sampler.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include "subtitle_sampler.h" -#include -#include "util.h" - -namespace ffmpeg { - -SubtitleSampler::~SubtitleSampler() { - cleanUp(); -} - -void SubtitleSampler::shutdown() { - cleanUp(); -} - -bool SubtitleSampler::init(const SamplerParameters& params) { - cleanUp(); - // set formats - params_ = params; - return true; -} - -int SubtitleSampler::sample(AVSubtitle* sub, ByteStorage* out) { - if (!sub || !out) { - return 0; // flush - } - - out->ensure(Util::size(*sub)); - - return Util::serialize(*sub, out); -} - -int SubtitleSampler::sample(const ByteStorage* in, ByteStorage* out) { - if (in && out) { - // Get a writable copy - if (size_t len = in->length()) { - out->ensure(len); - memcpy(out->writableTail(), in->data(), len); - } - return out->length(); - } - return 0; -} - -void SubtitleSampler::cleanUp() {} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/subtitle_sampler.h b/torchvision/csrc/io/decoder/subtitle_sampler.h deleted file mode 100644 index 4aee811ed56..00000000000 --- a/torchvision/csrc/io/decoder/subtitle_sampler.h 
+++ /dev/null
@@ -1,32 +0,0 @@
-#pragma once
-
-#include "defs.h"
-
-namespace ffmpeg {
-
-/**
- * Class transcodes subtitle frames from one format into another
- */
-
-class SubtitleSampler : public MediaSampler {
- public:
-  SubtitleSampler() = default;
-  ~SubtitleSampler() override;
-
-  bool init(const SamplerParameters& params) override;
-  int sample(const ByteStorage* in, ByteStorage* out) override;
-  void shutdown() override;
-
-  // returns the number of processed/scaled bytes
-  int sample(AVSubtitle* sub, ByteStorage* out);
-
-  // helper serialization/deserialization methods
-  static void serialize(const AVSubtitle& sub, ByteStorage* out);
-  static bool deserialize(const ByteStorage& buf, AVSubtitle* sub);
-
- private:
-  // close resources
-  void cleanUp();
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/subtitle_stream.cpp b/torchvision/csrc/io/decoder/subtitle_stream.cpp
deleted file mode 100644
index 3416f702d7e..00000000000
--- a/torchvision/csrc/io/decoder/subtitle_stream.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-#include "subtitle_stream.h"
-#include
-#include "util.h"
-
-namespace ffmpeg {
-const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE};
-
-SubtitleStream::SubtitleStream(
-    AVFormatContext* inputCtx,
-    int index,
-    bool convertPtsToWallTime,
-    const SubtitleFormat& format)
-    : Stream(
-          inputCtx,
-          MediaFormat::makeMediaFormat(format, index),
-          convertPtsToWallTime,
-          0) {
-  memset(&sub_, 0, sizeof(sub_));
-}
-
-void SubtitleStream::releaseSubtitle() {
-  if (sub_.release) {
-    avsubtitle_free(&sub_);
-    memset(&sub_, 0, sizeof(sub_));
-  }
-}
-
-SubtitleStream::~SubtitleStream() {
-  releaseSubtitle();
-  sampler_.shutdown();
-}
-
-int SubtitleStream::initFormat() {
-  if (!codecCtx_->subtitle_header) {
-    LOG(ERROR) << "No subtitle header found";
-  } else {
-    VLOG(1) << "Subtitle header found!";
-  }
-  return 0;
-}
-
-int SubtitleStream::analyzePacket(const AVPacket* packet, bool* gotFrame) {
-  // clean-up
-  releaseSubtitle();
-
-  // FIXME: should this even be created?
-  AVPacket* avPacket;
-  avPacket = av_packet_alloc();
-  if (avPacket == nullptr) {
-    LOG(ERROR)
-        << "decoder was not able to allocate the subtitle-specific packet.";
-    // alternative to ENOMEM
-    return AVERROR_BUFFER_TOO_SMALL;
-  }
-  avPacket->data = nullptr;
-  avPacket->size = 0;
-  // check flush packet
-  auto pkt = packet ? packet : avPacket;
-
-  int gotFramePtr = 0;
-  // is there a better way than casting away const?
-  int result =
-      avcodec_decode_subtitle2(codecCtx_, &sub_, &gotFramePtr, (AVPacket*)pkt);
-
-  if (result < 0) {
-    LOG(ERROR) << "avcodec_decode_subtitle2 failed, err: "
-               << Util::generateErrorDesc(result);
-    // free the packet we've created
-    av_packet_free(&avPacket);
-    return result;
-  } else if (result == 0) {
-    result = pkt->size; // discard the rest of the packet
-  }
-
-  sub_.release = gotFramePtr;
-  *gotFrame = gotFramePtr > 0;
-
-  // set proper pts in us
-  if (gotFramePtr) {
-    sub_.pts = av_rescale_q(
-        pkt->pts, inputCtx_->streams[format_.stream]->time_base, timeBaseQ);
-  }
-
-  av_packet_free(&avPacket);
-  return result;
-}
-
-int SubtitleStream::copyFrameBytes(ByteStorage* out, bool flush) {
-  return sampler_.sample(flush ?
nullptr : &sub_, out); -} - -void SubtitleStream::setFramePts(DecoderHeader* header, bool) { - header->pts = sub_.pts; // already in us -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/subtitle_stream.h b/torchvision/csrc/io/decoder/subtitle_stream.h deleted file mode 100644 index 6c366e11f50..00000000000 --- a/torchvision/csrc/io/decoder/subtitle_stream.h +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once - -#include "stream.h" -#include "subtitle_sampler.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode one subtitle stream. - */ -struct AVSubtitleKeeper : AVSubtitle { - int64_t release{0}; -}; - -class SubtitleStream : public Stream { - public: - SubtitleStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const SubtitleFormat& format); - ~SubtitleStream() override; - - protected: - void setFramePts(DecoderHeader* header, bool flush) override; - - private: - int initFormat() override; - int analyzePacket(const AVPacket* packet, bool* gotFrame) override; - int copyFrameBytes(ByteStorage* out, bool flush) override; - void releaseSubtitle(); - - private: - SubtitleSampler sampler_; - AVSubtitleKeeper sub_; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/sync_decoder.cpp b/torchvision/csrc/io/decoder/sync_decoder.cpp deleted file mode 100644 index 1f03ef8eb95..00000000000 --- a/torchvision/csrc/io/decoder/sync_decoder.cpp +++ /dev/null @@ -1,97 +0,0 @@ -#include "sync_decoder.h" -#include - -namespace ffmpeg { - -SyncDecoder::AVByteStorage::AVByteStorage(size_t n) { - ensure(n); -} - -SyncDecoder::AVByteStorage::~AVByteStorage() { - av_free(buffer_); -} - -void SyncDecoder::AVByteStorage::ensure(size_t n) { - if (tail() < n) { - capacity_ = offset_ + length_ + n; - buffer_ = static_cast(av_realloc(buffer_, capacity_)); - } -} - -uint8_t* SyncDecoder::AVByteStorage::writableTail() { - TORCH_CHECK_LE(offset_ + length_, capacity_); - return buffer_ + offset_ + length_; -} - -void SyncDecoder::AVByteStorage::append(size_t n) { - TORCH_CHECK_LE(n, tail()); - length_ += n; -} - -void SyncDecoder::AVByteStorage::trim(size_t n) { - TORCH_CHECK_LE(n, length_); - offset_ += n; - length_ -= n; -} - -const uint8_t* SyncDecoder::AVByteStorage::data() const { - return buffer_ + offset_; -} - -size_t SyncDecoder::AVByteStorage::length() const { - return length_; -} - -size_t SyncDecoder::AVByteStorage::tail() const { - TORCH_CHECK_LE(offset_ + length_, capacity_); - return capacity_ - offset_ - length_; -} - -void SyncDecoder::AVByteStorage::clear() { - offset_ = 0; - length_ = 0; -} - -std::unique_ptr SyncDecoder::createByteStorage(size_t n) { - return std::make_unique(n); -} - -void SyncDecoder::onInit() { - eof_ = false; - queue_.clear(); -} - -int SyncDecoder::decode(DecoderOutputMessage* out, uint64_t timeoutMs) { - if (eof_ && queue_.empty()) { - return ENODATA; - } - - if (queue_.empty()) { - int result = getFrame(timeoutMs); - // assign EOF - eof_ = result == ENODATA; - // check unrecoverable error, any error but ENODATA - if (result && result != ENODATA) { - return result; - } - - // still empty - if (queue_.empty()) { - if (eof_) { - return ENODATA; - } else { - LOG(INFO) << "Queue is empty"; - return ETIMEDOUT; - } - } - } - - *out = std::move(queue_.front()); - queue_.pop_front(); - return 0; -} - -void SyncDecoder::push(DecoderOutputMessage&& buffer) { - queue_.push_back(std::move(buffer)); -} -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/sync_decoder.h 
b/torchvision/csrc/io/decoder/sync_decoder.h deleted file mode 100644 index b7cf7b625ac..00000000000 --- a/torchvision/csrc/io/decoder/sync_decoder.h +++ /dev/null @@ -1,48 +0,0 @@ -#pragma once - -#include -#include "decoder.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode media streams. - * Media bytes can be explicitly provided through read-callback - * or fetched internally by FFMPEG library - */ -class SyncDecoder : public Decoder { - public: - // Allocation of memory must be done with a proper alignment. - class AVByteStorage : public ByteStorage { - public: - explicit AVByteStorage(size_t n); - ~AVByteStorage() override; - void ensure(size_t n) override; - uint8_t* writableTail() override; - void append(size_t n) override; - void trim(size_t n) override; - const uint8_t* data() const override; - size_t length() const override; - size_t tail() const override; - void clear() override; - - private: - size_t offset_{0}; - size_t length_{0}; - size_t capacity_{0}; - uint8_t* buffer_{nullptr}; - }; - - public: - int decode(DecoderOutputMessage* out, uint64_t timeoutMs) override; - - private: - void push(DecoderOutputMessage&& buffer) override; - void onInit() override; - std::unique_ptr createByteStorage(size_t n) override; - - private: - std::list queue_; - bool eof_{false}; -}; -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/sync_decoder_test.cpp b/torchvision/csrc/io/decoder/sync_decoder_test.cpp deleted file mode 100644 index 085966ce687..00000000000 --- a/torchvision/csrc/io/decoder/sync_decoder_test.cpp +++ /dev/null @@ -1,416 +0,0 @@ -#include -#include -#include -#include "memory_buffer.h" -#include "sync_decoder.h" -#include "util.h" - -using namespace ffmpeg; - -namespace { -struct VideoFileStats { - std::string name; - size_t durationPts{0}; - int num{0}; - int den{0}; - int fps{0}; -}; - -void gotAllTestFiles( - const std::string& folder, - std::vector* stats) { - DIR* d = opendir(folder.c_str()); - CHECK(d); - struct dirent* dir; - while ((dir = readdir(d))) { - if (dir->d_type != DT_DIR && 0 != strcmp(dir->d_name, "README")) { - VideoFileStats item; - item.name = folder + '/' + dir->d_name; - LOG(INFO) << "Found video file: " << item.name; - stats->push_back(std::move(item)); - } - } - closedir(d); -} - -void gotFilesStats(std::vector& stats) { - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.formats = {MediaFormat(0)}; - params.headerOnly = true; - params.preventStaleness = false; - size_t avgProvUs = 0; - const size_t rounds = 100; - for (auto& item : stats) { - LOG(INFO) << "Decoding video file in memory: " << item.name; - FILE* f = fopen(item.name.c_str(), "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - - for (size_t i = 0; i < rounds; ++i) { - SyncDecoder decoder; - std::vector metadata; - const auto now = std::chrono::steady_clock::now(); - CHECK(decoder.init( - params, - MemoryBuffer::getCallback(buffer.data(), buffer.size()), - &metadata)); - const auto then = std::chrono::steady_clock::now(); - decoder.shutdown(); - avgProvUs += - std::chrono::duration_cast(then - now) - .count(); - TORCH_CHECK_EQ(metadata.size(), 1); - item.num = metadata[0].num; - item.den = metadata[0].den; - item.fps = metadata[0].fps; - item.durationPts = - av_rescale_q(metadata[0].duration, AV_TIME_BASE_Q, {1, 
item.fps}); - } - } - LOG(INFO) << "Probing (us) " << avgProvUs / stats.size() / rounds; -} - -size_t measurePerformanceUs( - const std::vector<VideoFileStats>& stats, - size_t rounds, - size_t num, - size_t stride) { - size_t avgClipDecodingUs = 0; - std::srand(time(nullptr)); - for (const auto& item : stats) { - FILE* f = fopen(item.name.c_str(), "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector<uint8_t> buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - - for (size_t i = 0; i < rounds; ++i) { - // randomly select a clip - size_t rOffset = std::rand(); - size_t fOffset = rOffset % item.durationPts; - size_t clipFrames = num + (num - 1) * stride; - if (fOffset + clipFrames > item.durationPts) { - fOffset = item.durationPts - clipFrames; - } - - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.preventStaleness = false; - - for (size_t n = 0; n < num; ++n) { - std::list<DecoderOutputMessage> msgs; - - params.startOffset = - av_rescale_q(fOffset, {1, item.fps}, AV_TIME_BASE_Q); - params.endOffset = params.startOffset + 100; - - auto now = std::chrono::steady_clock::now(); - SyncDecoder decoder; - CHECK(decoder.init( - params, - MemoryBuffer::getCallback(buffer.data(), buffer.size()), - nullptr)); - DecoderOutputMessage out; - while (0 == decoder.decode(&out, params.timeoutMs)) { - msgs.push_back(std::move(out)); - } - - decoder.shutdown(); - - const auto then = std::chrono::steady_clock::now(); - - fOffset += 1 + stride; - - avgClipDecodingUs += - std::chrono::duration_cast<std::chrono::microseconds>(then - now) - .count(); - } - } - } - - return avgClipDecodingUs / rounds / num / stats.size(); -} - -void runDecoder(SyncDecoder& decoder) { - DecoderOutputMessage out; - size_t audioFrames = 0, videoFrames = 0, totalBytes = 0; - while (0 == decoder.decode(&out, 10000)) { - if (out.header.format.type == TYPE_AUDIO) { - ++audioFrames; - } else if (out.header.format.type == TYPE_VIDEO) { - ++videoFrames; - } else if (out.header.format.type == TYPE_SUBTITLE && out.payload) { - // deserialize - LOG(INFO) << "Deserializing subtitle"; - AVSubtitle sub; - memset(&sub, 0, sizeof(sub)); - EXPECT_TRUE(Util::deserialize(*out.payload, &sub)); - LOG(INFO) << "Found subtitles" << ", num rects: " << sub.num_rects; - for (int i = 0; i < sub.num_rects; ++i) { - std::string text = "picture"; - if (sub.rects[i]->type == SUBTITLE_TEXT) { - text = sub.rects[i]->text; - } else if (sub.rects[i]->type == SUBTITLE_ASS) { - text = sub.rects[i]->ass; - } - - LOG(INFO) << "Rect num: " << i << ", type:" << sub.rects[i]->type - << ", text: " << text; - } - - avsubtitle_free(&sub); - } - if (out.payload) { - totalBytes += out.payload->length(); - } - } - LOG(INFO) << "Decoded audio frames: " << audioFrames - << ", video frames: " << videoFrames - << ", total bytes: " << totalBytes; -} -} // namespace - -TEST(SyncDecoder, TestSyncDecoderPerformance) { - // Measure the average time of decoding per clip - // 1. list the videos in the testing directory - // 2. for each video get the number of frames with timestamps - // 3. randomly select a frame offset - // 4. adjust the offset for the number of frames and strides, - // if it's beyond the upper boundary - // 5. repeat multiple times, measuring and accumulating decoding time - // per clip (see the timing sketch below).
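The five steps above reduce to a standard wall-clock averaging loop. A minimal, self-contained sketch of that timing pattern, where decodeClip is a hypothetical stand-in for the SyncDecoder init/decode/shutdown sequence used in measurePerformanceUs above:

#include <chrono>
#include <cstdint>
#include <functional>

// Average the wall-clock cost of one clip decode over `rounds` attempts,
// mirroring the steady_clock/duration_cast pattern used in this test.
int64_t averageDecodeUs(const std::function<void()>& decodeClip, size_t rounds) {
  int64_t totalUs = 0;
  for (size_t i = 0; i < rounds; ++i) {
    const auto start = std::chrono::steady_clock::now();
    decodeClip();  // hypothetical: init the decoder, drain all frames, shut down
    const auto stop = std::chrono::steady_clock::now();
    totalUs +=
        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
            .count();
  }
  return rounds > 0 ? totalUs / static_cast<int64_t>(rounds) : 0;
}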
- /* - 1) 4 x 2 - 2) 8 x 8 - 3) 16 x 8 - 4) 32 x 4 - */ - const std::string kFolder = "pytorch/vision/test/assets/videos"; - std::vector stats; - gotAllTestFiles(kFolder, &stats); - gotFilesStats(stats); - - const size_t kRounds = 10; - - auto new4x2 = measurePerformanceUs(stats, kRounds, 4, 2); - auto new8x8 = measurePerformanceUs(stats, kRounds, 8, 8); - auto new16x8 = measurePerformanceUs(stats, kRounds, 16, 8); - auto new32x4 = measurePerformanceUs(stats, kRounds, 32, 4); - LOG(INFO) << "Clip decoding (us)" << ", new(4x2): " << new4x2 - << ", new(8x8): " << new8x8 << ", new(16x8): " << new16x8 - << ", new(32x4): " << new32x4; -} - -TEST(SyncDecoder, Test) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestSubtitles) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - params.uri = "vue/synergy/data/robotsub.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestHeadersOnly) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.headerOnly = true; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - - params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); - - params.uri = "pytorch/vision/test/assets/videos/SOX5yA1l24A.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); - - params.uri = "pytorch/vision/test/assets/videos/WUzgd7C1pWA.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestHeadersOnlyDownSampling) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.headerOnly = true; - MediaFormat format; - format.type = TYPE_AUDIO; - format.format.audio.samples = 8000; - params.formats.insert(format); - - format.type = TYPE_VIDEO; - format.format.video.width = 224; - format.format.video.height = 224; - params.formats.insert(format); - - params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); - - params.uri = "pytorch/vision/test/assets/videos/SOX5yA1l24A.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); - - params.uri = "pytorch/vision/test/assets/videos/WUzgd7C1pWA.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestInitOnlyNoShutdown) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.headerOnly = false; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4"; - std::vector metadata; - CHECK(decoder.init(params, nullptr, 
&metadata)); -} - -TEST(SyncDecoder, TestMemoryBuffer) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.endOffset = 9000000; - params.seekAccuracy = 10000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - - FILE* f = fopen( - "pytorch/vision/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi", - "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - CHECK(decoder.init( - params, - MemoryBuffer::getCallback(buffer.data(), buffer.size()), - nullptr)); - LOG(INFO) << "Decoding from memory bytes: " << buffer.size(); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestMemoryBufferNoSeekableWithFullRead) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.endOffset = 9000000; - params.seekAccuracy = 10000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - - FILE* f = fopen("pytorch/vision/test/assets/videos/R6llTwEh07w.mp4", "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - - params.maxSeekableBytes = buffer.size() + 1; - MemoryBuffer object(buffer.data(), buffer.size()); - CHECK(decoder.init( - params, - [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable - -> int { - if (out) { // see defs.h file - // read mode - return object.read(out, size); - } - // seek mode - if (!timeoutMs) { - // seek capability, yes - no - return -1; - } - return object.seek(size, whence); - }, - nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestMemoryBufferNoSeekableWithPartialRead) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.endOffset = 9000000; - params.seekAccuracy = 10000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - - FILE* f = fopen("pytorch/vision/test/assets/videos/R6llTwEh07w.mp4", "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - - params.maxSeekableBytes = buffer.size() / 2; - MemoryBuffer object(buffer.data(), buffer.size()); - CHECK(!decoder.init( - params, - [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable - -> int { - if (out) { // see defs.h file - // read mode - return object.read(out, size); - } - // seek mode - if (!timeoutMs) { - // seek capability, yes - no - return -1; - } - return object.seek(size, whence); - }, - nullptr)); -} diff --git a/torchvision/csrc/io/decoder/time_keeper.cpp b/torchvision/csrc/io/decoder/time_keeper.cpp deleted file mode 100644 index 845c76cddc8..00000000000 --- a/torchvision/csrc/io/decoder/time_keeper.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include "time_keeper.h" -#include "defs.h" - -namespace ffmpeg { - -namespace { -const long kMaxTimeBaseDiference = 10; -} - -long TimeKeeper::adjust(long& decoderTimestamp) { - const long now = std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()) - .count(); - - if (startTime_ == 0) { - startTime_ = now; - } - if (streamTimestamp_ == 0) { - streamTimestamp_ 
= decoderTimestamp; - } - - const auto runOut = startTime_ + decoderTimestamp - streamTimestamp_; - - if (std::labs((now - runOut) / AV_TIME_BASE) > kMaxTimeBaseDiference) { - streamTimestamp_ = startTime_ - now + decoderTimestamp; - } - - const auto sleepAdvised = runOut - now; - - decoderTimestamp += startTime_ - streamTimestamp_; - - return sleepAdvised > 0 ? sleepAdvised : 0; -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/time_keeper.h b/torchvision/csrc/io/decoder/time_keeper.h deleted file mode 100644 index e4d4718c705..00000000000 --- a/torchvision/csrc/io/decoder/time_keeper.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include -#include - -namespace ffmpeg { - -/** - * Class keeps track of the decoded timestamps (us) for media streams. - */ - -class TimeKeeper { - public: - TimeKeeper() = default; - - // adjusts the provided @timestamp to the corrected value - // returns the advised sleep time before next frame processing in (us) - long adjust(long& decoderTimestamp); - - private: - long startTime_{0}; - long streamTimestamp_{0}; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/util.cpp b/torchvision/csrc/io/decoder/util.cpp deleted file mode 100644 index 7198d2174ed..00000000000 --- a/torchvision/csrc/io/decoder/util.cpp +++ /dev/null @@ -1,401 +0,0 @@ -#include "util.h" -#include - -namespace ffmpeg { - -namespace Serializer { - -// fixed size types -template <typename T> -inline size_t getSize(const T& x) { - return sizeof(x); -} - -template <typename T> -inline bool serializeItem( - uint8_t* dest, - size_t len, - size_t& pos, - const T& src) { - VLOG(6) << "Generic serializeItem"; - const auto required = sizeof(src); - if (len < pos + required) { - return false; - } - memcpy(dest + pos, &src, required); - pos += required; - return true; -} - -template <typename T> -inline bool deserializeItem( - const uint8_t* src, - size_t len, - size_t& pos, - T& dest) { - const auto required = sizeof(dest); - if (len < pos + required) { - return false; - } - memcpy(&dest, src + pos, required); - pos += required; - return true; -} - -// AVSubtitleRect specialization -inline size_t getSize(const AVSubtitleRect& x) { - auto rectBytes = [](const AVSubtitleRect& y) -> size_t { - size_t s = 0; - switch (y.type) { - case SUBTITLE_BITMAP: - for (int i = 0; i < y.nb_colors; ++i) { - s += sizeof(y.linesize[i]); - s += y.linesize[i]; - } - break; - case SUBTITLE_TEXT: - s += sizeof(size_t); - s += strlen(y.text); - break; - case SUBTITLE_ASS: - s += sizeof(size_t); - s += strlen(y.ass); - break; - default: - break; - } - return s; - }; - return getSize(x.x) + getSize(x.y) + getSize(x.w) + getSize(x.h) + - getSize(x.nb_colors) + getSize(x.type) + getSize(x.flags) + rectBytes(x); -} - -// AVSubtitle specialization -inline size_t getSize(const AVSubtitle& x) { - auto rectBytes = [](const AVSubtitle& y) -> size_t { - size_t s = getSize(y.num_rects); - for (unsigned i = 0; i < y.num_rects; ++i) { - s += getSize(*y.rects[i]); - } - return s; - }; - return getSize(x.format) + getSize(x.start_display_time) + - getSize(x.end_display_time) + getSize(x.pts) + rectBytes(x); -} - -inline bool serializeItem( - uint8_t* dest, - size_t len, - size_t& pos, - const AVSubtitleRect& src) { - auto rectSerialize = - [](uint8_t* d, size_t l, size_t& p, const AVSubtitleRect& x) -> size_t { - switch (x.type) { - case SUBTITLE_BITMAP: - for (int i = 0; i < x.nb_colors; ++i) { - if (!serializeItem(d, l, p, x.linesize[i])) { - return false; - } - if (p + x.linesize[i] > l) { - return false; - } - memcpy(d + p, x.data[i],
x.linesize[i]); - p += x.linesize[i]; - } - return true; - case SUBTITLE_TEXT: { - const size_t s = strlen(x.text); - if (!serializeItem(d, l, p, s)) { - return false; - } - if (p + s > l) { - return false; - } - memcpy(d + p, x.text, s); - p += s; - return true; - } - case SUBTITLE_ASS: { - const size_t s = strlen(x.ass); - if (!serializeItem(d, l, p, s)) { - return false; - } - if (p + s > l) { - return false; - } - memcpy(d + p, x.ass, s); - p += s; - return true; - } - default: - return true; - } - }; - return serializeItem(dest, len, pos, src.x) && - serializeItem(dest, len, pos, src.y) && - serializeItem(dest, len, pos, src.w) && - serializeItem(dest, len, pos, src.h) && - serializeItem(dest, len, pos, src.nb_colors) && - serializeItem(dest, len, pos, src.type) && - serializeItem(dest, len, pos, src.flags) && - rectSerialize(dest, len, pos, src); -} - -inline bool serializeItem( - uint8_t* dest, - size_t len, - size_t& pos, - const AVSubtitle& src) { - auto rectSerialize = - [](uint8_t* d, size_t l, size_t& p, const AVSubtitle& x) -> bool { - bool res = serializeItem(d, l, p, x.num_rects); - for (unsigned i = 0; res && i < x.num_rects; ++i) { - res = serializeItem(d, l, p, *(x.rects[i])); - } - return res; - }; - VLOG(6) << "AVSubtitle serializeItem"; - return serializeItem(dest, len, pos, src.format) && - serializeItem(dest, len, pos, src.start_display_time) && - serializeItem(dest, len, pos, src.end_display_time) && - serializeItem(dest, len, pos, src.pts) && - rectSerialize(dest, len, pos, src); -} - -inline bool deserializeItem( - const uint8_t* src, - size_t len, - size_t& pos, - AVSubtitleRect& dest) { - auto rectDeserialize = - [](const uint8_t* y, size_t l, size_t& p, AVSubtitleRect& x) -> bool { - switch (x.type) { - case SUBTITLE_BITMAP: - for (int i = 0; i < x.nb_colors; ++i) { - if (!deserializeItem(y, l, p, x.linesize[i])) { - return false; - } - if (p + x.linesize[i] > l) { - return false; - } - x.data[i] = (uint8_t*)av_malloc(x.linesize[i]); - memcpy(x.data[i], y + p, x.linesize[i]); - p += x.linesize[i]; - } - return true; - case SUBTITLE_TEXT: { - size_t s = 0; - if (!deserializeItem(y, l, p, s)) { - return false; - } - if (p + s > l) { - return false; - } - x.text = (char*)av_malloc(s + 1); - memcpy(x.text, y + p, s); - x.text[s] = 0; - p += s; - return true; - } - case SUBTITLE_ASS: { - size_t s = 0; - if (!deserializeItem(y, l, p, s)) { - return false; - } - if (p + s > l) { - return false; - } - x.ass = (char*)av_malloc(s + 1); - memcpy(x.ass, y + p, s); - x.ass[s] = 0; - p += s; - return true; - } - default: - return true; - } - }; - - return deserializeItem(src, len, pos, dest.x) && - deserializeItem(src, len, pos, dest.y) && - deserializeItem(src, len, pos, dest.w) && - deserializeItem(src, len, pos, dest.h) && - deserializeItem(src, len, pos, dest.nb_colors) && - deserializeItem(src, len, pos, dest.type) && - deserializeItem(src, len, pos, dest.flags) && - rectDeserialize(src, len, pos, dest); -} - -inline bool deserializeItem( - const uint8_t* src, - size_t len, - size_t& pos, - AVSubtitle& dest) { - auto rectDeserialize = - [](const uint8_t* y, size_t l, size_t& p, AVSubtitle& x) -> bool { - bool res = deserializeItem(y, l, p, x.num_rects); - if (res && x.num_rects) { - x.rects = - (AVSubtitleRect**)av_malloc(x.num_rects * sizeof(AVSubtitleRect*)); - } - for (unsigned i = 0; res && i < x.num_rects; ++i) { - x.rects[i] = (AVSubtitleRect*)av_malloc(sizeof(AVSubtitleRect)); - memset(x.rects[i], 0, sizeof(AVSubtitleRect)); - res = deserializeItem(y, l, p, 
*x.rects[i]); - } - return res; - }; - return deserializeItem(src, len, pos, dest.format) && - deserializeItem(src, len, pos, dest.start_display_time) && - deserializeItem(src, len, pos, dest.end_display_time) && - deserializeItem(src, len, pos, dest.pts) && - rectDeserialize(src, len, pos, dest); -} -} // namespace Serializer - -namespace Util { -std::string generateErrorDesc(int errorCode) { - std::array buffer; - if (av_strerror(errorCode, buffer.data(), buffer.size()) < 0) { - return std::string("Unknown error code: ") + std::to_string(errorCode); - } - buffer.back() = 0; - return std::string(buffer.data()); -} - -size_t serialize(const AVSubtitle& sub, ByteStorage* out) { - const auto len = size(sub); - size_t pos = 0; - if (!Serializer::serializeItem(out->writableTail(), len, pos, sub)) { - return 0; - } - out->append(len); - return len; -} - -bool deserialize(const ByteStorage& buf, AVSubtitle* sub) { - size_t pos = 0; - return Serializer::deserializeItem(buf.data(), buf.length(), pos, *sub); -} - -size_t size(const AVSubtitle& sub) { - return Serializer::getSize(sub); -} - -bool validateVideoFormat(const VideoFormat& f) { - // clang-format off - /* - Valid parameters values for decoder - ____________________________________________________________________________________ - | W | H | minDimension | maxDimension | cropImage | algorithm | - |__________________________________________________________________________________| - | 0 | 0 | 0 | 0 | N/A | original | - |__________________________________________________________________________________| - | >0 | 0 | N/A | N/A | N/A | scale keeping W | - |__________________________________________________________________________________| - | 0 | >0 | N/A | N/A | N/A | scale keeping H | - |__________________________________________________________________________________| - | >0 | >0 | N/A | N/A | 0 | stretch/scale | - |__________________________________________________________________________________| - | >0 | >0 | N/A | N/A | >0 | scale/crop | - |__________________________________________________________________________________| - | 0 | 0 | >0 | 0 | N/A |scale to min dimension | - |__________________________________________________________________________________| - | 0 | 0 | 0 | >0 | N/A |scale to max dimension | - |__________________________________________________________________________________| - | 0 | 0 | >0 | >0 | N/A |stretch to min/max dimension| - |_____|_____|______________|______________|___________|____________________________| - - */ - // clang-format on - return (f.width == 0 && // #1, #6, #7 and #8 - f.height == 0 && f.cropImage == 0) || - (f.width != 0 && // #4 and #5 - f.height != 0 && f.minDimension == 0 && f.maxDimension == 0) || - (((f.width != 0 && // #2 - f.height == 0) || - (f.width == 0 && // #3 - f.height != 0)) && - f.minDimension == 0 && f.maxDimension == 0 && f.cropImage == 0); -} - -void setFormatDimensions( - size_t& destW, - size_t& destH, - size_t userW, - size_t userH, - size_t srcW, - size_t srcH, - size_t minDimension, - size_t maxDimension, - size_t cropImage) { - // rounding rules - // int -> double -> round up - // if fraction is >= 0.5 or round down if fraction is < 0.5 - // int result = double(value) + 0.5 - // here we rounding double to int according to the above rule - - // #1, #6, #7 and #8 - if (userW == 0 && userH == 0) { - if (minDimension > 0 && maxDimension == 0) { // #6 - if (srcW > srcH) { - // landscape - destH = minDimension; - destW = round(double(srcW * minDimension) / srcH); - } else { - 
// portrait - destW = minDimension; - destH = round(double(srcH * minDimension) / srcW); - } - } else if (minDimension == 0 && maxDimension > 0) { // #7 - if (srcW > srcH) { - // landscape - destW = maxDimension; - destH = round(double(srcH * maxDimension) / srcW); - } else { - // portrait - destH = maxDimension; - destW = round(double(srcW * maxDimension) / srcH); - } - } else if (minDimension > 0 && maxDimension > 0) { // #8 - if (srcW > srcH) { - // landscape - destW = maxDimension; - destH = minDimension; - } else { - // portrait - destW = minDimension; - destH = maxDimension; - } - } else { // #1 - destW = srcW; - destH = srcH; - } - } else if (userW != 0 && userH == 0) { // #2 - destW = userW; - destH = round(double(srcH * userW) / srcW); - } else if (userW == 0 && userH != 0) { // #3 - destW = round(double(srcW * userH) / srcH); - destH = userH; - } else { // userW != 0 && userH != 0 - if (cropImage == 0) { // #4 - destW = userW; - destH = userH; - } else { // #5 - double userSlope = double(userH) / userW; - double srcSlope = double(srcH) / srcW; - if (srcSlope < userSlope) { - destW = round(double(srcW * userH) / srcH); - destH = userH; - } else { - destW = userW; - destH = round(double(srcH * userW) / srcW); - } - } - } - // prevent zeros - destW = std::max(destW, size_t(1UL)); - destH = std::max(destH, size_t(1UL)); -} -} // namespace Util -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/util.h b/torchvision/csrc/io/decoder/util.h deleted file mode 100644 index 01b550e5bbc..00000000000 --- a/torchvision/csrc/io/decoder/util.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * FFMPEG library utility functions. - */ - -namespace Util { -std::string generateErrorDesc(int errorCode); -size_t serialize(const AVSubtitle& sub, ByteStorage* out); -bool deserialize(const ByteStorage& buf, AVSubtitle* sub); -size_t size(const AVSubtitle& sub); -void setFormatDimensions( - size_t& destW, - size_t& destH, - size_t userW, - size_t userH, - size_t srcW, - size_t srcH, - size_t minDimension, - size_t maxDimension, - size_t cropImage); -bool validateVideoFormat(const VideoFormat& format); -} // namespace Util -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/util_test.cpp b/torchvision/csrc/io/decoder/util_test.cpp deleted file mode 100644 index 0a093d9561b..00000000000 --- a/torchvision/csrc/io/decoder/util_test.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include -#include -#include "util.h" - -TEST(Util, TestSetFormatDimensions) { - // clang-format off - const size_t test_cases[][9] = { - // (userW, userH, srcW, srcH, minDimension, maxDimension, cropImage, destW, destH) - {0, 0, 172, 128, 0, 0, 0, 172, 128}, // #1 - {86, 0, 172, 128, 0, 0, 0, 86, 64}, // #2 - {64, 0, 128, 172, 0, 0, 0, 64, 86}, // #2 - {0, 32, 172, 128, 0, 0, 0, 43, 32}, // #3 - {32, 0, 128, 172, 0, 0, 0, 32, 43}, // #3 - {60, 50, 172, 128, 0, 0, 0, 60, 50}, // #4 - {50, 60, 128, 172, 0, 0, 0, 50, 60}, // #4 - {86, 40, 172, 128, 0, 0, 1, 86, 64}, // #5 - {86, 92, 172, 128, 0, 0, 1, 124, 92}, // #5 - {0, 0, 172, 128, 256, 0, 0, 344, 256}, // #6 - {0, 0, 128, 172, 256, 0, 0, 256, 344}, // #6 - {0, 0, 128, 172, 0, 344, 0, 256, 344}, // #7 - {0, 0, 172, 128, 0, 344, 0, 344, 256}, // #7 - {0, 0, 172, 128, 100, 344, 0, 344, 100},// #8 - {0, 0, 128, 172, 100, 344, 0, 100, 344} // #8 - }; - // clang-format on - - for (const auto& tc : test_cases) { - size_t destW = 0; - size_t destH = 0; - ffmpeg::Util::setFormatDimensions(destW, destH, tc[0], tc[1], tc[2], tc[3],
tc[4], tc[5], tc[6]); - CHECK(destW == tc[7]); - CHECK(destH == tc[8]); - } -} diff --git a/torchvision/csrc/io/decoder/video_sampler.cpp b/torchvision/csrc/io/decoder/video_sampler.cpp deleted file mode 100644 index 8b712609e34..00000000000 --- a/torchvision/csrc/io/decoder/video_sampler.cpp +++ /dev/null @@ -1,337 +0,0 @@ -#include "video_sampler.h" -#include -#include "util.h" - -// www.ffmpeg.org/doxygen/0.5/swscale-example_8c-source.html - -namespace ffmpeg { - -namespace { - -// Set up the data pointers and linesizes based on the specified image -// parameters and the provided array. This sets up "planes" to point to a -// "buffer" -// NOTE: this is most likely the culprit behind #3534 - // - // Args: - // fmt: desired output video format - // buffer: source constant image buffer (in different format) that will contain - // the final image after SWScale - // planes: destination data pointer to be filled - // lineSize: target destination linesize (always {0}) -int preparePlanes( - const VideoFormat& fmt, - const uint8_t* buffer, - uint8_t** planes, - int* lineSize) { - int result; - - // NOTE: 1 at the end of av_image_fill_arrays is the value used for alignment - if ((result = av_image_fill_arrays( - planes, - lineSize, - buffer, - (AVPixelFormat)fmt.format, - fmt.width, - fmt.height, - 1)) < 0) { - LOG(ERROR) << "av_image_fill_arrays failed, err: " - << Util::generateErrorDesc(result); - } - return result; -} - -// Scale (and crop) the image slice in srcSlice and put the resulting scaled -// slice into the `planes` buffer, which is mapped to `out` via preparePlanes as -// `sws_scale` cannot access buffers directly. -// -// Args: -// context: SWSContext allocated on line 119 (if crop, optional) or 163 (if -// scale) - // srcSlice: frame data in YUV420P - // srcStride: the array containing the - // strides for each plane of the source - // image (from AVFrame->linesize[0]) -// out: destination buffer -// planes: indirect destination buffer (mapped to "out" via preparePlanes) -// lines: destination linesize; constant {0} -int transformImage( - SwsContext* context, - const uint8_t* const srcSlice[], - int srcStride[], - VideoFormat inFormat, - VideoFormat outFormat, - uint8_t* out, - uint8_t* planes[], - int lines[]) { - int result; - if ((result = preparePlanes(outFormat, out, planes, lines)) < 0) { - return result; - } - if (context) { - // NOTE: the srcSliceY argument is always 0: the slice starts at the top of the image - if ((result = sws_scale( - context, srcSlice, srcStride, 0, inFormat.height, planes, lines)) < - 0) { - LOG(ERROR) << "sws_scale failed, err: " - << Util::generateErrorDesc(result); - return result; - } - } else if ( - inFormat.width == outFormat.width && - inFormat.height == outFormat.height && - inFormat.format == outFormat.format) { - // Copy planes without using sws_scale if sws_getContext failed.
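- // (av_image_copy is a plain per-plane copy; it is only valid on this path - // because the branch condition has just checked that input and output - // share the same width, height, and pixel format.)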
- av_image_copy( - planes, - lines, - (const uint8_t**)srcSlice, - srcStride, - (AVPixelFormat)inFormat.format, - inFormat.width, - inFormat.height); - } else { - LOG(ERROR) << "Invalid scale context format " << inFormat.format; - return AVERROR(EINVAL); - } - return 0; -} -} // namespace - -VideoSampler::VideoSampler(int swsFlags, int64_t loggingUuid) - : swsFlags_(swsFlags), loggingUuid_(loggingUuid) {} - -VideoSampler::~VideoSampler() { - cleanUp(); -} - -void VideoSampler::shutdown() { - cleanUp(); -} - -bool VideoSampler::init(const SamplerParameters& params) { - cleanUp(); - - if (params.out.video.cropImage != 0) { - if (!Util::validateVideoFormat(params.out.video)) { - LOG(ERROR) << "Invalid video format" - << ", width: " << params.out.video.width - << ", height: " << params.out.video.height - << ", format: " << params.out.video.format - << ", minDimension: " << params.out.video.minDimension - << ", crop: " << params.out.video.cropImage; - - return false; - } - - scaleFormat_.format = params.out.video.format; - Util::setFormatDimensions( - scaleFormat_.width, - scaleFormat_.height, - params.out.video.width, - params.out.video.height, - params.in.video.width, - params.in.video.height, - 0, - 0, - 1); - - if (!(scaleFormat_ == params_.out.video)) { // crop required - cropContext_ = sws_getContext( - params.out.video.width, - params.out.video.height, - (AVPixelFormat)params.out.video.format, - params.out.video.width, - params.out.video.height, - (AVPixelFormat)params.out.video.format, - swsFlags_, - nullptr, - nullptr, - nullptr); - - if (!cropContext_) { - LOG(ERROR) << "sws_getContext failed for crop context"; - return false; - } - - const auto scaleImageSize = av_image_get_buffer_size( - (AVPixelFormat)scaleFormat_.format, - scaleFormat_.width, - scaleFormat_.height, - 1); - scaleBuffer_.resize(scaleImageSize); - } - } else { - scaleFormat_ = params.out.video; - } - - VLOG(1) << "Input format #" << loggingUuid_ << ", width " - << params.in.video.width << ", height " << params.in.video.height - << ", format " << params.in.video.format << ", minDimension " - << params.in.video.minDimension << ", cropImage " - << params.in.video.cropImage; - VLOG(1) << "Scale format #" << loggingUuid_ << ", width " - << scaleFormat_.width << ", height " << scaleFormat_.height - << ", format " << scaleFormat_.format << ", minDimension " - << scaleFormat_.minDimension << ", cropImage " - << scaleFormat_.cropImage; - VLOG(1) << "Crop format #" << loggingUuid_ << ", width " - << params.out.video.width << ", height " << params.out.video.height - << ", format " << params.out.video.format << ", minDimension " - << params.out.video.minDimension << ", cropImage " - << params.out.video.cropImage; - - // set output format - params_ = params; - - if (params.in.video.format == AV_PIX_FMT_YUV420P) { - /* When the video width and height are not multiples of 8, - * and there is no size change in the conversion, - * a blurry screen will appear on the right side - * This problem was discovered in 2012 and - * continues to exist in version 4.1.3 in 2019 - * This problem can be avoided by increasing SWS_ACCURATE_RND - * details https://trac.ffmpeg.org/ticket/1582 - */ - if ((params.in.video.width & 0x7) || (params.in.video.height & 0x7)) { - VLOG(1) << "The width " << params.in.video.width << " and height " - << params.in.video.height << " the image is not a multiple of 8, " - << "the decoding speed may be reduced"; - swsFlags_ |= SWS_ACCURATE_RND; - } - } - - scaleContext_ = sws_getContext( - params.in.video.width, - 
params.in.video.height, - (AVPixelFormat)params.in.video.format, - scaleFormat_.width, - scaleFormat_.height, - (AVPixelFormat)scaleFormat_.format, - swsFlags_, - nullptr, - nullptr, - nullptr); - // sws_getContext might fail if in/out format == AV_PIX_FMT_PAL8 (png format) - // Return true if input and output formats/width/height are identical - // Check scaleContext_ for nullptr in transformImage to copy planes directly - - if (params.in.video.width == scaleFormat_.width && - params.in.video.height == scaleFormat_.height && - params.in.video.format == scaleFormat_.format) { - return true; - } - return scaleContext_ != nullptr; -} - -// Main body of the sample function called from one of the overloads below -// -// Args: -// srcSlice: decoded AVFrame->data prepared buffer -// srcStride: linesize (usually obtained from AVFrame->linesize) -// out: return buffer (ByteStorage*) -int VideoSampler::sample( - const uint8_t* const srcSlice[], - int srcStride[], - ByteStorage* out) { - int result; - // scaled and cropped image - int outImageSize = av_image_get_buffer_size( - (AVPixelFormat)params_.out.video.format, - params_.out.video.width, - params_.out.video.height, - 1); - - out->ensure(outImageSize); - - uint8_t* scalePlanes[4] = {nullptr}; - int scaleLines[4] = {0}; - // perform scale first - if ((result = transformImage( - scaleContext_, - srcSlice, - srcStride, - params_.in.video, - scaleFormat_, - // for crop use internal buffer - cropContext_ ? scaleBuffer_.data() : out->writableTail(), - scalePlanes, - scaleLines))) { - return result; - } - - // is crop required? - if (cropContext_) { - uint8_t* cropPlanes[4] = {nullptr}; - int cropLines[4] = {0}; - - if (params_.out.video.height < scaleFormat_.height) { - // Destination image is wider than the source image: cut top and bottom - for (size_t i = 0; i < 4 && scalePlanes[i] != nullptr; ++i) { - scalePlanes[i] += scaleLines[i] * - (scaleFormat_.height - params_.out.video.height) / 2; - } - } else { - // Source image is wider than the destination image: cut sides - for (size_t i = 0; i < 4 && scalePlanes[i] != nullptr; ++i) { - scalePlanes[i] += scaleLines[i] * - (scaleFormat_.width - params_.out.video.width) / 2 / - scaleFormat_.width; - } - } - - // crop image - if ((result = transformImage( - cropContext_, - scalePlanes, - scaleLines, - params_.out.video, - params_.out.video, - out->writableTail(), - cropPlanes, - cropLines))) { - return result; - } - } - - out->append(outImageSize); - return outImageSize; -} - -// Call from `video_stream.cpp::114` - occurs during file reads -int VideoSampler::sample(AVFrame* frame, ByteStorage* out) { - if (!frame) { - return 0; // no flush for videos - } - - return sample(frame->data, frame->linesize, out); -} - -// Call from `video_stream.cpp::114` - not sure when this occurs -int VideoSampler::sample(const ByteStorage* in, ByteStorage* out) { - if (!in) { - return 0; // no flush for videos - } - - int result; - uint8_t* inPlanes[4] = {nullptr}; - int inLineSize[4] = {0}; - - if ((result = preparePlanes( - params_.in.video, in->data(), inPlanes, inLineSize)) < 0) { - return result; - } - - return sample(inPlanes, inLineSize, out); -} - -void VideoSampler::cleanUp() { - if (scaleContext_) { - sws_freeContext(scaleContext_); - scaleContext_ = nullptr; - } - if (cropContext_) { - sws_freeContext(cropContext_); - cropContext_ = nullptr; - scaleBuffer_.clear(); - } -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/video_sampler.h b/torchvision/csrc/io/decoder/video_sampler.h deleted file mode 100644
index 47247f2c0c5..00000000000 --- a/torchvision/csrc/io/decoder/video_sampler.h +++ /dev/null @@ -1,44 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * Class transcodes video frames from one format into another - */ - -class VideoSampler : public MediaSampler { - public: - VideoSampler(int swsFlags = SWS_AREA, int64_t loggingUuid = 0); - - ~VideoSampler() override; - - // MediaSampler overrides - bool init(const SamplerParameters& params) override; - int sample(const ByteStorage* in, ByteStorage* out) override; - void shutdown() override; - - // returns the number of processed/scaled bytes - int sample(AVFrame* frame, ByteStorage* out); - int getImageBytes() const; - - private: - // close resources - void cleanUp(); - // helper functions for rescaling, cropping, etc. - int sample( - const uint8_t* const srcSlice[], - int srcStride[], - ByteStorage* out); - - private: - VideoFormat scaleFormat_; - SwsContext* scaleContext_{nullptr}; - SwsContext* cropContext_{nullptr}; - int swsFlags_{SWS_AREA}; - std::vector<uint8_t> scaleBuffer_; - int64_t loggingUuid_{0}; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/video_stream.cpp b/torchvision/csrc/io/decoder/video_stream.cpp deleted file mode 100644 index fa08c65cac1..00000000000 --- a/torchvision/csrc/io/decoder/video_stream.cpp +++ /dev/null @@ -1,131 +0,0 @@ -#include "video_stream.h" -#include -#include "util.h" - -namespace ffmpeg { - -namespace { -bool operator==(const VideoFormat& x, const AVFrame& y) { - return x.width == static_cast<size_t>(y.width) && - x.height == static_cast<size_t>(y.height) && x.format == y.format; -} - -bool operator==(const VideoFormat& x, const AVCodecContext& y) { - return x.width == static_cast<size_t>(y.width) && - x.height == static_cast<size_t>(y.height) && x.format == y.pix_fmt; -} - -VideoFormat& toVideoFormat(VideoFormat& x, const AVFrame& y) { - x.width = y.width; - x.height = y.height; - x.format = y.format; - return x; -} - -VideoFormat& toVideoFormat(VideoFormat& x, const AVCodecContext& y) { - x.width = y.width; - x.height = y.height; - x.format = y.pix_fmt; - return x; -} -} // namespace - -VideoStream::VideoStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const VideoFormat& format, - int64_t loggingUuid) - : Stream( - inputCtx, - MediaFormat::makeMediaFormat(format, index), - convertPtsToWallTime, - loggingUuid) {} - -VideoStream::~VideoStream() { - if (sampler_) { - sampler_->shutdown(); - sampler_.reset(); - } -} - -int VideoStream::initFormat() { - // set output format - if (!Util::validateVideoFormat(format_.format.video)) { - LOG(ERROR) << "Invalid video format" - << ", width: " << format_.format.video.width - << ", height: " << format_.format.video.height - << ", format: " << format_.format.video.format - << ", minDimension: " << format_.format.video.minDimension - << ", crop: " << format_.format.video.cropImage; - return -1; - } - - // keep aspect ratio - Util::setFormatDimensions( - format_.format.video.width, - format_.format.video.height, - format_.format.video.width, - format_.format.video.height, - codecCtx_->width, - codecCtx_->height, - format_.format.video.minDimension, - format_.format.video.maxDimension, - 0); - - if (format_.format.video.format == AV_PIX_FMT_NONE) { - format_.format.video.format = codecCtx_->pix_fmt; - } - return format_.format.video.width != 0 && format_.format.video.height != 0 && - format_.format.video.format != AV_PIX_FMT_NONE - ? 
0 - : -1; -} - -// copies frame bytes via sws_scale call in video_sampler.cpp -int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) { - if (!sampler_) { - sampler_ = std::make_unique(SWS_AREA, loggingUuid_); - } - - // check if input format gets changed - if (flush ? !(sampler_->getInputFormat().video == *codecCtx_) - : !(sampler_->getInputFormat().video == *frame_)) { - // - reinit sampler - SamplerParameters params; - params.type = format_.type; - params.out = format_.format; - params.in = FormatUnion(0); - flush ? toVideoFormat(params.in.video, *codecCtx_) - : toVideoFormat(params.in.video, *frame_); - if (!sampler_->init(params)) { - return -1; - } - - VLOG(1) << "Set input video sampler format" - << ", width: " << params.in.video.width - << ", height: " << params.in.video.height - << ", format: " << params.in.video.format - << " : output video sampler format" - << ", width: " << format_.format.video.width - << ", height: " << format_.format.video.height - << ", format: " << format_.format.video.format - << ", minDimension: " << format_.format.video.minDimension - << ", crop: " << format_.format.video.cropImage; - } - // calls to a sampler that converts the frame from YUV422 to RGB24, and - // optionally crops and resizes the frame. Frame bytes are copied from - // frame_->data to out buffer - return sampler_->sample(flush ? nullptr : frame_, out); -} - -void VideoStream::setHeader(DecoderHeader* header, bool flush) { - Stream::setHeader(header, flush); - if (!flush) { // no frames for video flush - header->keyFrame = frame_->key_frame; - header->fps = av_q2d(av_guess_frame_rate( - inputCtx_, inputCtx_->streams[format_.stream], nullptr)); - } -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/video_stream.h b/torchvision/csrc/io/decoder/video_stream.h deleted file mode 100644 index e6a8bf02b65..00000000000 --- a/torchvision/csrc/io/decoder/video_stream.h +++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -#include "stream.h" -#include "video_sampler.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode one video stream. - */ - -class VideoStream : public Stream { - public: - VideoStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const VideoFormat& format, - int64_t loggingUuid); - ~VideoStream() override; - - private: - int initFormat() override; - int copyFrameBytes(ByteStorage* out, bool flush) override; - void setHeader(DecoderHeader* header, bool flush) override; - - private: - std::unique_ptr sampler_; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/video/video.cpp b/torchvision/csrc/io/video/video.cpp deleted file mode 100644 index 8f1fb3fb5b9..00000000000 --- a/torchvision/csrc/io/video/video.cpp +++ /dev/null @@ -1,387 +0,0 @@ -#include "video.h" - -#include - -using namespace ffmpeg; - -namespace vision { -namespace video { - -namespace { - -const size_t decoderTimeoutMs = 600000; -const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24; - -// returns number of written bytes -template -size_t fillTensorList(DecoderOutputMessage& msgs, torch::Tensor& frame) { - const auto& msg = msgs; - T* frameData = frame.numel() > 0 ? 
frame.data_ptr() : nullptr; - if (frameData) { - auto sizeInBytes = msg.payload->length(); - memcpy(frameData, msg.payload->data(), sizeInBytes); - } - return sizeof(T); -} - -size_t fillVideoTensor(DecoderOutputMessage& msgs, torch::Tensor& videoFrame) { - return fillTensorList(msgs, videoFrame); -} - -size_t fillAudioTensor(DecoderOutputMessage& msgs, torch::Tensor& audioFrame) { - return fillTensorList(msgs, audioFrame); -} - -std::array, 4>::const_iterator -_parse_type(const std::string& stream_string) { - static const std::array, 4> types = {{ - {"video", TYPE_VIDEO}, - {"audio", TYPE_AUDIO}, - {"subtitle", TYPE_SUBTITLE}, - {"cc", TYPE_CC}, - }}; - auto device = std::find_if( - types.begin(), - types.end(), - [stream_string](const std::pair& p) { - return p.first == stream_string; - }); - if (device != types.end()) { - return device; - } - TORCH_CHECK( - false, "Expected one of [audio, video, subtitle, cc] ", stream_string); -} - -std::string parse_type_to_string(const std::string& stream_string) { - auto device = _parse_type(stream_string); - return device->first; -} - -MediaType parse_type_to_mt(const std::string& stream_string) { - auto device = _parse_type(stream_string); - return device->second; -} - -std::tuple _parseStream(const std::string& streamString) { - TORCH_CHECK(!streamString.empty(), "Stream string must not be empty"); - static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?"); - std::smatch match; - - TORCH_CHECK( - std::regex_match(streamString, match, regex), - "Invalid stream string: '", - streamString, - "'"); - - std::string type_ = "video"; - type_ = parse_type_to_string(match[1].str()); - long index_ = -1; - if (match[2].matched) { - try { - index_ = std::stoi(match[2].str()); - } catch (const std::exception&) { - TORCH_CHECK( - false, - "Could not parse device index '", - match[2].str(), - "' in device string '", - streamString, - "'"); - } - } - return std::make_tuple(type_, index_); -} - -} // namespace - -void Video::_getDecoderParams( - double videoStartS, - int64_t getPtsOnly, - std::string stream, - long stream_id = -1, - bool fastSeek = true, - bool all_streams = false, - int64_t num_threads = 1, - double seekFrameMarginUs = 10) { - int64_t videoStartUs = int64_t(videoStartS * 1e6); - - params.timeoutMs = decoderTimeoutMs; - params.startOffset = videoStartUs; - params.seekAccuracy = seekFrameMarginUs; - params.fastSeek = fastSeek; - params.headerOnly = false; - params.numThreads = num_threads; - - params.preventStaleness = false; // not sure what this is about - - if (all_streams == true) { - MediaFormat format; - format.stream = -2; - format.type = TYPE_AUDIO; - params.formats.insert(format); - - format.type = TYPE_VIDEO; - format.stream = -2; - format.format.video.width = 0; - format.format.video.height = 0; - format.format.video.cropImage = 0; - format.format.video.format = defaultVideoPixelFormat; - params.formats.insert(format); - - format.type = TYPE_SUBTITLE; - format.stream = -2; - params.formats.insert(format); - - format.type = TYPE_CC; - format.stream = -2; - params.formats.insert(format); - } else { - // parse stream type - MediaType stream_type = parse_type_to_mt(stream); - - // TODO: reset params.formats - std::set formats; - params.formats = formats; - // Define new format - MediaFormat format; - format.type = stream_type; - format.stream = stream_id; - if (stream_type == TYPE_VIDEO) { - format.format.video.width = 0; - format.format.video.height = 0; - format.format.video.cropImage = 0; - format.format.video.format = 
defaultVideoPixelFormat; - } - params.formats.insert(format); - } - -} // _get decoder params - -void Video::initFromFile( - std::string videoPath, - std::string stream, - int64_t numThreads) { - TORCH_CHECK(!initialized, "Video object can only be initialized once"); - initialized = true; - params.uri = videoPath; - _init(stream, numThreads); -} - -void Video::initFromMemory( - torch::Tensor videoTensor, - std::string stream, - int64_t numThreads) { - TORCH_CHECK(!initialized, "Video object can only be initialized once"); - initialized = true; - callback = MemoryBuffer::getCallback( - videoTensor.data_ptr(), videoTensor.size(0)); - _init(stream, numThreads); -} - -void Video::_init(std::string stream, int64_t numThreads) { - // set number of threads global - numThreads_ = numThreads; - // parse stream information - current_stream = _parseStream(stream); - // note that in the initial call we want to get all streams - _getDecoderParams( - 0, // video start - 0, // headerOnly - std::get<0>(current_stream), // stream info - remove that - long(-1), // stream_id parsed from info above change to -2 - false, // fastseek: we're using the default param here - true, // read all streams - numThreads_ // global number of Threads for decoding - ); - - std::string logMessage, logType; - - // locals - std::vector audioFPS, videoFPS; - std::vector audioDuration, videoDuration, ccDuration, subsDuration; - std::vector audioTB, videoTB, ccTB, subsTB; - c10::Dict> audioMetadata; - c10::Dict> videoMetadata; - c10::Dict> ccMetadata; - c10::Dict> subsMetadata; - - // callback and metadata defined in struct - DecoderInCallback tmp_callback = callback; - succeeded = decoder.init(params, std::move(tmp_callback), &metadata); - if (succeeded) { - for (const auto& header : metadata) { - double fps = double(header.fps); - double duration = double(header.duration) * 1e-6; // * timeBase; - - if (header.format.type == TYPE_VIDEO) { - videoFPS.push_back(fps); - videoDuration.push_back(duration); - } else if (header.format.type == TYPE_AUDIO) { - audioFPS.push_back(fps); - audioDuration.push_back(duration); - } else if (header.format.type == TYPE_CC) { - ccDuration.push_back(duration); - } else if (header.format.type == TYPE_SUBTITLE) { - subsDuration.push_back(duration); - }; - } - } - // audio - audioMetadata.insert("duration", audioDuration); - audioMetadata.insert("framerate", audioFPS); - // video - videoMetadata.insert("duration", videoDuration); - videoMetadata.insert("fps", videoFPS); - // subs - subsMetadata.insert("duration", subsDuration); - // cc - ccMetadata.insert("duration", ccDuration); - // put all to a data - streamsMetadata.insert("video", videoMetadata); - streamsMetadata.insert("audio", audioMetadata); - streamsMetadata.insert("subtitles", subsMetadata); - streamsMetadata.insert("cc", ccMetadata); - - succeeded = setCurrentStream(stream); - if (std::get<1>(current_stream) != -1) { - LOG(INFO) - << "Stream index set to " << std::get<1>(current_stream) - << ". If you encounter trouble, consider switching it to automatic stream discovery. 
\n"; -} -} - -Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { - C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video"); - if (!videoPath.empty()) { - initFromFile(videoPath, stream, numThreads); - } -} // video - -bool Video::setCurrentStream(std::string stream = "video") { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - if ((!stream.empty()) && (_parseStream(stream) != current_stream)) { - current_stream = _parseStream(stream); - } - - double ts = 0; - if (seekTS > 0) { - ts = seekTS; - } - - _getDecoderParams( - ts, // video start - 0, // headerOnly - std::get<0>(current_stream), // stream - long(std::get<1>( - current_stream)), // stream_id parsed from info above change to -2 - false, // fastseek param set to 0 false by default (changed in seek) - false, // read all streams - numThreads_ // global number of threads - ); - - // callback and metadata defined in Video.h - DecoderInCallback tmp_callback = callback; - return (decoder.init(params, std::move(tmp_callback), &metadata)); -} - -std::tuple<std::string, long> Video::getCurrentStream() const { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - return current_stream; -} - -c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>> Video:: - getStreamMetadata() const { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - return streamsMetadata; -} - -void Video::Seek(double ts, bool fastSeek = false) { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - // initialize the class variables used for seeking and return - _getDecoderParams( - ts, // video start - 0, // headerOnly - std::get<0>(current_stream), // stream - long(std::get<1>( - current_stream)), // stream_id parsed from info above change to -2 - fastSeek, // fastseek - false, // read all streams - numThreads_ // global number of threads - ); - - // callback and metadata defined in Video.h - DecoderInCallback tmp_callback = callback; - succeeded = decoder.init(params, std::move(tmp_callback), &metadata); -} - -std::tuple<torch::Tensor, double> Video::Next() { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - // if failing to decode simply return a null tensor (note, should we - // raise an exception?)
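- // Note: both the ENODATA (end of stream) and hard-error paths below fall - // through to the same empty tensor, so callers can only distinguish them - // via the logs (and frame_pts_s is returned uninitialized in that case).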
- double frame_pts_s; - torch::Tensor outFrame = torch::zeros({0}, torch::kByte); - - // decode single frame - DecoderOutputMessage out; - int64_t res = decoder.decode(&out, decoderTimeoutMs); - // if successful - if (res == 0) { - frame_pts_s = double(double(out.header.pts) * 1e-6); - - auto header = out.header; - const auto& format = header.format; - - // initialize the output variables based on type - - if (format.type == TYPE_VIDEO) { - // note: this can potentially be optimized - // by having the global tensor that we fill at decode time - // (would avoid allocations) - int outHeight = format.format.video.height; - int outWidth = format.format.video.width; - int numChannels = 3; - outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte); - fillVideoTensor(out, outFrame); - outFrame = outFrame.permute({2, 0, 1}); - - } else if (format.type == TYPE_AUDIO) { - int outAudioChannels = format.format.audio.channels; - int bytesPerSample = av_get_bytes_per_sample( - static_cast(format.format.audio.format)); - int frameSizeTotal = out.payload->length(); - - TORCH_CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0); - int numAudioSamples = - frameSizeTotal / (outAudioChannels * bytesPerSample); - - outFrame = - torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat); - - fillAudioTensor(out, outFrame); - } - // currently not supporting other formats (will do soon) - - out.payload.reset(); - } else if (res == ENODATA) { - LOG(INFO) << "Decoder ran out of frames (ENODATA)\n"; - } else { - LOG(ERROR) << "Decoder failed with ERROR_CODE " << res; - } - - return std::make_tuple(outFrame, frame_pts_s); -} - -static auto registerVideo = - torch::class_
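For context on what this removal takes away, here is a minimal sketch of how the Video API deleted above was typically driven; the file name is hypothetical, and the frame layout follows the permute in Video::Next:

#include <torch/torch.h>

#include "video.h"  // the torchvision header removed by this diff

int main() {
  // Open a container and select the default video stream, decoding on a
  // single thread (see Video::initFromFile above).
  vision::video::Video video("input.mp4", "video", /*numThreads=*/1);

  // Accurate (non-fast) seek back to the start of the stream.
  video.Seek(0.0, /*fastSeek=*/false);

  while (true) {
    // Next() returns the decoded frame and its presentation timestamp in
    // seconds; on EOF or decode error it returns an empty (0-element) tensor.
    auto [frame, ptsSeconds] = video.Next();
    if (frame.numel() == 0) {
      break;
    }
    // For video streams, frame is a CHW uint8 RGB tensor (see the permute in
    // Video::Next); audio streams yield a [samples, channels] float tensor.
  }
  return 0;
}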