diff --git a/.github/scripts/setup-env.sh b/.github/scripts/setup-env.sh
index e1c5855f31c..64b4aa7fa90 100755
--- a/.github/scripts/setup-env.sh
+++ b/.github/scripts/setup-env.sh
@@ -23,15 +23,13 @@ case $(uname) in
 esac
 
 echo '::group::Create build environment'
-# See https://github.com/pytorch/vision/issues/7296 for ffmpeg
 conda create \
   --name ci \
   --quiet --yes \
   python="${PYTHON_VERSION}" pip \
   ninja cmake \
   libpng \
-  libwebp \
-  'ffmpeg<4.3'
+  libwebp
 conda activate ci
 conda install --quiet --yes libjpeg-turbo -c pytorch
 pip install --progress-bar=off --upgrade setuptools==72.1.0
diff --git a/.github/scripts/unittest.sh b/.github/scripts/unittest.sh
index 43968762a8b..e5ec6eedacd 100755
--- a/.github/scripts/unittest.sh
+++ b/.github/scripts/unittest.sh
@@ -14,5 +14,4 @@ echo '::endgroup::'
 
 python test/smoke_test.py
 
-# We explicitly ignore the video tests until we resolve https://github.com/pytorch/vision/issues/8162
-pytest --ignore-glob="*test_video*" --ignore-glob="*test_onnx*" --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 -k "not TestFxFeatureExtraction"
+pytest --ignore-glob="*test_onnx*" --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 -k "not TestFxFeatureExtraction"
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 8b341622181..1b246cc01a6 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -34,12 +34,6 @@ jobs:
           CONDA_PATH=$(which conda)
           eval "$(${CONDA_PATH} shell.bash hook)"
           conda activate ci
 
-          # FIXME: not sure why we need this. `ldd torchvision/video_reader.so` shows that it
-          # already links against the one pulled from conda. However, at runtime it pulls from
-          # /lib64
-          # Should we maybe always do this in `./.github/scripts/setup-env.sh` so that we don't
-          # have to pay attention in all other workflows?
-          export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}"
 
           cd docs
diff --git a/docs/source/io.rst b/docs/source/io.rst
index 478321a4e6d..72a6edd965b 100644
--- a/docs/source/io.rst
+++ b/docs/source/io.rst
@@ -1,10 +1,10 @@
-Decoding / Encoding images and videos
-=====================================
+Decoding / Encoding images
+==========================
 
 .. currentmodule:: torchvision.io
 
 The :mod:`torchvision.io` module provides utilities for decoding and encoding
-images and videos.
+images.
 
 Image Decoding
 --------------
@@ -92,7 +92,7 @@ Video - DEPRECATED
 .. warning::
 
     DEPRECATED: All the video decoding and encoding capabilities of torchvision
-    are deprecated from version 0.22 and will be removed in version 0.24. We
+    are deprecated from version 0.22 and will be removed in version 0.25. We
     recommend that you migrate to
     `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll
     consolidate the future decoding/encoding capabilities of PyTorch
@@ -101,19 +101,4 @@ Video - DEPRECATED
     :toctree: generated/
     :template: function.rst
 
-    read_video
-    read_video_timestamps
     write_video
-
-
-**Fine-grained video API**
-
-In addition to the :mod:`read_video` function, we provide a high-performance
-lower-level API for more fine-grained control compared to the :mod:`read_video` function.
-It does all this whilst fully supporting torchscript.
-
-.. autosummary::
-    :toctree: generated/
-    :template: class.rst
-
-    VideoReader
diff --git a/mypy.ini b/mypy.ini
index e25212a169d..a68e48f27ef 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -41,10 +41,6 @@ ignore_errors = True
 
 ignore_errors = True
 
-[mypy-torchvision.io.video_reader]
-
-ignore_errors = True
-
 [mypy-torchvision.models.*]
 
 ignore_errors=True
diff --git a/packaging/pre_build_script.sh b/packaging/pre_build_script.sh
index fcacf4bf8a4..b9046aa81c5 100644
--- a/packaging/pre_build_script.sh
+++ b/packaging/pre_build_script.sh
@@ -17,7 +17,6 @@ if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then
    # Installing webp also installs a non-turbo jpeg, so we uninstall jpeg stuff
    # before re-installing them
    conda uninstall libjpeg-turbo libjpeg -y
-    conda install -y ffmpeg=4.2 -c pytorch
    conda install -y libjpeg-turbo -c pytorch
 
    # Copy binaries to be included in the wheel distribution
@@ -30,7 +29,7 @@ else
 
    if [[ "$ARCH" == "aarch64" ]]; then
        conda install libpng -y
-        conda install -y ffmpeg=4.2 libjpeg-turbo -c pytorch-nightly
+        conda install -y libjpeg-turbo -c pytorch-nightly
    fi
 
    conda install libwebp -y
diff --git a/packaging/wheel/relocate.py b/packaging/wheel/relocate.py
index 4587f3798da..3b9e00c3b59 100644
--- a/packaging/wheel/relocate.py
+++ b/packaging/wheel/relocate.py
@@ -316,8 +316,9 @@ def patch_linux():
     output_dir = osp.join(PACKAGE_ROOT, "dist", ".wheel-process")
 
     image_binary = "image.so"
-    video_binary = "video_reader.so"
-    torchvision_binaries = [image_binary, video_binary]
+    torchvision_binaries = [
+        image_binary,
+    ]
     for wheel in wheels:
         if osp.exists(output_dir):
             shutil.rmtree(output_dir)
@@ -352,8 +353,9 @@ def patch_win():
     output_dir = osp.join(PACKAGE_ROOT, "dist", ".wheel-process")
 
     image_binary = "image.pyd"
-    video_binary = "video_reader.pyd"
-    torchvision_binaries = [image_binary, video_binary]
+    torchvision_binaries = [
+        image_binary,
+    ]
     for wheel in wheels:
         if osp.exists(output_dir):
             shutil.rmtree(output_dir)
diff --git a/setup.py b/setup.py
index 5e69fa50f52..5e1cb9dfba6 100644
--- a/setup.py
+++ b/setup.py
@@ -22,14 +22,6 @@
 USE_WEBP = os.getenv("TORCHVISION_USE_WEBP", "1") == "1"
 USE_NVJPEG = os.getenv("TORCHVISION_USE_NVJPEG", "1") == "1"
 NVCC_FLAGS = os.getenv("NVCC_FLAGS", None)
-# Note: the GPU video decoding stuff used to be called "video codec", which
-# isn't an accurate or descriptive name considering there are at least 2 other
-# video decoding backends in torchvision. I'm renaming this to "gpu video
-# decoder" where possible, keeping user facing names (like the env var below) to
-# the old scheme for BC.
-USE_GPU_VIDEO_DECODER = os.getenv("TORCHVISION_USE_VIDEO_CODEC", "1") == "1"
-# Same here: "use ffmpeg" was used to denote "use cpu video decoder".
-USE_CPU_VIDEO_DECODER = os.getenv("TORCHVISION_USE_FFMPEG", "1") == "1"
 TORCHVISION_INCLUDE = os.environ.get("TORCHVISION_INCLUDE", "")
 TORCHVISION_LIBRARY = os.environ.get("TORCHVISION_LIBRARY", "")
 
@@ -52,8 +44,6 @@
 print(f"{USE_WEBP = }")
 print(f"{USE_NVJPEG = }")
 print(f"{NVCC_FLAGS = }")
-print(f"{USE_CPU_VIDEO_DECODER = }")
-print(f"{USE_GPU_VIDEO_DECODER = }")
 print(f"{TORCHVISION_INCLUDE = }")
 print(f"{TORCHVISION_LIBRARY = }")
 print(f"{IS_ROCM = }")
@@ -371,158 +361,6 @@ def make_image_extension():
     )
 
 
-def make_video_decoders_extensions():
-    print("Building video decoder extensions")
-
-    build_without_extensions_msg = "Building without video decoders extensions."
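
# The build logic deleted here produced the CPU/GPU video decoders that backed
# torchvision's read_video() and VideoReader. The docs change above points users
# at TorchCodec instead; a minimal migration sketch, assuming TorchCodec's
# documented VideoDecoder API ("video.mp4" is a placeholder path, and nothing
# in this snippet is part of the patch itself):
from torchcodec.decoders import VideoDecoder

decoder = VideoDecoder("video.mp4")         # accepts a path, raw bytes, or a tensor
frame = decoder[0]                          # uint8 tensor of shape (C, H, W)
fps = decoder.metadata.average_fps          # stands in for info["video_fps"]
clip = decoder.get_frames_played_in_range(  # stands in for read_video(start, end)
    start_seconds=1.0,
    stop_seconds=2.0,
).data
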
- if sys.platform != "linux" or (sys.version_info.major == 3 and sys.version_info.minor == 9): - # FIXME: Building torchvision with ffmpeg on MacOS or with Python 3.9 - # FIXME: causes crash. See the following GitHub issues for more details. - # FIXME: https://github.com/pytorch/pytorch/issues/65000 - # FIXME: https://github.com/pytorch/vision/issues/3367 - print("Can only build video decoder extensions on linux and Python != 3.9") - return [] - - ffmpeg_exe = shutil.which("ffmpeg") - if ffmpeg_exe is None: - print(f"{build_without_extensions_msg} Couldn't find ffmpeg binary.") - return [] - - def find_ffmpeg_libraries(): - ffmpeg_libraries = {"libavcodec", "libavformat", "libavutil", "libswresample", "libswscale"} - - ffmpeg_bin = os.path.dirname(ffmpeg_exe) - ffmpeg_root = os.path.dirname(ffmpeg_bin) - ffmpeg_include_dir = os.path.join(ffmpeg_root, "include") - ffmpeg_library_dir = os.path.join(ffmpeg_root, "lib") - - gcc = os.environ.get("CC", shutil.which("gcc")) - platform_tag = subprocess.run([gcc, "-print-multiarch"], stdout=subprocess.PIPE) - platform_tag = platform_tag.stdout.strip().decode("utf-8") - - if platform_tag: - # Most probably a Debian-based distribution - ffmpeg_include_dir = [ffmpeg_include_dir, os.path.join(ffmpeg_include_dir, platform_tag)] - ffmpeg_library_dir = [ffmpeg_library_dir, os.path.join(ffmpeg_library_dir, platform_tag)] - else: - ffmpeg_include_dir = [ffmpeg_include_dir] - ffmpeg_library_dir = [ffmpeg_library_dir] - - for library in ffmpeg_libraries: - library_found = False - for search_path in ffmpeg_include_dir + TORCHVISION_INCLUDE: - full_path = os.path.join(search_path, library, "*.h") - library_found |= len(glob.glob(full_path)) > 0 - - if not library_found: - print(f"{build_without_extensions_msg}") - print(f"{library} header files were not found.") - return None, None - - return ffmpeg_include_dir, ffmpeg_library_dir - - ffmpeg_include_dir, ffmpeg_library_dir = find_ffmpeg_libraries() - if ffmpeg_include_dir is None or ffmpeg_library_dir is None: - return [] - - print("Found ffmpeg:") - print(f" ffmpeg include path: {ffmpeg_include_dir}") - print(f" ffmpeg library_dir: {ffmpeg_library_dir}") - - extensions = [] - if USE_CPU_VIDEO_DECODER: - print("Building with CPU video decoder support") - - # TorchVision base decoder + video reader - video_reader_src_dir = os.path.join(ROOT_DIR, "torchvision", "csrc", "io", "video_reader") - video_reader_src = glob.glob(os.path.join(video_reader_src_dir, "*.cpp")) - base_decoder_src_dir = os.path.join(ROOT_DIR, "torchvision", "csrc", "io", "decoder") - base_decoder_src = glob.glob(os.path.join(base_decoder_src_dir, "*.cpp")) - # Torchvision video API - videoapi_src_dir = os.path.join(ROOT_DIR, "torchvision", "csrc", "io", "video") - videoapi_src = glob.glob(os.path.join(videoapi_src_dir, "*.cpp")) - # exclude tests - base_decoder_src = [x for x in base_decoder_src if "_test.cpp" not in x] - - combined_src = video_reader_src + base_decoder_src + videoapi_src - - extensions.append( - CppExtension( - # This is an awful name. It should be "cpu_video_decoder". Keeping for BC. 
- "torchvision.video_reader", - combined_src, - include_dirs=[ - base_decoder_src_dir, - video_reader_src_dir, - videoapi_src_dir, - str(CSRS_DIR), - *ffmpeg_include_dir, - *TORCHVISION_INCLUDE, - ], - library_dirs=ffmpeg_library_dir + TORCHVISION_LIBRARY, - libraries=[ - "avcodec", - "avformat", - "avutil", - "swresample", - "swscale", - ], - extra_compile_args=["-std=c++17"] if os.name != "nt" else ["/std:c++17", "/MP"], - extra_link_args=["-std=c++17" if os.name != "nt" else "/std:c++17"], - ) - ) - - if USE_GPU_VIDEO_DECODER: - # Locating GPU video decoder headers and libraries - # CUDA_HOME should be set to the cuda root directory. - # TORCHVISION_INCLUDE and TORCHVISION_LIBRARY should include the locations - # to the headers and libraries below - if not ( - BUILD_CUDA_SOURCES - and CUDA_HOME is not None - and any([os.path.exists(os.path.join(folder, "cuviddec.h")) for folder in TORCHVISION_INCLUDE]) - and any([os.path.exists(os.path.join(folder, "nvcuvid.h")) for folder in TORCHVISION_INCLUDE]) - and any([os.path.exists(os.path.join(folder, "libnvcuvid.so")) for folder in TORCHVISION_LIBRARY]) - and any([os.path.exists(os.path.join(folder, "libavcodec", "bsf.h")) for folder in ffmpeg_include_dir]) - ): - print("Could not find necessary dependencies. Refer the setup.py to check which ones are needed.") - print("Building without GPU video decoder support") - return extensions - print("Building torchvision with GPU video decoder support") - - gpu_decoder_path = os.path.join(CSRS_DIR, "io", "decoder", "gpu") - gpu_decoder_src = glob.glob(os.path.join(gpu_decoder_path, "*.cpp")) - cuda_libs = os.path.join(CUDA_HOME, "lib64") - cuda_inc = os.path.join(CUDA_HOME, "include") - - _, extra_compile_args = get_macros_and_flags() - extensions.append( - CUDAExtension( - "torchvision.gpu_decoder", - gpu_decoder_src, - include_dirs=[CSRS_DIR] + TORCHVISION_INCLUDE + [gpu_decoder_path] + [cuda_inc] + ffmpeg_include_dir, - library_dirs=ffmpeg_library_dir + TORCHVISION_LIBRARY + [cuda_libs], - libraries=[ - "avcodec", - "avformat", - "avutil", - "swresample", - "swscale", - "nvcuvid", - "cuda", - "cudart", - "z", - "pthread", - "dl", - "nppicc", - ], - extra_compile_args=extra_compile_args, - ) - ) - - return extensions - - class clean(distutils.command.clean.clean): def run(self): with open(".gitignore") as f: @@ -550,7 +388,6 @@ def run(self): extensions = [ make_C_extension(), make_image_extension(), - *make_video_decoders_extensions(), ] setup( diff --git a/test/test_datasets_video_utils_opt.py b/test/test_datasets_video_utils_opt.py deleted file mode 100644 index 5e6b19bfb95..00000000000 --- a/test/test_datasets_video_utils_opt.py +++ /dev/null @@ -1,12 +0,0 @@ -import unittest - -import test_datasets_video_utils -from torchvision import set_video_backend # noqa: 401 - -# Disabling the video backend switching temporarily -# set_video_backend('video_reader') - - -if __name__ == "__main__": - suite = unittest.TestLoader().loadTestsFromModule(test_datasets_video_utils) - unittest.TextTestRunner(verbosity=1).run(suite) diff --git a/test/test_io.py b/test/test_io.py deleted file mode 100644 index d2950ac9595..00000000000 --- a/test/test_io.py +++ /dev/null @@ -1,292 +0,0 @@ -import contextlib -import os -import sys -import tempfile - -import pytest -import torch -import torchvision.io as io -from common_utils import assert_equal, cpu_and_cuda -from torchvision import get_video_backend - - -try: - import av - - # Do a version test too - io.video._check_av_available() -except ImportError: - av = None - - 
-VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos") - - -def _create_video_frames(num_frames, height, width): - y, x = torch.meshgrid(torch.linspace(-2, 2, height), torch.linspace(-2, 2, width), indexing="ij") - data = [] - for i in range(num_frames): - xc = float(i) / num_frames - yc = 1 - float(i) / (2 * num_frames) - d = torch.exp(-((x - xc) ** 2 + (y - yc) ** 2) / 2) * 255 - data.append(d.unsqueeze(2).repeat(1, 1, 3).byte()) - - return torch.stack(data, 0) - - -@contextlib.contextmanager -def temp_video(num_frames, height, width, fps, lossless=False, video_codec=None, options=None): - if lossless: - if video_codec is not None: - raise ValueError("video_codec can't be specified together with lossless") - if options is not None: - raise ValueError("options can't be specified together with lossless") - video_codec = "libx264rgb" - options = {"crf": "0"} - - if video_codec is None: - if get_video_backend() == "pyav": - video_codec = "libx264" - else: - # when video_codec is not set, we assume it is libx264rgb which accepts - # RGB pixel formats as input instead of YUV - video_codec = "libx264rgb" - if options is None: - options = {} - - data = _create_video_frames(num_frames, height, width) - with tempfile.NamedTemporaryFile(suffix=".mp4") as f: - f.close() - io.write_video(f.name, data, fps=fps, video_codec=video_codec, options=options) - yield f.name, data - os.unlink(f.name) - - -@pytest.mark.skipif( - get_video_backend() != "pyav" and not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend not available" -) -@pytest.mark.skipif(av is None, reason="PyAV unavailable") -class TestVideo: - # compression adds artifacts, thus we add a tolerance of - # 6 in 0-255 range - TOLERANCE = 6 - - def test_write_read_video(self): - with temp_video(10, 300, 300, 5, lossless=True) as (f_name, data): - lv, _, info = io.read_video(f_name) - assert_equal(data, lv) - assert info["video_fps"] == 5 - - @pytest.mark.skipif(not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend is not chosen") - def test_probe_video_from_file(self): - with temp_video(10, 300, 300, 5) as (f_name, data): - video_info = io._probe_video_from_file(f_name) - assert pytest.approx(2, rel=0.0, abs=0.1) == video_info.video_duration - assert pytest.approx(5, rel=0.0, abs=0.1) == video_info.video_fps - - @pytest.mark.skipif(not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend is not chosen") - def test_probe_video_from_memory(self): - with temp_video(10, 300, 300, 5) as (f_name, data): - with open(f_name, "rb") as fp: - filebuffer = fp.read() - video_info = io._probe_video_from_memory(filebuffer) - assert pytest.approx(2, rel=0.0, abs=0.1) == video_info.video_duration - assert pytest.approx(5, rel=0.0, abs=0.1) == video_info.video_fps - - def test_read_timestamps(self): - with temp_video(10, 300, 300, 5) as (f_name, data): - pts, _ = io.read_video_timestamps(f_name) - # note: not all formats/codecs provide accurate information for computing the - # timestamps. 
For the format that we use here, this information is available, - # so we use it as a baseline - with av.open(f_name) as container: - stream = container.streams[0] - pts_step = int(round(float(1 / (stream.average_rate * stream.time_base)))) - num_frames = int(round(float(stream.average_rate * stream.time_base * stream.duration))) - expected_pts = [i * pts_step for i in range(num_frames)] - - assert pts == expected_pts - - @pytest.mark.parametrize("start", range(5)) - @pytest.mark.parametrize("offset", range(1, 4)) - def test_read_partial_video(self, start, offset): - with temp_video(10, 300, 300, 5, lossless=True) as (f_name, data): - pts, _ = io.read_video_timestamps(f_name) - - lv, _, _ = io.read_video(f_name, pts[start], pts[start + offset - 1]) - s_data = data[start : (start + offset)] - assert len(lv) == offset - assert_equal(s_data, lv) - - if get_video_backend() == "pyav": - # for "video_reader" backend, we don't decode the closest early frame - # when the given start pts is not matching any frame pts - lv, _, _ = io.read_video(f_name, pts[4] + 1, pts[7]) - assert len(lv) == 4 - assert_equal(data[4:8], lv) - - @pytest.mark.parametrize("start", range(0, 80, 20)) - @pytest.mark.parametrize("offset", range(1, 4)) - def test_read_partial_video_bframes(self, start, offset): - # do not use lossless encoding, to test the presence of B-frames - options = {"bframes": "16", "keyint": "10", "min-keyint": "4"} - with temp_video(100, 300, 300, 5, options=options) as (f_name, data): - pts, _ = io.read_video_timestamps(f_name) - - lv, _, _ = io.read_video(f_name, pts[start], pts[start + offset - 1]) - s_data = data[start : (start + offset)] - assert len(lv) == offset - assert_equal(s_data, lv, rtol=0.0, atol=self.TOLERANCE) - - lv, _, _ = io.read_video(f_name, pts[4] + 1, pts[7]) - # TODO fix this - if get_video_backend() == "pyav": - assert len(lv) == 4 - assert_equal(data[4:8], lv, rtol=0.0, atol=self.TOLERANCE) - else: - assert len(lv) == 3 - assert_equal(data[5:8], lv, rtol=0.0, atol=self.TOLERANCE) - - def test_read_packed_b_frames_divx_file(self): - name = "hmdb51_Turnk_r_Pippi_Michel_cartwheel_f_cm_np2_le_med_6.avi" - f_name = os.path.join(VIDEO_DIR, name) - pts, fps = io.read_video_timestamps(f_name) - - assert pts == sorted(pts) - assert fps == 30 - - def test_read_timestamps_from_packet(self): - with temp_video(10, 300, 300, 5, video_codec="mpeg4") as (f_name, data): - pts, _ = io.read_video_timestamps(f_name) - # note: not all formats/codecs provide accurate information for computing the - # timestamps. 
For the format that we use here, this information is available, - # so we use it as a baseline - with av.open(f_name) as container: - stream = container.streams[0] - # make sure we went through the optimized codepath - assert b"Lavc" in stream.codec_context.extradata - pts_step = int(round(float(1 / (stream.average_rate * stream.time_base)))) - num_frames = int(round(float(stream.average_rate * stream.time_base * stream.duration))) - expected_pts = [i * pts_step for i in range(num_frames)] - - assert pts == expected_pts - - def test_read_video_pts_unit_sec(self): - with temp_video(10, 300, 300, 5, lossless=True) as (f_name, data): - lv, _, info = io.read_video(f_name, pts_unit="sec") - - assert_equal(data, lv) - assert info["video_fps"] == 5 - assert info == {"video_fps": 5} - - def test_read_timestamps_pts_unit_sec(self): - with temp_video(10, 300, 300, 5) as (f_name, data): - pts, _ = io.read_video_timestamps(f_name, pts_unit="sec") - - with av.open(f_name) as container: - stream = container.streams[0] - pts_step = int(round(float(1 / (stream.average_rate * stream.time_base)))) - num_frames = int(round(float(stream.average_rate * stream.time_base * stream.duration))) - expected_pts = [i * pts_step * stream.time_base for i in range(num_frames)] - - assert pts == expected_pts - - @pytest.mark.parametrize("start", range(5)) - @pytest.mark.parametrize("offset", range(1, 4)) - def test_read_partial_video_pts_unit_sec(self, start, offset): - with temp_video(10, 300, 300, 5, lossless=True) as (f_name, data): - pts, _ = io.read_video_timestamps(f_name, pts_unit="sec") - - lv, _, _ = io.read_video(f_name, pts[start], pts[start + offset - 1], pts_unit="sec") - s_data = data[start : (start + offset)] - assert len(lv) == offset - assert_equal(s_data, lv) - - with av.open(f_name) as container: - stream = container.streams[0] - lv, _, _ = io.read_video( - f_name, int(pts[4] * (1.0 / stream.time_base) + 1) * stream.time_base, pts[7], pts_unit="sec" - ) - if get_video_backend() == "pyav": - # for "video_reader" backend, we don't decode the closest early frame - # when the given start pts is not matching any frame pts - assert len(lv) == 4 - assert_equal(data[4:8], lv) - - def test_read_video_corrupted_file(self): - with tempfile.NamedTemporaryFile(suffix=".mp4") as f: - f.write(b"This is not an mpg4 file") - video, audio, info = io.read_video(f.name) - assert isinstance(video, torch.Tensor) - assert isinstance(audio, torch.Tensor) - assert video.numel() == 0 - assert audio.numel() == 0 - assert info == {} - - def test_read_video_timestamps_corrupted_file(self): - with tempfile.NamedTemporaryFile(suffix=".mp4") as f: - f.write(b"This is not an mpg4 file") - video_pts, video_fps = io.read_video_timestamps(f.name) - assert video_pts == [] - assert video_fps is None - - @pytest.mark.skip(reason="Temporarily disabled due to new pyav") - def test_read_video_partially_corrupted_file(self): - with temp_video(5, 4, 4, 5, lossless=True) as (f_name, data): - with open(f_name, "r+b") as f: - size = os.path.getsize(f_name) - bytes_to_overwrite = size // 10 - # seek to the middle of the file - f.seek(5 * bytes_to_overwrite) - # corrupt 10% of the file from the middle - f.write(b"\xff" * bytes_to_overwrite) - # this exercises the container.decode assertion check - video, audio, info = io.read_video(f.name, pts_unit="sec") - # check that size is not equal to 5, but 3 - # TODO fix this - if get_video_backend() == "pyav": - assert len(video) == 3 - else: - assert len(video) == 4 - # but the valid decoded content is still 
correct - assert_equal(video[:3], data[:3]) - # and the last few frames are wrong - with pytest.raises(AssertionError): - assert_equal(video, data) - - @pytest.mark.skipif(sys.platform == "win32", reason="temporarily disabled on Windows") - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_write_video_with_audio(self, device, tmpdir): - f_name = os.path.join(VIDEO_DIR, "R6llTwEh07w.mp4") - video_tensor, audio_tensor, info = io.read_video(f_name, pts_unit="sec") - - out_f_name = os.path.join(tmpdir, "testing.mp4") - io.video.write_video( - out_f_name, - video_tensor.to(device), - round(info["video_fps"]), - video_codec="libx264rgb", - options={"crf": "0"}, - audio_array=audio_tensor.to(device), - audio_fps=info["audio_fps"], - audio_codec="aac", - ) - - out_video_tensor, out_audio_tensor, out_info = io.read_video(out_f_name, pts_unit="sec") - - assert info["video_fps"] == out_info["video_fps"] - assert_equal(video_tensor, out_video_tensor) - - audio_stream = av.open(f_name).streams.audio[0] - out_audio_stream = av.open(out_f_name).streams.audio[0] - - assert info["audio_fps"] == out_info["audio_fps"] - assert audio_stream.rate == out_audio_stream.rate - assert pytest.approx(out_audio_stream.frames, rel=0.0, abs=1) == audio_stream.frames - assert audio_stream.frame_size == out_audio_stream.frame_size - - # TODO add tests for audio - - -if __name__ == "__main__": - pytest.main(__file__) diff --git a/test/test_io_opt.py b/test/test_io_opt.py deleted file mode 100644 index f4e3d305295..00000000000 --- a/test/test_io_opt.py +++ /dev/null @@ -1,13 +0,0 @@ -import unittest - -import test_io -from torchvision import set_video_backend # noqa: 401 - - -# Disabling the video backend switching temporarily -# set_video_backend('video_reader') - - -if __name__ == "__main__": - suite = unittest.TestLoader().loadTestsFromModule(test_io) - unittest.TextTestRunner(verbosity=1).run(suite) diff --git a/test/test_video_gpu_decoder.py b/test/test_video_gpu_decoder.py deleted file mode 100644 index aa6d0aee9e0..00000000000 --- a/test/test_video_gpu_decoder.py +++ /dev/null @@ -1,97 +0,0 @@ -import math -import os - -import pytest -import torch -import torchvision -from torchvision.io import _HAS_GPU_VIDEO_DECODER, VideoReader - -try: - import av -except ImportError: - av = None - -VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos") - - -@pytest.mark.skipif(_HAS_GPU_VIDEO_DECODER is False, reason="Didn't compile with support for gpu decoder") -class TestVideoGPUDecoder: - @pytest.mark.skipif(av is None, reason="PyAV unavailable") - @pytest.mark.parametrize( - "video_file", - [ - "RATRACE_wave_f_nm_np1_fr_goo_37.avi", - "TrumanShow_wave_f_nm_np1_fr_med_26.avi", - "v_SoccerJuggling_g23_c01.avi", - "v_SoccerJuggling_g24_c01.avi", - "R6llTwEh07w.mp4", - "SOX5yA1l24A.mp4", - "WUzgd7C1pWA.mp4", - ], - ) - def test_frame_reading(self, video_file): - torchvision.set_video_backend("cuda") - full_path = os.path.join(VIDEO_DIR, video_file) - decoder = VideoReader(full_path) - with av.open(full_path) as container: - for av_frame in container.decode(container.streams.video[0]): - av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray()) - vision_frames = next(decoder)["data"] - mean_delta = torch.mean(torch.abs(av_frames.float() - vision_frames.cpu().float())) - assert mean_delta < 0.75 - - @pytest.mark.skipif(av is None, reason="PyAV unavailable") - @pytest.mark.parametrize("keyframes", [True, False]) - @pytest.mark.parametrize( - "full_path, duration", - 
[ - (os.path.join(VIDEO_DIR, x), y) - for x, y in [ - ("v_SoccerJuggling_g23_c01.avi", 8.0), - ("v_SoccerJuggling_g24_c01.avi", 8.0), - ("R6llTwEh07w.mp4", 10.0), - ("SOX5yA1l24A.mp4", 11.0), - ("WUzgd7C1pWA.mp4", 11.0), - ] - ], - ) - def test_seek_reading(self, keyframes, full_path, duration): - torchvision.set_video_backend("cuda") - decoder = VideoReader(full_path) - time = duration / 2 - decoder.seek(time, keyframes_only=keyframes) - with av.open(full_path) as container: - container.seek(int(time * 1000000), any_frame=not keyframes, backward=False) - for av_frame in container.decode(container.streams.video[0]): - av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray()) - vision_frames = next(decoder)["data"] - mean_delta = torch.mean(torch.abs(av_frames.float() - vision_frames.cpu().float())) - assert mean_delta < 0.75 - - @pytest.mark.skipif(av is None, reason="PyAV unavailable") - @pytest.mark.parametrize( - "video_file", - [ - "RATRACE_wave_f_nm_np1_fr_goo_37.avi", - "TrumanShow_wave_f_nm_np1_fr_med_26.avi", - "v_SoccerJuggling_g23_c01.avi", - "v_SoccerJuggling_g24_c01.avi", - "R6llTwEh07w.mp4", - "SOX5yA1l24A.mp4", - "WUzgd7C1pWA.mp4", - ], - ) - def test_metadata(self, video_file): - torchvision.set_video_backend("cuda") - full_path = os.path.join(VIDEO_DIR, video_file) - decoder = VideoReader(full_path) - video_metadata = decoder.get_metadata()["video"] - with av.open(full_path) as container: - video = container.streams.video[0] - av_duration = float(video.duration * video.time_base) - assert math.isclose(video_metadata["duration"], av_duration, rel_tol=1e-2) - assert math.isclose(video_metadata["fps"], video.base_rate, rel_tol=1e-2) - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/test/test_video_reader.py b/test/test_video_reader.py deleted file mode 100644 index 10995424982..00000000000 --- a/test/test_video_reader.py +++ /dev/null @@ -1,1254 +0,0 @@ -import collections -import math -import os -from fractions import Fraction - -import numpy as np -import pytest -import torch -import torchvision.io as io -from common_utils import assert_equal -from numpy.random import randint -from pytest import approx -from torchvision import set_video_backend -from torchvision.io import _HAS_CPU_VIDEO_DECODER - - -try: - import av - - # Do a version test too - io.video._check_av_available() -except ImportError: - av = None - - -VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos") - -CheckerConfig = [ - "duration", - "video_fps", - "audio_sample_rate", - # We find for some videos (e.g. HMDB51 videos), the decoded audio frames and pts are - # slightly different between TorchVision decoder and PyAv decoder. 
So omit it during check - "check_aframes", - "check_aframe_pts", -] -GroundTruth = collections.namedtuple("GroundTruth", " ".join(CheckerConfig)) - -all_check_config = GroundTruth( - duration=0, - video_fps=0, - audio_sample_rate=0, - check_aframes=True, - check_aframe_pts=True, -) - -test_videos = { - "RATRACE_wave_f_nm_np1_fr_goo_37.avi": GroundTruth( - duration=2.0, - video_fps=30.0, - audio_sample_rate=None, - check_aframes=True, - check_aframe_pts=True, - ), - "SchoolRulesHowTheyHelpUs_wave_f_nm_np1_ba_med_0.avi": GroundTruth( - duration=2.0, - video_fps=30.0, - audio_sample_rate=None, - check_aframes=True, - check_aframe_pts=True, - ), - "TrumanShow_wave_f_nm_np1_fr_med_26.avi": GroundTruth( - duration=2.0, - video_fps=30.0, - audio_sample_rate=None, - check_aframes=True, - check_aframe_pts=True, - ), - "v_SoccerJuggling_g23_c01.avi": GroundTruth( - duration=8.0, - video_fps=29.97, - audio_sample_rate=None, - check_aframes=True, - check_aframe_pts=True, - ), - "v_SoccerJuggling_g24_c01.avi": GroundTruth( - duration=8.0, - video_fps=29.97, - audio_sample_rate=None, - check_aframes=True, - check_aframe_pts=True, - ), - "R6llTwEh07w.mp4": GroundTruth( - duration=10.0, - video_fps=30.0, - audio_sample_rate=44100, - # PyAv miss one audio frame at the beginning (pts=0) - check_aframes=False, - check_aframe_pts=False, - ), - "SOX5yA1l24A.mp4": GroundTruth( - duration=11.0, - video_fps=29.97, - audio_sample_rate=48000, - # PyAv miss one audio frame at the beginning (pts=0) - check_aframes=False, - check_aframe_pts=False, - ), - "WUzgd7C1pWA.mp4": GroundTruth( - duration=11.0, - video_fps=29.97, - audio_sample_rate=48000, - # PyAv miss one audio frame at the beginning (pts=0) - check_aframes=False, - check_aframe_pts=False, - ), -} - - -DecoderResult = collections.namedtuple("DecoderResult", "vframes vframe_pts vtimebase aframes aframe_pts atimebase") - -# av_seek_frame is imprecise so seek to a timestamp earlier by a margin -# The unit of margin is second -SEEK_FRAME_MARGIN = 0.25 - - -def _read_from_stream(container, start_pts, end_pts, stream, stream_name, buffer_size=4): - """ - Args: - container: pyav container - start_pts/end_pts: the starting/ending Presentation TimeStamp where - frames are read - stream: pyav stream - stream_name: a dictionary of streams. For example, {"video": 0} means - video stream at stream index 0 - buffer_size: pts of frames decoded by PyAv is not guaranteed to be in - ascending order. We need to decode more frames even when we meet end - pts - """ - # seeking in the stream is imprecise. 
Thus, seek to an earlier PTS by a margin - margin = 1 - seek_offset = max(start_pts - margin, 0) - - container.seek(seek_offset, any_frame=False, backward=True, stream=stream) - frames = {} - buffer_count = 0 - for frame in container.decode(**stream_name): - if frame.pts < start_pts: - continue - if frame.pts <= end_pts: - frames[frame.pts] = frame - else: - buffer_count += 1 - if buffer_count >= buffer_size: - break - result = [frames[pts] for pts in sorted(frames)] - - return result - - -def _get_timebase_by_av_module(full_path): - container = av.open(full_path) - video_time_base = container.streams.video[0].time_base - if container.streams.audio: - audio_time_base = container.streams.audio[0].time_base - else: - audio_time_base = None - return video_time_base, audio_time_base - - -def _fraction_to_tensor(fraction): - ret = torch.zeros([2], dtype=torch.int32) - ret[0] = fraction.numerator - ret[1] = fraction.denominator - return ret - - -def _decode_frames_by_av_module( - full_path, - video_start_pts=0, - video_end_pts=None, - audio_start_pts=0, - audio_end_pts=None, -): - """ - Use PyAv to decode video frames. This provides a reference for our decoder - to compare the decoding results. - Input arguments: - full_path: video file path - video_start_pts/video_end_pts: the starting/ending Presentation TimeStamp where - frames are read - """ - if video_end_pts is None: - video_end_pts = float("inf") - if audio_end_pts is None: - audio_end_pts = float("inf") - container = av.open(full_path) - - video_frames = [] - vtimebase = torch.zeros([0], dtype=torch.int32) - if container.streams.video: - video_frames = _read_from_stream( - container, - video_start_pts, - video_end_pts, - container.streams.video[0], - {"video": 0}, - ) - # container.streams.video[0].average_rate is not a reliable estimator of - # frame rate. It can be wrong for certain codec, such as VP80 - # So we do not return video fps here - vtimebase = _fraction_to_tensor(container.streams.video[0].time_base) - - audio_frames = [] - atimebase = torch.zeros([0], dtype=torch.int32) - if container.streams.audio: - audio_frames = _read_from_stream( - container, - audio_start_pts, - audio_end_pts, - container.streams.audio[0], - {"audio": 0}, - ) - atimebase = _fraction_to_tensor(container.streams.audio[0].time_base) - - container.close() - vframes = [frame.to_rgb().to_ndarray() for frame in video_frames] - vframes = torch.as_tensor(np.stack(vframes)) - - vframe_pts = torch.tensor([frame.pts for frame in video_frames], dtype=torch.int64) - - aframes = [frame.to_ndarray() for frame in audio_frames] - if aframes: - aframes = np.transpose(np.concatenate(aframes, axis=1)) - aframes = torch.as_tensor(aframes) - else: - aframes = torch.empty((1, 0), dtype=torch.float32) - - aframe_pts = torch.tensor([audio_frame.pts for audio_frame in audio_frames], dtype=torch.int64) - - return DecoderResult( - vframes=vframes, - vframe_pts=vframe_pts, - vtimebase=vtimebase, - aframes=aframes, - aframe_pts=aframe_pts, - atimebase=atimebase, - ) - - -def _pts_convert(pts, timebase_from, timebase_to, round_func=math.floor): - """convert pts between different time bases - Args: - pts: presentation timestamp, float - timebase_from: original timebase. Fraction - timebase_to: new timebase. Fraction - round_func: rounding function. 
- """ - new_pts = Fraction(pts, 1) * timebase_from / timebase_to - return int(round_func(new_pts)) - - -def _get_video_tensor(video_dir, video_file): - """open a video file, and represent the video data by a PT tensor""" - full_path = os.path.join(video_dir, video_file) - - assert os.path.exists(full_path), "File not found: %s" % full_path - - with open(full_path, "rb") as fp: - video_tensor = torch.frombuffer(fp.read(), dtype=torch.uint8) - - return full_path, video_tensor - - -@pytest.mark.skipif(av is None, reason="PyAV unavailable") -@pytest.mark.skipif(_HAS_CPU_VIDEO_DECODER is False, reason="Didn't compile with ffmpeg") -class TestVideoReader: - def check_separate_decoding_result(self, tv_result, config): - """check the decoding results from TorchVision decoder""" - ( - vframes, - vframe_pts, - vtimebase, - vfps, - vduration, - aframes, - aframe_pts, - atimebase, - asample_rate, - aduration, - ) = tv_result - - video_duration = vduration.item() * Fraction(vtimebase[0].item(), vtimebase[1].item()) - assert video_duration == approx(config.duration, abs=0.5) - - assert vfps.item() == approx(config.video_fps, abs=0.5) - - if asample_rate.numel() > 0: - assert asample_rate.item() == config.audio_sample_rate - audio_duration = aduration.item() * Fraction(atimebase[0].item(), atimebase[1].item()) - assert audio_duration == approx(config.duration, abs=0.5) - - # check if pts of video frames are sorted in ascending order - for i in range(len(vframe_pts) - 1): - assert vframe_pts[i] < vframe_pts[i + 1] - - if len(aframe_pts) > 1: - # check if pts of audio frames are sorted in ascending order - for i in range(len(aframe_pts) - 1): - assert aframe_pts[i] < aframe_pts[i + 1] - - def check_probe_result(self, result, config): - vtimebase, vfps, vduration, atimebase, asample_rate, aduration = result - video_duration = vduration.item() * Fraction(vtimebase[0].item(), vtimebase[1].item()) - assert video_duration == approx(config.duration, abs=0.5) - assert vfps.item() == approx(config.video_fps, abs=0.5) - if asample_rate.numel() > 0: - assert asample_rate.item() == config.audio_sample_rate - audio_duration = aduration.item() * Fraction(atimebase[0].item(), atimebase[1].item()) - assert audio_duration == approx(config.duration, abs=0.5) - - def check_meta_result(self, result, config): - assert result.video_duration == approx(config.duration, abs=0.5) - assert result.video_fps == approx(config.video_fps, abs=0.5) - if result.has_audio > 0: - assert result.audio_sample_rate == config.audio_sample_rate - assert result.audio_duration == approx(config.duration, abs=0.5) - - def compare_decoding_result(self, tv_result, ref_result, config=all_check_config): - """ - Compare decoding results from two sources. 
- Args: - tv_result: decoding results from TorchVision decoder - ref_result: reference decoding results which can be from either PyAv - decoder or TorchVision decoder with getPtsOnly = 1 - config: config of decoding results checker - """ - ( - vframes, - vframe_pts, - vtimebase, - _vfps, - _vduration, - aframes, - aframe_pts, - atimebase, - _asample_rate, - _aduration, - ) = tv_result - if isinstance(ref_result, list): - # the ref_result is from new video_reader decoder - ref_result = DecoderResult( - vframes=ref_result[0], - vframe_pts=ref_result[1], - vtimebase=ref_result[2], - aframes=ref_result[5], - aframe_pts=ref_result[6], - atimebase=ref_result[7], - ) - - if vframes.numel() > 0 and ref_result.vframes.numel() > 0: - mean_delta = torch.mean(torch.abs(vframes.float() - ref_result.vframes.float())) - assert mean_delta == approx(0.0, abs=8.0) - - mean_delta = torch.mean(torch.abs(vframe_pts.float() - ref_result.vframe_pts.float())) - assert mean_delta == approx(0.0, abs=1.0) - - assert_equal(vtimebase, ref_result.vtimebase) - - if config.check_aframes and aframes.numel() > 0 and ref_result.aframes.numel() > 0: - """Audio stream is available and audio frame is required to return - from decoder""" - assert_equal(aframes, ref_result.aframes) - - if config.check_aframe_pts and aframe_pts.numel() > 0 and ref_result.aframe_pts.numel() > 0: - """Audio stream is available""" - assert_equal(aframe_pts, ref_result.aframe_pts) - - assert_equal(atimebase, ref_result.atimebase) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_stress_test_read_video_from_file(self, test_video): - pytest.skip( - "This stress test will iteratively decode the same set of videos." - "It helps to detect memory leak but it takes lots of time to run." - "By default, it is disabled" - ) - num_iter = 10000 - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - for _i in range(num_iter): - full_path = os.path.join(VIDEO_DIR, test_video) - - # pass 1: decode all frames using new decoder - torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_read_video_from_file(self, test_video, config): - """ - Test the case when decoder starts with a video file to decode frames. 
- """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - # pass 1: decode all frames using new decoder - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - # pass 2: decode all frames using av - pyav_result = _decode_frames_by_av_module(full_path) - # check results from TorchVision decoder - self.check_separate_decoding_result(tv_result, config) - # compare decoding results - self.compare_decoding_result(tv_result, pyav_result, config) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - @pytest.mark.parametrize("read_video_stream,read_audio_stream", [(1, 0), (0, 1)]) - def test_read_video_from_file_read_single_stream_only( - self, test_video, config, read_video_stream, read_audio_stream - ): - """ - Test the case when decoder starts with a video file to decode frames, and - only reads video stream and ignores audio stream - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - # decode all frames using new decoder - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - read_video_stream, - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - read_audio_stream, - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - ( - vframes, - vframe_pts, - vtimebase, - vfps, - vduration, - aframes, - aframe_pts, - atimebase, - asample_rate, - aduration, - ) = tv_result - - assert (vframes.numel() > 0) is bool(read_video_stream) - assert (vframe_pts.numel() > 0) is bool(read_video_stream) - assert (vtimebase.numel() > 0) is bool(read_video_stream) - assert (vfps.numel() > 0) is bool(read_video_stream) - - expect_audio_data = read_audio_stream == 1 and config.audio_sample_rate is not None - assert (aframes.numel() > 0) is bool(expect_audio_data) - assert (aframe_pts.numel() > 0) is bool(expect_audio_data) - assert (atimebase.numel() > 0) is bool(expect_audio_data) - assert (asample_rate.numel() > 0) is bool(expect_audio_data) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_min_dimension(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - video min dimension between height and width is set. 
- """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 128, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2)) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_max_dimension(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - video min dimension between height and width is set. - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 85 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2)) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_both_min_max_dimension(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - video min dimension between height and width is set. - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 64, 85 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert min_dimension == min(tv_result[0].size(1), tv_result[0].size(2)) - assert max_dimension == max(tv_result[0].size(1), tv_result[0].size(2)) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_width(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - video width is set. 
- """ - # video related - width, height, min_dimension, max_dimension = 256, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert tv_result[0].size(2) == width - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_height(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - video height is set. - """ - # video related - width, height, min_dimension, max_dimension = 0, 224, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert tv_result[0].size(1) == height - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_file_rescale_width_and_height(self, test_video): - """ - Test the case when decoder starts with a video file to decode frames, and - both video height and width are set. 
- """ - # video related - width, height, min_dimension, max_dimension = 320, 240, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert tv_result[0].size(1) == height - assert tv_result[0].size(2) == width - - @pytest.mark.parametrize("test_video", test_videos.keys()) - @pytest.mark.parametrize("samples", [9600, 96000]) - def test_read_video_from_file_audio_resampling(self, test_video, samples): - """ - Test the case when decoder starts with a video file to decode frames, and - audio waveform are resampled - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - channels = 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path = os.path.join(VIDEO_DIR, test_video) - - tv_result = torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - ( - vframes, - vframe_pts, - vtimebase, - vfps, - vduration, - aframes, - aframe_pts, - atimebase, - asample_rate, - aduration, - ) = tv_result - if aframes.numel() > 0: - assert samples == asample_rate.item() - assert 1 == aframes.size(1) - # when audio stream is found - duration = float(aframe_pts[-1]) * float(atimebase[0]) / float(atimebase[1]) - assert aframes.size(0) == approx(int(duration * asample_rate.item()), abs=0.1 * asample_rate.item()) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_compare_read_video_from_memory_and_file(self, test_video, config): - """ - Test the case when video is already in memory, and decoder reads data in memory - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - - # pass 1: decode all frames using cpp decoder - tv_result_memory = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - self.check_separate_decoding_result(tv_result_memory, config) - # pass 2: decode all frames from file - tv_result_file = 
torch.ops.video_reader.read_video_from_file( - full_path, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - self.check_separate_decoding_result(tv_result_file, config) - # finally, compare results decoded from memory and file - self.compare_decoding_result(tv_result_memory, tv_result_file) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_read_video_from_memory(self, test_video, config): - """ - Test the case when video is already in memory, and decoder reads data in memory - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - - # pass 1: decode all frames using cpp decoder - tv_result = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - # pass 2: decode all frames using av - pyav_result = _decode_frames_by_av_module(full_path) - - self.check_separate_decoding_result(tv_result, config) - self.compare_decoding_result(tv_result, pyav_result, config) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_read_video_from_memory_get_pts_only(self, test_video, config): - """ - Test the case when video is already in memory, and decoder reads data in memory. 
- Compare frame pts between decoding for pts only and full decoding - for both pts and frame data - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - - # pass 1: decode all frames using cpp decoder - tv_result = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - assert abs(config.video_fps - tv_result[3].item()) < 0.01 - - # pass 2: decode all frames to get PTS only using cpp decoder - tv_result_pts_only = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 1, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - assert not tv_result_pts_only[0].numel() - assert not tv_result_pts_only[5].numel() - self.compare_decoding_result(tv_result, tv_result_pts_only) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - @pytest.mark.parametrize("num_frames", [4, 8, 16, 32, 64, 128]) - def test_read_video_in_range_from_memory(self, test_video, config, num_frames): - """ - Test the case when video is already in memory, and decoder reads data in memory. 
- In addition, decoder takes meaningful start- and end PTS as input, and decode - frames within that interval - """ - full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - # pass 1: decode all frames using new decoder - tv_result = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - ( - vframes, - vframe_pts, - vtimebase, - vfps, - vduration, - aframes, - aframe_pts, - atimebase, - asample_rate, - aduration, - ) = tv_result - assert abs(config.video_fps - vfps.item()) < 0.01 - - start_pts_ind_max = vframe_pts.size(0) - num_frames - if start_pts_ind_max <= 0: - return - # randomly pick start pts - start_pts_ind = randint(0, start_pts_ind_max) - end_pts_ind = start_pts_ind + num_frames - 1 - video_start_pts = vframe_pts[start_pts_ind] - video_end_pts = vframe_pts[end_pts_ind] - - video_timebase_num, video_timebase_den = vtimebase[0], vtimebase[1] - if len(atimebase) > 0: - # when audio stream is available - audio_timebase_num, audio_timebase_den = atimebase[0], atimebase[1] - audio_start_pts = _pts_convert( - video_start_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(audio_timebase_num.item(), audio_timebase_den.item()), - math.floor, - ) - audio_end_pts = _pts_convert( - video_end_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(audio_timebase_num.item(), audio_timebase_den.item()), - math.ceil, - ) - - # pass 2: decode frames in the randomly generated range - tv_result = torch.ops.video_reader.read_video_from_memory( - video_tensor, - SEEK_FRAME_MARGIN, - 0, # getPtsOnly - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - video_start_pts, - video_end_pts, - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - audio_start_pts, - audio_end_pts, - audio_timebase_num, - audio_timebase_den, - ) - - # pass 3: decode frames in range using PyAv - video_timebase_av, audio_timebase_av = _get_timebase_by_av_module(full_path) - - video_start_pts_av = _pts_convert( - video_start_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(video_timebase_av.numerator, video_timebase_av.denominator), - math.floor, - ) - video_end_pts_av = _pts_convert( - video_end_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(video_timebase_av.numerator, video_timebase_av.denominator), - math.ceil, - ) - if audio_timebase_av: - audio_start_pts = _pts_convert( - video_start_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator), - math.floor, - ) - audio_end_pts = _pts_convert( - video_end_pts.item(), - Fraction(video_timebase_num.item(), video_timebase_den.item()), - Fraction(audio_timebase_av.numerator, audio_timebase_av.denominator), - math.ceil, - ) - - pyav_result = 
_decode_frames_by_av_module( - full_path, - video_start_pts_av, - video_end_pts_av, - audio_start_pts, - audio_end_pts, - ) - - assert tv_result[0].size(0) == num_frames - if pyav_result.vframes.size(0) == num_frames: - # if PyAv decodes a different number of video frames, skip - # comparing the decoding results between Torchvision video reader - # and PyAv - self.compare_decoding_result(tv_result, pyav_result, config) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_probe_video_from_file(self, test_video, config): - """ - Test the case when decoder probes a video file - """ - full_path = os.path.join(VIDEO_DIR, test_video) - probe_result = torch.ops.video_reader.probe_video_from_file(full_path) - self.check_probe_result(probe_result, config) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_probe_video_from_memory(self, test_video, config): - """ - Test the case when decoder probes a video in memory - """ - _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - probe_result = torch.ops.video_reader.probe_video_from_memory(video_tensor) - self.check_probe_result(probe_result, config) - - @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_probe_video_from_memory_script(self, test_video, config): - scripted_fun = torch.jit.script(io._probe_video_from_memory) - assert scripted_fun is not None - - _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - probe_result = scripted_fun(video_tensor) - self.check_meta_result(probe_result, config) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_read_video_from_memory_scripted(self, test_video): - """ - Test the case when video is already in memory, and decoder reads data in memory - """ - # video related - width, height, min_dimension, max_dimension = 0, 0, 0, 0 - video_start_pts, video_end_pts = 0, -1 - video_timebase_num, video_timebase_den = 0, 1 - # audio related - samples, channels = 0, 0 - audio_start_pts, audio_end_pts = 0, -1 - audio_timebase_num, audio_timebase_den = 0, 1 - - scripted_fun = torch.jit.script(io._read_video_from_memory) - assert scripted_fun is not None - - _, video_tensor = _get_video_tensor(VIDEO_DIR, test_video) - - # decode all frames using cpp decoder - scripted_fun( - video_tensor, - SEEK_FRAME_MARGIN, - 1, # readVideoStream - width, - height, - min_dimension, - max_dimension, - [video_start_pts, video_end_pts], - video_timebase_num, - video_timebase_den, - 1, # readAudioStream - samples, - channels, - [audio_start_pts, audio_end_pts], - audio_timebase_num, - audio_timebase_den, - ) - # FUTURE: check value of video / audio frames - - def test_invalid_file(self): - set_video_backend("video_reader") - with pytest.raises(RuntimeError): - io.read_video("foo.mp4") - - set_video_backend("pyav") - with pytest.raises(RuntimeError): - io.read_video("foo.mp4") - - @pytest.mark.parametrize("test_video", test_videos.keys()) - @pytest.mark.parametrize("backend", ["video_reader", "pyav"]) - @pytest.mark.parametrize("start_offset", [0, 500]) - @pytest.mark.parametrize("end_offset", [3000, None]) - def test_audio_present_pts(self, test_video, backend, start_offset, end_offset): - """Test if audio frames are returned with pts unit.""" - full_path = os.path.join(VIDEO_DIR, test_video) - container = av.open(full_path) - if container.streams.audio: - set_video_backend(backend) - _, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="pts") - assert all([dimension > 0 for dimension in 
audio.shape[:2]]) - - @pytest.mark.parametrize("test_video", test_videos.keys()) - @pytest.mark.parametrize("backend", ["video_reader", "pyav"]) - @pytest.mark.parametrize("start_offset", [0, 0.1]) - @pytest.mark.parametrize("end_offset", [0.3, None]) - def test_audio_present_sec(self, test_video, backend, start_offset, end_offset): - """Test if audio frames are returned with sec unit.""" - full_path = os.path.join(VIDEO_DIR, test_video) - container = av.open(full_path) - if container.streams.audio: - set_video_backend(backend) - _, audio, _ = io.read_video(full_path, start_offset, end_offset, pts_unit="sec") - assert all([dimension > 0 for dimension in audio.shape[:2]]) - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/test/test_videoapi.py b/test/test_videoapi.py deleted file mode 100644 index aabcf6407f7..00000000000 --- a/test/test_videoapi.py +++ /dev/null @@ -1,312 +0,0 @@ -import collections -import os -import urllib - -import pytest -import torch -import torchvision -from pytest import approx -from torchvision.datasets.utils import download_url -from torchvision.io import _HAS_CPU_VIDEO_DECODER, VideoReader - - -# WARNING: these tests have been skipped forever on the CI because the video ops -# are never properly available. This is bad, but things have been in a terrible -# state for a long time already as we write this comment, and we'll hopefully be -# able to get rid of this all soon. - - -try: - import av - - # Do a version test too - torchvision.io.video._check_av_available() -except ImportError: - av = None - - -VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos") - -CheckerConfig = ["duration", "video_fps", "audio_sample_rate"] -GroundTruth = collections.namedtuple("GroundTruth", " ".join(CheckerConfig)) - - -def backends(): - backends_ = ["video_reader"] - if av is not None: - backends_.append("pyav") - return backends_ - - -def fate(name, path="."): - """Download and return a path to a sample from the FFmpeg test suite. 
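-    For example, ``fate("sub/MovText_capability_tester.mp4", VIDEO_DIR)`` (as used
-    in test_fate_suite below) downloads that sample into VIDEO_DIR and returns its
-    local path.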
-    See the `FFmpeg Automated Test Environment <https://www.ffmpeg.org/fate.html>`_
-    """
-
-    file_name = name.split("/")[1]
-    download_url("http://fate.ffmpeg.org/fate-suite/" + name, path, file_name)
-    return os.path.join(path, file_name)
-
-
-test_videos = {
-    "RATRACE_wave_f_nm_np1_fr_goo_37.avi": GroundTruth(duration=2.0, video_fps=30.0, audio_sample_rate=None),
-    "SchoolRulesHowTheyHelpUs_wave_f_nm_np1_ba_med_0.avi": GroundTruth(
-        duration=2.0, video_fps=30.0, audio_sample_rate=None
-    ),
-    "TrumanShow_wave_f_nm_np1_fr_med_26.avi": GroundTruth(duration=2.0, video_fps=30.0, audio_sample_rate=None),
-    "v_SoccerJuggling_g23_c01.avi": GroundTruth(duration=8.0, video_fps=29.97, audio_sample_rate=None),
-    "v_SoccerJuggling_g24_c01.avi": GroundTruth(duration=8.0, video_fps=29.97, audio_sample_rate=None),
-    "R6llTwEh07w.mp4": GroundTruth(duration=10.0, video_fps=30.0, audio_sample_rate=44100),
-    "SOX5yA1l24A.mp4": GroundTruth(duration=11.0, video_fps=29.97, audio_sample_rate=48000),
-    "WUzgd7C1pWA.mp4": GroundTruth(duration=11.0, video_fps=29.97, audio_sample_rate=48000),
-}
-
-
-@pytest.mark.skipif(_HAS_CPU_VIDEO_DECODER is False, reason="Didn't compile with ffmpeg")
-class TestVideoApi:
-    @pytest.mark.skipif(av is None, reason="PyAV unavailable")
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    @pytest.mark.parametrize("backend", backends())
-    def test_frame_reading(self, test_video, backend):
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-        with av.open(full_path) as av_reader:
-            if av_reader.streams.video:
-                av_frames, vr_frames = [], []
-                av_pts, vr_pts = [], []
-                # get av frames
-                for av_frame in av_reader.decode(av_reader.streams.video[0]):
-                    av_frames.append(torch.tensor(av_frame.to_rgb().to_ndarray()).permute(2, 0, 1))
-                    av_pts.append(av_frame.pts * av_frame.time_base)
-
-                # get vr frames
-                video_reader = VideoReader(full_path, "video")
-                for vr_frame in video_reader:
-                    vr_frames.append(vr_frame["data"])
-                    vr_pts.append(vr_frame["pts"])
-
-                # same number of frames
-                assert len(vr_frames) == len(av_frames)
-                assert len(vr_pts) == len(av_pts)
-
-                # compare the frames and pts
-                for i in range(len(vr_frames)):
-                    assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1)
-
-                    mean_delta = torch.mean(torch.abs(av_frames[i].float() - vr_frames[i].float()))
-                    # on average the difference is very small and caused
-                    # by decoding (around 1%)
-                    # TODO: assess empirically how to set this? atm it's 1%
-                    # averaged over all frames
-                    assert mean_delta.item() < 2.55
-
-                del vr_frames, av_frames, vr_pts, av_pts
-
-        # test audio reading compared to PyAV
-        with av.open(full_path) as av_reader:
-            if av_reader.streams.audio:
-                av_frames, vr_frames = [], []
-                av_pts, vr_pts = [], []
-                # get av frames
-                for av_frame in av_reader.decode(av_reader.streams.audio[0]):
-                    av_frames.append(torch.tensor(av_frame.to_ndarray()).permute(1, 0))
-                    av_pts.append(av_frame.pts * av_frame.time_base)
-                av_reader.close()
-
-                # get vr frames
-                video_reader = VideoReader(full_path, "audio")
-                for vr_frame in video_reader:
-                    vr_frames.append(vr_frame["data"])
-                    vr_pts.append(vr_frame["pts"])
-
-                # same number of frames
-                assert len(vr_frames) == len(av_frames)
-                assert len(vr_pts) == len(av_pts)
-
-                # compare the frames and pts
-                for i in range(len(vr_frames)):
-                    assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1)
-                    max_delta = torch.max(torch.abs(av_frames[i].float() - vr_frames[i].float()))
-                    # ensure the decoded signals never differ by more than 1e-3
-                    assert max_delta.item() < 0.001
-
-    @pytest.mark.parametrize("stream", ["video", "audio"])
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    @pytest.mark.parametrize("backend", backends())
-    def test_frame_reading_mem_vs_file(self, test_video, stream, backend):
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-
-        reader = VideoReader(full_path)
-        reader_md = reader.get_metadata()
-
-        if stream in reader_md:
-            # Test video reading from file vs from memory
-            vr_frames, vr_frames_mem = [], []
-            vr_pts, vr_pts_mem = [], []
-            # get vr frames
-            video_reader = VideoReader(full_path, stream)
-            for vr_frame in video_reader:
-                vr_frames.append(vr_frame["data"])
-                vr_pts.append(vr_frame["pts"])
-
-            # get vr frames again, this time reading from memory
-            f = open(full_path, "rb")
-            fbytes = f.read()
-            f.close()
-            video_reader_from_mem = VideoReader(fbytes, stream)
-
-            for vr_frame_from_mem in video_reader_from_mem:
-                vr_frames_mem.append(vr_frame_from_mem["data"])
-                vr_pts_mem.append(vr_frame_from_mem["pts"])
-
-            # same number of frames
-            assert len(vr_frames) == len(vr_frames_mem)
-            assert len(vr_pts) == len(vr_pts_mem)
-
-            # compare the frames and pts
-            for i in range(len(vr_frames)):
-                assert vr_pts[i] == vr_pts_mem[i]
-                mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float()))
-                # on average the difference is very small and caused
-                # by decoding (around 1%)
-                # TODO: assess empirically how to set this? atm it's 1%
-                # averaged over all frames
-                assert mean_delta.item() < 2.55
-
-            del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem
-        else:
-            del reader, reader_md
-
-    @pytest.mark.parametrize("test_video,config", test_videos.items())
-    @pytest.mark.parametrize("backend", backends())
-    def test_metadata(self, test_video, config, backend):
-        """
-        Test that the metadata returned via pyav corresponds to the one returned
-        by the new video decoder API
-        """
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-        reader = VideoReader(full_path, "video")
-        reader_md = reader.get_metadata()
-        assert config.video_fps == approx(reader_md["video"]["fps"][0], abs=0.0001)
-        assert config.duration == approx(reader_md["video"]["duration"][0], abs=0.5)
-
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    @pytest.mark.parametrize("backend", backends())
-    def test_seek_start(self, test_video, backend):
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-        video_reader = VideoReader(full_path, "video")
-        num_frames = 0
-        for _ in video_reader:
-            num_frames += 1
-
-        # now seek the container to 0 and do it again
-        # Seeking back to the start is often imprecise,
-        # and it may not land exactly at 0
-        video_reader.seek(0)
-        start_num_frames = 0
-        for _ in video_reader:
-            start_num_frames += 1
-
-        assert start_num_frames == num_frames
-
-        # now seek the container to < 0 to check for unexpected behaviour
-        video_reader.seek(-1)
-        start_num_frames = 0
-        for _ in video_reader:
-            start_num_frames += 1
-
-        assert start_num_frames == num_frames
-
-    @pytest.mark.parametrize("test_video", test_videos.keys())
-    @pytest.mark.parametrize("backend", ["video_reader"])
-    def test_accurateseek_middle(self, test_video, backend):
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-        stream = "video"
-        video_reader = VideoReader(full_path, stream)
-        md = video_reader.get_metadata()
-        duration = md[stream]["duration"][0]
-        if duration is not None:
-            num_frames = 0
-            for _ in video_reader:
-                num_frames += 1
-
-            video_reader.seek(duration / 2)
-            middle_num_frames = 0
-            for _ in video_reader:
-                middle_num_frames += 1
-
-            assert middle_num_frames < num_frames
-            assert middle_num_frames == approx(num_frames // 2, abs=1)
-
-            video_reader.seek(duration / 2)
-            frame = next(video_reader)
-            lb = duration / 2 - 1 / md[stream]["fps"][0]
-            ub = duration / 2 + 1 / md[stream]["fps"][0]
-            assert (lb <= frame["pts"]) and (ub >= frame["pts"])
-
-    def test_fate_suite(self):
-        # TODO: remove the try-except statement once the connectivity issues are resolved
-        try:
-            video_path = fate("sub/MovText_capability_tester.mp4", VIDEO_DIR)
-        except (urllib.error.URLError, ConnectionError) as error:
-            pytest.skip(f"Skipping due to connectivity issues: {error}")
-        vr = VideoReader(video_path)
-        metadata = vr.get_metadata()
-
-        assert metadata["subtitles"]["duration"] is not None
-        os.remove(video_path)
-
-    @pytest.mark.skipif(av is None, reason="PyAV unavailable")
-    @pytest.mark.parametrize("test_video,config", test_videos.items())
-    @pytest.mark.parametrize("backend", backends())
-    def test_keyframe_reading(self, test_video, config, backend):
-        torchvision.set_video_backend(backend)
-        full_path = os.path.join(VIDEO_DIR, test_video)
-
-        av_reader = av.open(full_path)
-        # reduce streams to only keyframes
-        av_stream = av_reader.streams.video[0]
-        av_stream.codec_context.skip_frame = "NONKEY"
-
-        av_keyframes = []
-        vr_keyframes = []
-        if
av_reader.streams.video: - - # get all keyframes using pyav. Then, seek randomly into video reader - # and assert that all the returned values are in AV_KEYFRAMES - - for av_frame in av_reader.decode(av_stream): - av_keyframes.append(float(av_frame.pts * av_frame.time_base)) - - if len(av_keyframes) > 1: - video_reader = VideoReader(full_path, "video") - for i in range(1, len(av_keyframes)): - seek_val = (av_keyframes[i] + av_keyframes[i - 1]) / 2 - data = next(video_reader.seek(seek_val, True)) - vr_keyframes.append(data["pts"]) - - data = next(video_reader.seek(config.duration, True)) - vr_keyframes.append(data["pts"]) - - assert len(av_keyframes) == len(vr_keyframes) - # NOTE: this video gets different keyframe with different - # loaders (0.333 pyav, 0.666 for us) - if test_video != "TrumanShow_wave_f_nm_np1_fr_med_26.avi": - for i in range(len(av_keyframes)): - assert av_keyframes[i] == approx(vr_keyframes[i], rel=0.001) - - def test_src(self): - with pytest.raises(ValueError, match="src cannot be empty"): - VideoReader(src="") - with pytest.raises(ValueError, match="src must be either string"): - VideoReader(src=2) - with pytest.raises(TypeError, match="unexpected keyword argument"): - VideoReader(path="path") - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/torchvision/__init__.py b/torchvision/__init__.py index 5d06156c25f..26f51f856d4 100644 --- a/torchvision/__init__.py +++ b/torchvision/__init__.py @@ -28,8 +28,6 @@ _image_backend = "PIL" -_video_backend = "pyav" - def set_image_backend(backend): """ @@ -53,48 +51,6 @@ def get_image_backend(): return _image_backend -def set_video_backend(backend): - """ - Specifies the package used to decode videos. - - Args: - backend (string): Name of the video backend. one of {'pyav', 'video_reader'}. - The :mod:`pyav` package uses the 3rd party PyAv library. It is a Pythonic - binding for the FFmpeg libraries. - The :mod:`video_reader` package includes a native C++ implementation on - top of FFMPEG libraries, and a python API of TorchScript custom operator. - It generally decodes faster than :mod:`pyav`, but is perhaps less robust. - - .. note:: - Building with FFMPEG is disabled by default in the latest `main`. If you want to use the 'video_reader' - backend, please compile torchvision from source. - """ - global _video_backend - if backend not in ["pyav", "video_reader", "cuda"]: - raise ValueError("Invalid video backend '%s'. Options are 'pyav', 'video_reader' and 'cuda'" % backend) - if backend == "video_reader" and not io._HAS_CPU_VIDEO_DECODER: - # TODO: better messages - message = "video_reader video backend is not available. Please compile torchvision from source and try again" - raise RuntimeError(message) - elif backend == "cuda" and not io._HAS_GPU_VIDEO_DECODER: - # TODO: better messages - message = "cuda video backend is not available." - raise RuntimeError(message) - else: - _video_backend = backend - - -def get_video_backend(): - """ - Returns the currently active video backend used to decode videos. - - Returns: - str: Name of the video backend. one of {'pyav', 'video_reader'}. 
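-
-    Example (illustrative)::
-
-        torchvision.set_video_backend("pyav")
-        assert torchvision.get_video_backend() == "pyav"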
- """ - - return _video_backend - - def _is_tracing(): return torch._C._get_tracing_state() diff --git a/torchvision/csrc/io/decoder/audio_sampler.cpp b/torchvision/csrc/io/decoder/audio_sampler.cpp deleted file mode 100644 index d46b93ddc69..00000000000 --- a/torchvision/csrc/io/decoder/audio_sampler.cpp +++ /dev/null @@ -1,251 +0,0 @@ -#include "audio_sampler.h" -#include -#include "util.h" - -#define AVRESAMPLE_MAX_CHANNELS 32 - -// www.ffmpeg.org/doxygen/1.1/doc_2examples_2resampling_audio_8c-example.html#a24 -namespace ffmpeg { - -namespace { -int preparePlanes( - const AudioFormat& fmt, - const uint8_t* buffer, - int numSamples, - uint8_t** planes) { - int result; - if ((result = av_samples_fill_arrays( - planes, - nullptr, // linesize is not needed - buffer, - fmt.channels, - numSamples, - (AVSampleFormat)fmt.format, - 1)) < 0) { - LOG(ERROR) << "av_samples_fill_arrays failed, err: " - << Util::generateErrorDesc(result) - << ", numSamples: " << numSamples << ", fmt: " << fmt.format; - } - return result; -} -} // namespace - -AudioSampler::AudioSampler(void* logCtx) : logCtx_(logCtx) {} - -AudioSampler::~AudioSampler() { - cleanUp(); -} - -void AudioSampler::shutdown() { - cleanUp(); -} - -bool AudioSampler::init(const SamplerParameters& params) { - cleanUp(); - - if (params.type != MediaType::TYPE_AUDIO) { - LOG(ERROR) << "Invalid media type, expected MediaType::TYPE_AUDIO"; - return false; - } - -#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) - SwrContext* swrContext_ = NULL; - AVChannelLayout channel_out; - AVChannelLayout channel_in; - av_channel_layout_default(&channel_out, params.out.audio.channels); - av_channel_layout_default(&channel_in, params.in.audio.channels); - int ret = swr_alloc_set_opts2( - &swrContext_, - &channel_out, - (AVSampleFormat)params.out.audio.format, - params.out.audio.samples, - &channel_in, - (AVSampleFormat)params.in.audio.format, - params.in.audio.samples, - 0, - logCtx_); -#else - swrContext_ = swr_alloc_set_opts( - nullptr, - av_get_default_channel_layout(params.out.audio.channels), - (AVSampleFormat)params.out.audio.format, - params.out.audio.samples, - av_get_default_channel_layout(params.in.audio.channels), - (AVSampleFormat)params.in.audio.format, - params.in.audio.samples, - 0, - logCtx_); -#endif - if (swrContext_ == nullptr) { - LOG(ERROR) << "Cannot allocate SwrContext"; - return false; - } - - int result; - if ((result = swr_init(swrContext_)) < 0) { - LOG(ERROR) << "swr_init failed, err: " << Util::generateErrorDesc(result) - << ", in -> format: " << params.in.audio.format - << ", channels: " << params.in.audio.channels - << ", samples: " << params.in.audio.samples - << ", out -> format: " << params.out.audio.format - << ", channels: " << params.out.audio.channels - << ", samples: " << params.out.audio.samples; - return false; - } - - // set formats - params_ = params; - return true; -} - -int AudioSampler::numOutputSamples(int inSamples) const { - return swr_get_out_samples(swrContext_, inSamples); -} - -int AudioSampler::sample( - const uint8_t* inPlanes[], - int inNumSamples, - ByteStorage* out, - int outNumSamples) { - int result; - int outBufferBytes = av_samples_get_buffer_size( - nullptr, - params_.out.audio.channels, - outNumSamples, - (AVSampleFormat)params_.out.audio.format, - 1); - - if (out) { - out->ensure(outBufferBytes); - - uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr}; - - if ((result = preparePlanes( - params_.out.audio, - out->writableTail(), - outNumSamples, - outPlanes)) < 0) { - return result; - } 
- - if ((result = swr_convert( - swrContext_, - &outPlanes[0], - outNumSamples, - inPlanes, - inNumSamples)) < 0) { - LOG(ERROR) << "swr_convert failed, err: " - << Util::generateErrorDesc(result); - return result; - } - - TORCH_CHECK_LE(result, outNumSamples); - - if (result) { - if ((result = av_samples_get_buffer_size( - nullptr, - params_.out.audio.channels, - result, - (AVSampleFormat)params_.out.audio.format, - 1)) >= 0) { - out->append(result); - } else { - LOG(ERROR) << "av_samples_get_buffer_size failed, err: " - << Util::generateErrorDesc(result); - } - } - } else { - // allocate a temporary buffer - auto* tmpBuffer = static_cast(av_malloc(outBufferBytes)); - if (!tmpBuffer) { - LOG(ERROR) << "av_alloc failed, for size: " << outBufferBytes; - return -1; - } - - uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr}; - - if ((result = preparePlanes( - params_.out.audio, tmpBuffer, outNumSamples, outPlanes)) < 0) { - av_free(tmpBuffer); - return result; - } - - if ((result = swr_convert( - swrContext_, - &outPlanes[0], - outNumSamples, - inPlanes, - inNumSamples)) < 0) { - LOG(ERROR) << "swr_convert failed, err: " - << Util::generateErrorDesc(result); - av_free(tmpBuffer); - return result; - } - - av_free(tmpBuffer); - - TORCH_CHECK_LE(result, outNumSamples); - - if (result) { - result = av_samples_get_buffer_size( - nullptr, - params_.out.audio.channels, - result, - (AVSampleFormat)params_.out.audio.format, - 1); - } - } - - return result; -} - -int AudioSampler::sample(AVFrame* frame, ByteStorage* out) { - const auto outNumSamples = numOutputSamples(frame ? frame->nb_samples : 0); - - if (!outNumSamples) { - return 0; - } - - return sample( - frame ? (const uint8_t**)&frame->data[0] : nullptr, - frame ? frame->nb_samples : 0, - out, - outNumSamples); -} - -int AudioSampler::sample(const ByteStorage* in, ByteStorage* out) { - const auto inSampleSize = - av_get_bytes_per_sample((AVSampleFormat)params_.in.audio.format); - - const auto inNumSamples = - !in ? 0 : in->length() / inSampleSize / params_.in.audio.channels; - - const auto outNumSamples = numOutputSamples(inNumSamples); - - if (!outNumSamples) { - return 0; - } - - uint8_t* inPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr}; - int result; - if (in && - (result = preparePlanes( - params_.in.audio, in->data(), inNumSamples, inPlanes)) < 0) { - return result; - } - - return sample( - in ? (const uint8_t**)inPlanes : nullptr, - inNumSamples, - out, - outNumSamples); -} - -void AudioSampler::cleanUp() { - if (swrContext_) { - swr_free(&swrContext_); - swrContext_ = nullptr; - } -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/audio_sampler.h b/torchvision/csrc/io/decoder/audio_sampler.h deleted file mode 100644 index e105bbe4de2..00000000000 --- a/torchvision/csrc/io/decoder/audio_sampler.h +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * Class transcode audio frames from one format into another - */ - -class AudioSampler : public MediaSampler { - public: - explicit AudioSampler(void* logCtx); - ~AudioSampler() override; - - // MediaSampler overrides - bool init(const SamplerParameters& params) override; - int sample(const ByteStorage* in, ByteStorage* out) override; - void shutdown() override; - - int sample(AVFrame* frame, ByteStorage* out); - - private: - // close resources - void cleanUp(); - // helper functions for rescaling, cropping, etc. 
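-  // e.g. (hypothetical figures): converting 1024 input samples from 48 kHz to
-  // 16 kHz gives numOutputSamples(1024) of roughly 1024 * 16000 / 48000 ≈ 341,
-  // plus whatever the resampler has buffered internally.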
- int numOutputSamples(int inSamples) const; - int sample( - const uint8_t* inPlanes[], - int inNumSamples, - ByteStorage* out, - int outNumSamples); - - private: - SwrContext* swrContext_{nullptr}; - void* logCtx_{nullptr}; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/audio_stream.cpp b/torchvision/csrc/io/decoder/audio_stream.cpp deleted file mode 100644 index c3a003434b8..00000000000 --- a/torchvision/csrc/io/decoder/audio_stream.cpp +++ /dev/null @@ -1,119 +0,0 @@ -#include "audio_stream.h" -#include -#include "util.h" - -namespace ffmpeg { - -namespace { -static int get_nb_channels(const AVFrame* frame, const AVCodecContext* codec) { -#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) - return frame ? frame->ch_layout.nb_channels : codec->ch_layout.nb_channels; -#else - return frame ? frame->channels : codec->channels; -#endif -} - -bool operator==(const AudioFormat& x, const AVFrame& y) { - return x.samples == static_cast(y.sample_rate) && - x.channels == static_cast(get_nb_channels(&y, nullptr)) && - x.format == y.format; -} - -bool operator==(const AudioFormat& x, const AVCodecContext& y) { - return x.samples == static_cast(y.sample_rate) && - x.channels == static_cast(get_nb_channels(nullptr, &y)) && - x.format == y.sample_fmt; -} - -AudioFormat& toAudioFormat(AudioFormat& x, const AVFrame& y) { - x.samples = y.sample_rate; - x.channels = get_nb_channels(&y, nullptr); - x.format = y.format; - return x; -} - -AudioFormat& toAudioFormat(AudioFormat& x, const AVCodecContext& y) { - x.samples = y.sample_rate; - x.channels = get_nb_channels(nullptr, &y); - x.format = y.sample_fmt; - return x; -} -} // namespace - -AudioStream::AudioStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const AudioFormat& format) - : Stream( - inputCtx, - MediaFormat::makeMediaFormat(format, index), - convertPtsToWallTime, - 0) {} - -AudioStream::~AudioStream() { - if (sampler_) { - sampler_->shutdown(); - sampler_.reset(); - } -} - -int AudioStream::initFormat() { - // set output format - if (format_.format.audio.samples == 0) { - format_.format.audio.samples = codecCtx_->sample_rate; - } -#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) - if (format_.format.audio.channels == 0) { - format_.format.audio.channels = codecCtx_->ch_layout.nb_channels; - } -#else - if (format_.format.audio.channels == 0) { - format_.format.audio.channels = codecCtx_->channels; - } -#endif - if (format_.format.audio.format == AV_SAMPLE_FMT_NONE) { - format_.format.audio.format = codecCtx_->sample_fmt; - } - - return format_.format.audio.samples != 0 && - format_.format.audio.channels != 0 && - format_.format.audio.format != AV_SAMPLE_FMT_NONE - ? 0 - : -1; -} - -// copies audio sample bytes via swr_convert call in audio_sampler.cpp -int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) { - if (!sampler_) { - sampler_ = std::make_unique(codecCtx_); - } - // check if input format gets changed - if (flush ? !(sampler_->getInputFormat().audio == *codecCtx_) - : !(sampler_->getInputFormat().audio == *frame_)) { - // - reinit sampler - SamplerParameters params; - params.type = format_.type; - params.out = format_.format; - params.in = FormatUnion(); - flush ? 
toAudioFormat(params.in.audio, *codecCtx_) - : toAudioFormat(params.in.audio, *frame_); - if (!sampler_->init(params)) { - return -1; - } - - VLOG(1) << "Set input audio sampler format" - << ", samples: " << params.in.audio.samples - << ", channels: " << params.in.audio.channels - << ", format: " << params.in.audio.format - << " : output audio sampler format" - << ", samples: " << format_.format.audio.samples - << ", channels: " << format_.format.audio.channels - << ", format: " << format_.format.audio.format; - } - // calls to a sampler that converts the audio samples and copies them to the - // out buffer via ffmpeg::swr_convert - return sampler_->sample(flush ? nullptr : frame_, out); -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/audio_stream.h b/torchvision/csrc/io/decoder/audio_stream.h deleted file mode 100644 index 2d6457b68f5..00000000000 --- a/torchvision/csrc/io/decoder/audio_stream.h +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once - -#include "audio_sampler.h" -#include "stream.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode one audio stream. - */ - -class AudioStream : public Stream { - public: - AudioStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const AudioFormat& format); - ~AudioStream() override; - - private: - int initFormat() override; - int copyFrameBytes(ByteStorage* out, bool flush) override; - - private: - std::unique_ptr sampler_; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/cc_stream.cpp b/torchvision/csrc/io/decoder/cc_stream.cpp deleted file mode 100644 index 89174c396fd..00000000000 --- a/torchvision/csrc/io/decoder/cc_stream.cpp +++ /dev/null @@ -1,24 +0,0 @@ -#include "cc_stream.h" - -namespace ffmpeg { - -CCStream::CCStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const SubtitleFormat& format) - : SubtitleStream(inputCtx, index, convertPtsToWallTime, format) { - format_.type = TYPE_CC; -} - -AVCodec* CCStream::findCodec(AVCodecParameters* params) { - if (params->codec_id == AV_CODEC_ID_BIN_DATA && - params->codec_type == AVMEDIA_TYPE_DATA) { - // obtain subtitles codec - params->codec_id = AV_CODEC_ID_MOV_TEXT; - params->codec_type = AVMEDIA_TYPE_SUBTITLE; - } - return Stream::findCodec(params); -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/cc_stream.h b/torchvision/csrc/io/decoder/cc_stream.h deleted file mode 100644 index 3a1d169f014..00000000000 --- a/torchvision/csrc/io/decoder/cc_stream.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once - -#include "subtitle_stream.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode one closed captions stream. 
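- * Closed captions are typically muxed as AV_CODEC_ID_BIN_DATA data packets;
- * findCodec() below remaps them to the MOV_TEXT subtitle decoder
- * (see cc_stream.cpp above).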
- */ -class CCStream : public SubtitleStream { - public: - CCStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const SubtitleFormat& format); - - private: - AVCodec* findCodec(AVCodecParameters* params) override; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp deleted file mode 100644 index cfe762bbc6e..00000000000 --- a/torchvision/csrc/io/decoder/decoder.cpp +++ /dev/null @@ -1,763 +0,0 @@ -#include "decoder.h" -#include -#include -#include -#include -#include -#include "audio_stream.h" -#include "cc_stream.h" -#include "subtitle_stream.h" -#include "util.h" -#include "video_stream.h" - -namespace ffmpeg { - -namespace { - -constexpr size_t kIoBufferSize = 96 * 1024; -constexpr size_t kIoPaddingSize = AV_INPUT_BUFFER_PADDING_SIZE; -constexpr size_t kLogBufferSize = 1024; - -bool mapFfmpegType(AVMediaType media, MediaType* type) { - switch (media) { - case AVMEDIA_TYPE_AUDIO: - *type = TYPE_AUDIO; - return true; - case AVMEDIA_TYPE_VIDEO: - *type = TYPE_VIDEO; - return true; - case AVMEDIA_TYPE_SUBTITLE: - *type = TYPE_SUBTITLE; - return true; - case AVMEDIA_TYPE_DATA: - *type = TYPE_CC; - return true; - default: - return false; - } -} - -std::unique_ptr createStream( - MediaType type, - AVFormatContext* ctx, - int idx, - bool convertPtsToWallTime, - const FormatUnion& format, - int64_t loggingUuid) { - switch (type) { - case TYPE_AUDIO: - return std::make_unique( - ctx, idx, convertPtsToWallTime, format.audio); - case TYPE_VIDEO: - return std::make_unique( - // negative loggingUuid indicates video streams. - ctx, - idx, - convertPtsToWallTime, - format.video, - -loggingUuid); - case TYPE_SUBTITLE: - return std::make_unique( - ctx, idx, convertPtsToWallTime, format.subtitle); - case TYPE_CC: - return std::make_unique( - ctx, idx, convertPtsToWallTime, format.subtitle); - default: - return nullptr; - } -} - -} // Namespace - -/* static */ -void Decoder::logFunction(void* avcl, int level, const char* cfmt, va_list vl) { - if (!avcl) { - // Nothing can be done here - return; - } - - AVClass* avclass = *reinterpret_cast(avcl); - if (!avclass) { - // Nothing can be done here - return; - } - Decoder* decoder = nullptr; - if (strcmp(avclass->class_name, "AVFormatContext") == 0) { - AVFormatContext* context = reinterpret_cast(avcl); - if (context) { - decoder = reinterpret_cast(context->opaque); - } - } else if (strcmp(avclass->class_name, "AVCodecContext") == 0) { - AVCodecContext* context = reinterpret_cast(avcl); - if (context) { - decoder = reinterpret_cast(context->opaque); - } - } else if (strcmp(avclass->class_name, "AVIOContext") == 0) { - AVIOContext* context = reinterpret_cast(avcl); - // only if opaque was assigned to Decoder pointer - if (context && context->read_packet == Decoder::readFunction) { - decoder = reinterpret_cast(context->opaque); - } - } else if (strcmp(avclass->class_name, "SWResampler") == 0) { - // expect AVCodecContext as parent - if (avclass->parent_log_context_offset) { - AVClass** parent = - *(AVClass***)(((uint8_t*)avcl) + avclass->parent_log_context_offset); - AVCodecContext* context = reinterpret_cast(parent); - if (context) { - decoder = reinterpret_cast(context->opaque); - } - } - } else if (strcmp(avclass->class_name, "SWScaler") == 0) { - // cannot find a way to pass context pointer through SwsContext struct - } else { - VLOG(2) << "Unknown context class: " << avclass->class_name; - } - - if (decoder != nullptr && decoder->enableLogLevel(level)) { - 
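-    // At this point avcl has been traced back to its owning Decoder through
-    // the opaque pointers installed in Decoder::init().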
char buf[kLogBufferSize] = {0}; - // Format the line - int* prefix = decoder->getPrintPrefix(); - *prefix = 1; - av_log_format_line(avcl, level, cfmt, vl, buf, sizeof(buf) - 1, prefix); - // pass message to the decoder instance - std::string msg(buf); - decoder->logCallback(level, msg); - } -} - -bool Decoder::enableLogLevel(int level) const { - return ssize_t(level) <= params_.logLevel; -} - -void Decoder::logCallback(int level, const std::string& message) { - LOG(INFO) << "Msg, uuid=" << params_.loggingUuid << " level=" << level - << " msg=" << message; -} - -/* static */ -int Decoder::shutdownFunction(void* ctx) { - Decoder* decoder = (Decoder*)ctx; - if (decoder == nullptr) { - return 1; - } - return decoder->shutdownCallback(); -} - -int Decoder::shutdownCallback() { - return interrupted_ ? 1 : 0; -} - -/* static */ -int Decoder::readFunction(void* opaque, uint8_t* buf, int size) { - Decoder* decoder = reinterpret_cast(opaque); - if (decoder == nullptr) { - return 0; - } - return decoder->readCallback(buf, size); -} - -/* static */ -int64_t Decoder::seekFunction(void* opaque, int64_t offset, int whence) { - Decoder* decoder = reinterpret_cast(opaque); - if (decoder == nullptr) { - return -1; - } - return decoder->seekCallback(offset, whence); -} - -int Decoder::readCallback(uint8_t* buf, int size) { - return seekableBuffer_.read(buf, size, params_.timeoutMs); -} - -int64_t Decoder::seekCallback(int64_t offset, int whence) { - return seekableBuffer_.seek(offset, whence, params_.timeoutMs); -} - -/* static */ -void Decoder::initOnce() { - static std::once_flag flagInit; - std::call_once(flagInit, []() { -#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0 - av_register_all(); - avcodec_register_all(); -#endif - avformat_network_init(); - av_log_set_callback(Decoder::logFunction); - av_log_set_level(AV_LOG_ERROR); - VLOG(1) << "Registered ffmpeg libs"; - }); -} - -Decoder::Decoder() { - initOnce(); -} - -Decoder::~Decoder() { - cleanUp(); -} - -// Initialise the format context that holds information about the container and -// fill it with minimal information about the format (codecs are not opened -// here). Function reads in information about the streams from the container -// into inputCtx and then passes it to decoder::openStreams. Finally, if seek is -// specified within the decoder parameters, it seeks into the correct frame -// (note, the seek defined here is "precise" seek). -bool Decoder::init( - const DecoderParameters& params, - DecoderInCallback&& in, - std::vector* metadata) { - cleanUp(); - - if ((params.uri.empty() || in) && (!params.uri.empty() || !in)) { - LOG(ERROR) - << "uuid=" << params_.loggingUuid - << " either external URI gets provided or explicit input callback"; - return false; - } - - // set callback and params - params_ = params; - - if (!(inputCtx_ = avformat_alloc_context())) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " cannot allocate format context"; - return false; - } - - AVInputFormat* fmt = nullptr; - int result = 0; - if (in) { - ImageType type = ImageType::UNKNOWN; - if ((result = seekableBuffer_.init( - std::forward(in), - params_.timeoutMs, - params_.maxSeekableBytes, - params_.isImage ? 
&type : nullptr)) < 0) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " can't initiate seekable buffer"; - cleanUp(); - return false; - } - - if (params_.isImage) { - const char* fmtName = "image2"; - switch (type) { - case ImageType::JPEG: - fmtName = "jpeg_pipe"; - break; - case ImageType::PNG: - fmtName = "png_pipe"; - break; - case ImageType::TIFF: - fmtName = "tiff_pipe"; - break; - default: - break; - } - - fmt = (AVInputFormat*)av_find_input_format(fmtName); - } - - const size_t avioCtxBufferSize = kIoBufferSize; - uint8_t* avioCtxBuffer = - (uint8_t*)av_malloc(avioCtxBufferSize + kIoPaddingSize); - if (!avioCtxBuffer) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " av_malloc cannot allocate " << avioCtxBufferSize - << " bytes"; - cleanUp(); - return false; - } - - if (!(avioCtx_ = avio_alloc_context( - avioCtxBuffer, - avioCtxBufferSize, - 0, - reinterpret_cast(this), - &Decoder::readFunction, - nullptr, - result == 1 ? &Decoder::seekFunction : nullptr))) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " avio_alloc_context failed"; - av_free(avioCtxBuffer); - cleanUp(); - return false; - } - - avioCtx_->max_packet_size = params.maxEncodedBufferSize; - - inputCtx_->pb = avioCtx_; - inputCtx_->flags |= AVFMT_FLAG_CUSTOM_IO; - } - - inputCtx_->opaque = reinterpret_cast(this); - inputCtx_->interrupt_callback.callback = Decoder::shutdownFunction; - inputCtx_->interrupt_callback.opaque = reinterpret_cast(this); - - // add network timeout - inputCtx_->flags |= AVFMT_FLAG_NONBLOCK; - - AVDictionary* options = nullptr; - if (params_.listen) { - av_dict_set_int(&options, "listen", 1, 0); - } - if (params_.timeoutMs > 0) { - av_dict_set_int(&options, "analyzeduration", params_.timeoutMs * 1000, 0); - av_dict_set_int(&options, "stimeout", params_.timeoutMs * 1000, 0); - av_dict_set_int(&options, "rw_timeout", params_.timeoutMs * 1000, 0); - if (!params_.tlsCertFile.empty()) { - av_dict_set(&options, "cert_file", params_.tlsCertFile.data(), 0); - } - if (!params_.tlsKeyFile.empty()) { - av_dict_set(&options, "key_file", params_.tlsKeyFile.data(), 0); - } - } - - av_dict_set_int(&options, "probesize", params_.probeSize, 0); - - interrupted_ = false; - - // ffmpeg avformat_open_input call can hang if media source doesn't respond - // set a guard for handle such situations, if requested - std::promise p; - std::future f = p.get_future(); - std::unique_ptr guard; - if (params_.preventStaleness) { - guard = std::make_unique([&f, this]() { - auto timeout = std::chrono::milliseconds(params_.timeoutMs); - if (std::future_status::timeout == f.wait_for(timeout)) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " cannot open stream within " << params_.timeoutMs - << " ms"; - interrupted_ = true; - } - }); - } - - if (fmt) { - result = avformat_open_input(&inputCtx_, nullptr, fmt, &options); - } else { - result = - avformat_open_input(&inputCtx_, params_.uri.c_str(), nullptr, &options); - } - - av_dict_free(&options); - - if (guard) { - p.set_value(true); - guard->join(); - guard.reset(); - } - - if (result < 0 || interrupted_) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " avformat_open_input failed, error=" - << Util::generateErrorDesc(result); - cleanUp(); - return false; - } - - result = avformat_find_stream_info(inputCtx_, nullptr); - - if (result < 0) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " avformat_find_stream_info failed, error=" - << Util::generateErrorDesc(result); - cleanUp(); - return false; - } - - if (!openStreams(metadata)) { - LOG(ERROR) << 
"uuid=" << params_.loggingUuid << " cannot activate streams"; - cleanUp(); - return false; - } - // SyncDecoder inherits Decoder which would override onInit. - onInit(); - - if (params.startOffset != 0) { - auto offset = params.startOffset <= params.seekAccuracy - ? 0 - : params.startOffset - params.seekAccuracy; - - av_seek_frame(inputCtx_, -1, offset, AVSEEK_FLAG_BACKWARD); - } - - for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) { - if ( -#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0 - inputCtx_->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO -#else // FFMPEG 4.0+ - inputCtx_->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO -#endif - && inputCtx_->streams[i]->duration > 0) { - // There is at least two 1/r_frame_rates from the frame before the last - // one until the video duration, let's prefer to set duration after the - // frame before the last one, but as early as possible - double correction = 2 * inputCtx_->streams[i]->r_frame_rate.den / - (double)inputCtx_->streams[i]->r_frame_rate.num - - 1 / (double)AV_TIME_BASE; - videoDurationMs_ = 1000 * inputCtx_->streams[i]->duration * - inputCtx_->streams[i]->time_base.num / - (double)inputCtx_->streams[i]->time_base.den - - 1000 * correction; - break; - } - } - - VLOG(1) << "Decoder initialized, log level: " << params_.logLevel; - VLOG(1) << "Video duration: " << videoDurationMs_; - return true; -} - -// open appropriate CODEC for every type of stream and move it to the class -// variable `streams_` and make sure it is in range for decoding -bool Decoder::openStreams(std::vector* metadata) { - for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) { - // - find the corespondent format at params_.formats set - MediaFormat format; -#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0 - const auto media = inputCtx_->streams[i]->codec->codec_type; -#else // FFMPEG 4.0+ - const auto media = inputCtx_->streams[i]->codecpar->codec_type; -#endif - if (!mapFfmpegType(media, &format.type)) { - VLOG(1) << "Stream media: " << media << " at index " << i - << " gets ignored, unknown type"; - - continue; // unsupported type - } - - // check format - auto it = params_.formats.find(format); - if (it == params_.formats.end()) { - VLOG(1) << "Stream type: " << format.type << " at index: " << i - << " gets ignored, caller is not interested"; - continue; // clients don't care about this media format - } - - // do we have stream of this type? - auto stream = findByType(format); - - // should we process this stream? - - if (it->stream == -2 || // all streams of this type are welcome - (!stream && (it->stream == -1 || it->stream == i))) { // new stream - VLOG(1) << "Stream type: " << format.type << " found, at index: " << i; - auto stream_2 = createStream( - format.type, - inputCtx_, - i, - params_.convertPtsToWallTime, - it->format, - params_.loggingUuid); - CHECK(stream_2); - if (stream_2->openCodec(metadata, params_.numThreads) < 0) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " open codec failed, stream_idx=" << i; - return false; - } - streams_.emplace(i, std::move(stream_2)); - inRange_.set(i, true); - } - } - - return true; -} - -void Decoder::shutdown() { - cleanUp(); -} - -void Decoder::interrupt() { - interrupted_ = true; -} - -void Decoder::cleanUp() { - if (!interrupted_) { - interrupted_ = true; - } - - if (inputCtx_) { - for (auto& stream : streams_) { - // Drain stream buffers. 
- DecoderOutputMessage msg; - while (msg.payload = nullptr, stream.second->flush(&msg, true) > 0) { - } - stream.second.reset(); - } - streams_.clear(); - avformat_close_input(&inputCtx_); - } - if (avioCtx_) { - av_freep(&avioCtx_->buffer); - av_freep(&avioCtx_); - } - - // reset callback - seekableBuffer_.shutdown(); -} - -// function does actual work, derived class calls it in working thread -// periodically. On success method returns 0, ENODATA on EOF, ETIMEDOUT if -// no frames got decoded in the specified timeout time, AVERROR_BUFFER_TOO_SMALL -// when unable to allocate packet and error on unrecoverable error -int Decoder::getFrame(size_t workingTimeInMs) { - if (inRange_.none()) { - return ENODATA; - } - // decode frames until cache is full and leave thread - // once decode() method gets called and grab some bytes - // run this method again - // init package - // update 03/22: moving memory management to ffmpeg - AVPacket* avPacket; - avPacket = av_packet_alloc(); - if (avPacket == nullptr) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " decoder as not able to allocate the packet."; - return AVERROR_BUFFER_TOO_SMALL; - } - avPacket->data = nullptr; - avPacket->size = 0; - - auto end = std::chrono::steady_clock::now() + - std::chrono::milliseconds(workingTimeInMs); - // return true if elapsed time less than timeout - auto watcher = [end]() -> bool { - return std::chrono::steady_clock::now() <= end; - }; - - int result = 0; - size_t decodingErrors = 0; - bool decodedFrame = false; - while (!interrupted_ && inRange_.any() && !decodedFrame) { - if (watcher() == false) { - LOG(ERROR) << "uuid=" << params_.loggingUuid << " hit ETIMEDOUT"; - result = ETIMEDOUT; - break; - } - result = av_read_frame(inputCtx_, avPacket); - if (result == AVERROR(EAGAIN)) { - VLOG(4) << "Decoder is busy..."; - std::this_thread::yield(); - result = 0; // reset error, EAGAIN is not an error at all - // reset the packet to default settings - av_packet_unref(avPacket); - continue; - } else if (result == AVERROR_EOF) { - flushStreams(); - VLOG(1) << "End of stream"; - result = ENODATA; - break; - } else if ( - result == AVERROR(EPERM) && params_.skipOperationNotPermittedPackets) { - // reset error, lets skip packets with EPERM - result = 0; - // reset the packet to default settings - av_packet_unref(avPacket); - continue; - } else if (result < 0) { - flushStreams(); - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " error detected: " << Util::generateErrorDesc(result); - break; - } - - // get stream; if stream cannot be found reset the packet to - // default settings - auto stream = findByIndex(avPacket->stream_index); - if (stream == nullptr || !inRange_.test(stream->getIndex())) { - av_packet_unref(avPacket); - continue; - } - - size_t numConsecutiveNoBytes = 0; - // it can be only partial decoding of the package bytes - do { - // decode package - bool gotFrame = false; - bool hasMsg = false; - // packet either got consumed completely or not at all - if ((result = processPacket( - stream, avPacket, &gotFrame, &hasMsg, params_.fastSeek)) < 0) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " processPacket failed with code: " << result; - break; - } - - if (!gotFrame && params_.maxProcessNoBytes != 0 && - ++numConsecutiveNoBytes > params_.maxProcessNoBytes) { - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " exceeding max amount of consecutive no bytes"; - break; - } - if (result > 0) { - numConsecutiveNoBytes = 0; - } - - decodedFrame |= hasMsg; - } while (result == 0); - - // post loop 
check - if (result < 0) { - if (params_.maxPackageErrors != 0 && // check errors - ++decodingErrors >= params_.maxPackageErrors) { // reached the limit - LOG(ERROR) << "uuid=" << params_.loggingUuid - << " exceeding max amount of consecutive package errors"; - break; - } - } else { - decodingErrors = 0; // reset on success - } - - result = 0; - - av_packet_unref(avPacket); - - if (params_.uniformSampling > 1) { - if (doSeek_) { - double duration = - videoDurationMs_ > 0 ? videoDurationMs_ : params_.expectedDuration; - double step = - (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1)); - avformat_seek_file( - inputCtx_, - -1, - static_cast(step * kFramesDecoded_) + 1, - static_cast(step * (kFramesDecoded_ + 1)), - static_cast(step * (kFramesDecoded_ + 1)), - 0); - ++kFramesDecoded_; - doSeek_ = false; - } - } - } - - av_packet_free(&avPacket); - VLOG(2) << "Interrupted loop" << ", interrupted_ " << interrupted_ - << ", inRange_.any() " << inRange_.any() << ", decodedFrame " - << decodedFrame << ", result " << result; - - // loop can be terminated, either by: - // 1. explicitly interrupted - // 3. unrecoverable error or ENODATA (end of stream) or ETIMEDOUT (timeout) - // 4. decoded frames pts are out of the specified range - // 5. success decoded frame - if (interrupted_) { - return EINTR; - } - if (result != 0) { - return result; - } - if (inRange_.none()) { - return ENODATA; - } - return 0; -} - -// find stream by stream index -Stream* Decoder::findByIndex(int streamIndex) const { - auto it = streams_.find(streamIndex); - return it != streams_.end() ? it->second.get() : nullptr; -} - -// find stream by type; note finds only the first stream of a given type -Stream* Decoder::findByType(const MediaFormat& format) const { - for (auto& stream : streams_) { - if (stream.second->getMediaFormat().type == format.type) { - return stream.second.get(); - } - } - return nullptr; -} - -// given the stream and packet, decode the frame buffers into the -// DecoderOutputMessage data structure via stream::decodePacket function. -int Decoder::processPacket( - Stream* stream, - AVPacket* packet, - bool* gotFrame, - bool* hasMsg, - bool fastSeek) { - // decode package - int result; - DecoderOutputMessage msg; - msg.payload = params_.headerOnly ? nullptr : createByteStorage(0); - *hasMsg = false; - if ((result = stream->decodePacket( - packet, &msg, params_.headerOnly, gotFrame)) >= 0 && - *gotFrame) { - // check end offset - bool endInRange = - params_.endOffset <= 0 || msg.header.pts <= params_.endOffset; - inRange_.set(stream->getIndex(), endInRange); - // if fastseek is enabled, we're returning the first - // frame that we decode after (potential) seek. - // By default, we perform accurate seek to the closest - // following frame - bool startCondition = true; - if (!fastSeek) { - startCondition = msg.header.pts >= params_.startOffset; - } - if (endInRange && startCondition) { - *hasMsg = pushMsg(std::move(msg)); - } - } - return result; -} - -bool Decoder::pushMsg(DecoderOutputMessage&& msg) { - pastDecodedPTS_ = currentDecodedPTS_; - currentDecodedPTS_ = msg.header.pts; - - if (params_.uniformSampling <= 1) { - push(std::move(msg)); - return true; - } - - double duration = - videoDurationMs_ > 0 ? 
videoDurationMs_ : params_.expectedDuration;
-  double step =
-      (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1));
-  if (pastDecodedPTS_ < step * kFramesDecoded_ &&
-      step * kFramesDecoded_ <= currentDecodedPTS_) {
-    push(std::move(msg));
-    doSeek_ = true;
-    return true;
-  }
-
-  return false;
-}
-
-void Decoder::flushStreams() {
-  VLOG(1) << "Flushing streams...";
-  for (auto& stream : streams_) {
-    DecoderOutputMessage msg;
-    while (msg.payload = (params_.headerOnly ? nullptr : createByteStorage(0)),
-           stream.second->flush(&msg, params_.headerOnly) > 0) {
-      // check end offset
-      bool endInRange =
-          params_.endOffset <= 0 || msg.header.pts <= params_.endOffset;
-      inRange_.set(stream.second->getIndex(), endInRange);
-      if (endInRange && msg.header.pts >= params_.startOffset) {
-        pushMsg(std::move(msg));
-      } else {
-        msg.payload.reset();
-      }
-    }
-  }
-}
-
-int Decoder::decode_all(const DecoderOutCallback& callback) {
-  int result;
-  do {
-    DecoderOutputMessage out;
-    if (0 == (result = decode(&out, params_.timeoutMs))) {
-      callback(std::move(out));
-    }
-  } while (result == 0);
-  return result;
-}
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/decoder.h b/torchvision/csrc/io/decoder/decoder.h
deleted file mode 100644
index 172a011f93e..00000000000
--- a/torchvision/csrc/io/decoder/decoder.h
+++ /dev/null
@@ -1,100 +0,0 @@
-#pragma once
-
-#include <bitset>
-#include <unordered_map>
-#include "seekable_buffer.h"
-#include "stream.h"
-
-#if defined(_MSC_VER)
-#include <BaseTsd.h>
-using ssize_t = SSIZE_T;
-#endif
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode media streams.
- * Media bytes can be explicitly provided through read-callback
- * or fetched internally by FFMPEG library
- */
-class Decoder : public MediaDecoder {
- public:
-  Decoder();
-  ~Decoder() override;
-
-  // MediaDecoder overrides
-  bool init(
-      const DecoderParameters& params,
-      DecoderInCallback&& in,
-      std::vector<DecoderMetadata>* metadata) override;
-  int decode_all(const DecoderOutCallback& callback) override;
-  void shutdown() override;
-  void interrupt() override;
-
- protected:
-  // function does the actual work; the derived class calls it periodically
-  // from its working thread. On success the method returns 0, ENODATA on EOF,
-  // ETIMEDOUT if no frames got decoded in the specified timeout time, and an
-  // error code on unrecoverable error.
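-  // A derived class typically drives this from its worker thread, e.g.
-  // (sketch): while (getFrame() == 0) { /* frames arrive via push() */ }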
- int getFrame(size_t workingTimeInMs = 100); - - // Derived class must override method and consume the provided message - virtual void push(DecoderOutputMessage&& buffer) = 0; - - // Fires on init call - virtual void onInit() {} - - public: - // C-style FFMPEG API requires C/static methods for callbacks - static void logFunction(void* avcl, int level, const char* cfmt, va_list vl); - static int shutdownFunction(void* ctx); - static int readFunction(void* opaque, uint8_t* buf, int size); - static int64_t seekFunction(void* opaque, int64_t offset, int whence); - // can be called by any classes or API - static void initOnce(); - - int* getPrintPrefix() { - return &printPrefix; - } - double videoDurationMs_ = -1; - - private: - // mark below function for a proper invocation - bool enableLogLevel(int level) const; - void logCallback(int level, const std::string& message); - int readCallback(uint8_t* buf, int size); - int64_t seekCallback(int64_t offset, int whence); - int shutdownCallback(); - - bool openStreams(std::vector* metadata); - Stream* findByIndex(int streamIndex) const; - Stream* findByType(const MediaFormat& format) const; - int processPacket( - Stream* stream, - AVPacket* packet, - bool* gotFrame, - bool* hasMsg, - bool fastSeek = false); - void flushStreams(); - void cleanUp(); - bool pushMsg(DecoderOutputMessage&& - msg); // returns whether frame is passed to downstream - - protected: - DecoderParameters params_; - - private: - SeekableBuffer seekableBuffer_; - int printPrefix{1}; - - std::atomic interrupted_{false}; - AVFormatContext* inputCtx_{nullptr}; - AVIOContext* avioCtx_{nullptr}; - std::unordered_map> streams_; - std::bitset<64> inRange_; - int kFramesDecoded_{0}; - int64_t pastDecodedPTS_{-1}; - int64_t currentDecodedPTS_{-1}; - bool doSeek_{false}; -}; -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/defs.h b/torchvision/csrc/io/decoder/defs.h deleted file mode 100644 index d2dc5c7935b..00000000000 --- a/torchvision/csrc/io/decoder/defs.h +++ /dev/null @@ -1,415 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -extern "C" { -#include -#include -#include -#include -#include -#include -#include "libswscale/swscale.h" -} - -namespace ffmpeg { - -// bit mask of formats, keep them in form 2^n -enum MediaType : size_t { - TYPE_AUDIO = 1, - TYPE_VIDEO = 2, - TYPE_SUBTITLE = 4, - TYPE_CC = 8, // closed captions from transport streams -}; - -// audio -struct AudioFormat { - // fields are initialized for the auto detection - // caller can specify some/all of field values if specific output is desirable - bool operator==(const AudioFormat& x) const { - return x.format == format && x.samples == samples && x.channels == channels; - } - - size_t samples{0}; // number samples per second (frequency) - size_t channels{0}; // number of channels - long format{-1}; // AVSampleFormat, auto AV_SAMPLE_FMT_NONE - size_t padding[2]; - // -- alignment 40 bytes -}; - -// video -struct VideoFormat { - // fields are initialized for the auto detection - // caller can specify some/all of field values if specific output is desirable - bool operator==(const VideoFormat& x) const { - return x.format == format && x.width == width && x.height == height; - } - /* - When width = 0, height = 0, minDimension = 0, and maxDimension = 0, - keep the original frame resolution - When width = 0, height = 0, minDimension != 0, and maxDimension = 0, - keep the aspect ratio and resize the frame so that shorter edge size is - minDimension - When width = 0, 
height = 0, minDimension = 0, and maxDimension != 0, - keep the aspect ratio and resize the frame so that longer edge size is - maxDimension - When width = 0, height = 0, minDimension != 0, and maxDimension != 0, - resize the frame so that shorter edge size is minDimension, and - longer edge size is maxDimension. The aspect ratio may not be preserved - When width = 0, height != 0, minDimension = 0, and maxDimension = 0, - keep the aspect ratio and resize the frame so that frame height is $height - When width != 0, height = 0, minDimension = 0, and maxDimension = 0, - keep the aspect ratio and resize the frame so that frame width is $width - When width != 0, height != 0, minDimension = 0, and maxDimension = 0, - resize the frame so that frame width and height are set to $width and - $height, - respectively - */ - size_t width{0}; // width in pixels - size_t height{0}; // height in pixels - long format{-1}; // AVPixelFormat, auto AV_PIX_FMT_NONE - size_t minDimension{0}; // choose min dimension and rescale accordingly - size_t maxDimension{0}; // choose max dimension and rescale accordingly - size_t cropImage{0}; // request image crop - // -- alignment 40 bytes -}; - -// subtitle/cc -struct SubtitleFormat { - long type{0}; // AVSubtitleType, auto SUBTITLE_NONE - size_t padding[4]; - // -- alignment 40 bytes -}; - -union FormatUnion { - FormatUnion() : audio() {} - explicit FormatUnion(int) : video() {} - explicit FormatUnion(char) : subtitle() {} - explicit FormatUnion(double) : subtitle() {} - AudioFormat audio; - VideoFormat video; - SubtitleFormat subtitle; - // -- alignment 40 bytes -}; - -/* - MediaFormat data structure serves as input/output parameter. - Caller assigns values for input formats - or leave default values for auto detection - For output formats all fields will be set to the specific values -*/ -struct MediaFormat { - // for using map/set data structures - bool operator<(const MediaFormat& x) const { - return type < x.type; - } - bool operator==(const MediaFormat& x) const { - if (type != x.type) { - return false; - } - switch (type) { - case TYPE_AUDIO: - return format.audio == x.format.audio; - case TYPE_VIDEO: - return format.video == x.format.video; - case TYPE_SUBTITLE: - case TYPE_CC: - return true; - default: - return false; - } - } - - explicit MediaFormat(long s = -1) : type(TYPE_AUDIO), stream(s), format() {} - explicit MediaFormat(int x, long s = -1) - : type(TYPE_VIDEO), stream(s), format(x) {} - explicit MediaFormat(char x, long s = -1) - : type(TYPE_SUBTITLE), stream(s), format(x) {} - explicit MediaFormat(double x, long s = -1) - : type(TYPE_CC), stream(s), format(x) {} - - static MediaFormat makeMediaFormat(AudioFormat format, long stream) { - MediaFormat result(stream); - result.format.audio = format; - return result; - } - - static MediaFormat makeMediaFormat(VideoFormat format, long stream) { - MediaFormat result(0, stream); - result.format.video = format; - return result; - } - - static MediaFormat makeMediaFormat(SubtitleFormat format, long stream) { - MediaFormat result('0', stream); - result.format.subtitle = format; - return result; - } - - // format type - MediaType type; - // stream index: - // set -1 for one stream auto detection, -2 for all streams auto detection, - // >= 0, specified stream, if caller knows the stream index (unlikely) - long stream; - // union keeps one of the possible formats, defined by MediaType - FormatUnion format; -}; - -struct DecoderParameters { - // local file, remote file, http url, rtmp stream uri, etc. 
anything that
-  // ffmpeg can recognize
-  std::string uri{std::string()};
-  // timeout on getting bytes for decoding
-  size_t timeoutMs{1000};
-  // logging level, default AV_LOG_PANIC
-  long logLevel{0};
-  // number of packet errors after which the decoder gives up, 0 means never
-  size_t maxPackageErrors{0};
-  // max allowed consecutive times no bytes are processed. 0 means infinite.
-  size_t maxProcessNoBytes{0};
-  // start offset (us)
-  long startOffset{0};
-  // end offset (us)
-  long endOffset{-1};
-  // logging id
-  int64_t loggingUuid{0};
-  // internal max seekable buffer size
-  size_t maxSeekableBytes{0};
-  // adjust header pts to the epoch time
-  bool convertPtsToWallTime{false};
-  // indicate if input stream is an encoded image
-  bool isImage{false};
-  // listen and wait for new rtmp stream
-  bool listen{false};
-  // don't copy frame body, only header
-  bool headerOnly{false};
-  // enable fast seek (seek only to keyframes)
-  bool fastSeek{false};
-  // interrupt init method on timeout
-  bool preventStaleness{true};
-  // seek tolerated accuracy (us)
-  double seekAccuracy{1000000.0};
-  // Allow multithreaded decoding for numThreads > 1;
-  // numThreads = 0 sets up sensible defaults
-  int numThreads{1};
-  // what media types should be processed, default none
-  std::set<MediaType> formats;
-
-  // can be used for asynchronous decoders
-  size_t cacheSize{8192}; // how many bytes to cache before we stop reading
-  size_t cacheTimeoutMs{1000}; // timeout on bytes writing
-  bool enforceCacheSize{false}; // drop output frames if cache is full
-  bool mergeAudioMessages{false}; // combine collocated audio messages together
-
-  std::string tlsCertFile;
-  std::string tlsKeyFile;
-
-  // Skip packets that fail with EPERM errors and continue decoding.
-  bool skipOperationNotPermittedPackets{false};
-
-  // probing size in bytes, i.e. the size of the data to analyze to get stream
-  // information. A higher value will enable detecting more information in case
-  // it is dispersed into the stream, but will increase latency. Must be an
-  // integer not less than 32. It is 5000000 by default.
-  int64_t probeSize{5000000};
-
-  // Expected duration of the video to be decoded, mainly used with uniform
-  // sampling
-  float expectedDuration{0.0f};
-
-  // Sample N key-frames from the video roughly uniformly across the timeline
-  int uniformSampling{0};
-
-  // with 0, ffmpeg allocates buffers of size 32768 bytes for encoded frames.
-  // Override this with a bigger buffer size if needed.
-  int64_t maxEncodedBufferSize{0};
-};
-
-struct DecoderHeader {
-  // message id, from 0 till ...
-  size_t seqno{0};
-  // decoded timestamp in microseconds from either beginning of the stream or
-  // from epoch time, see DecoderParameters::convertPtsToWallTime
-  long pts{0};
-  // decoded key frame
-  size_t keyFrame{0};
-  // frames per second, valid only for video streams
-  double fps{0};
-  // format specifies what kind of frame is in the payload
-  MediaFormat format;
-};
-
-// Abstract ByteStorage interface
-class ByteStorage {
- public:
-  virtual ~ByteStorage() = default;
-  // makes sure that buffer has at least n bytes available for writing; if not,
-  // storage must reallocate memory.
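-  // A typical producer cycle (a sketch; fillBytes stands in for whatever
-  // hypothetical source supplies the bytes):
-  //   storage->ensure(n);
-  //   size_t written = fillBytes(storage->writableTail(), n);
-  //   storage->append(written);
-  // A consumer mirrors this with data()/length() and trim().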
-  virtual void ensure(size_t n) = 0;
-  // caller must not write more than the available bytes
-  virtual uint8_t* writableTail() = 0;
-  // caller confirms that n bytes were written to the writable tail
-  virtual void append(size_t n) = 0;
-  // caller confirms that n bytes were read from the read buffer
-  virtual void trim(size_t n) = 0;
-  // gives access to the beginning of the read buffer
-  virtual const uint8_t* data() const = 0;
-  // returns the stored size in bytes
-  virtual size_t length() const = 0;
-  // returns available capacity for the writable tail
-  virtual size_t tail() const = 0;
-  // clears content, keeps capacity
-  virtual void clear() = 0;
-};
-
-struct DecoderOutputMessage {
-  DecoderHeader header;
-  std::unique_ptr<ByteStorage> payload;
-};
-
-/*
- * External provider of the encoded bytes; the specific implementation is left
- * for different use cases, like file, memory, external network end-points,
- * etc. Normally the input/output parameter @out is set to a valid, non-null
- * buffer pointer, which indicates a "read" call; however there are "seek"
- * modes as well.
-
- * @out != nullptr => read from the current offset, @whence is ignored,
- * @size bytes to read => returns the number of bytes read, 0 if no more bytes
- * are available, < 0 on error.
-
- * @out == nullptr, @timeoutMs == 0 => does the provider support "seek"
- * capability in the first place? @size & @whence are ignored; returns 0 on
- * success, < 0 if "seek" mode is not supported.
-
- * @out == nullptr, @timeoutMs != 0 => normal seek call
- * offset == @size, i.e. @whence = [SEEK_SET, SEEK_CUR, SEEK_END, AVSEEK_SIZE)
- * returns < 0 on error, the position if @whence = [SEEK_SET, SEEK_CUR,
- * SEEK_END], the length of the buffer if @whence = [AVSEEK_SIZE].
- */
-using DecoderInCallback =
-    std::function<int(uint8_t* out, int size, int whence, uint64_t timeoutMs)>;
-
-using DecoderOutCallback = std::function<void(DecoderOutputMessage&&)>;
-
-struct DecoderMetadata {
-  // time base numerator
-  long num{0};
-  // time base denominator
-  long den{1};
-  // duration of the stream, in microseconds, if available
-  long duration{-1};
-  // frames per second, valid only for video streams
-  double fps{0};
-  // format specifies what kind of frame is in the payload
-  MediaFormat format;
-};
-/**
- * Abstract class for decoding media bytes.
- * It has two different modes: internal media-bytes retrieval for a given uri,
- * and an external media-bytes provider in the case of memory streams.
- */
-class MediaDecoder {
- public:
-  virtual ~MediaDecoder() = default;
-
-  /**
-   * Initializes the media decoder with parameters and
-   * calls the callback when media bytes are available.
-   * Media bytes are fetched internally from the provided URI,
-   * or by invoking the provided input callback.
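-   * (For in-memory sources, MemoryBuffer::getCallback in memory_buffer.h
-   * builds a callback with these semantics.)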
-   * The input callback must be empty for the internal media provider.
-   * The caller can provide a non-null metadata container to obtain the
-   * streams' metadata (optional).
-   */
-  virtual bool init(
-      const DecoderParameters& params,
-      DecoderInCallback&& in,
-      std::vector<DecoderMetadata>* metadata) = 0;
-
-  /**
-   * Polls one decoded frame from the decoder.
-   * Returns an error code; 0 means success.
-   */
-  virtual int decode(DecoderOutputMessage* out, uint64_t timeoutMs) = 0;
-
-  /**
-   * Polls available decoded bytes from the decoder, until EOF or error.
-   */
-  virtual int decode_all(const DecoderOutCallback& callback) = 0;
-
-  /**
-   * Stops calling the callback, releases resources.
-   */
-  virtual void shutdown() = 0;
-
-  /**
-   * Interrupts whatever the decoder is doing, at any time.
-   */
-  virtual void interrupt() = 0;
-
-  /**
-   * Factory to create ByteStorage class instances; the particular
-   * implementation is left to the derived class. The caller provides the
-   * initially allocated size.
-   */
-  virtual std::unique_ptr<ByteStorage> createByteStorage(size_t n) = 0;
-};
-
-struct SamplerParameters {
-  MediaType type{TYPE_AUDIO};
-  FormatUnion in;
-  FormatUnion out;
-  int64_t loggingUuid{0};
-};
-
-/**
- * Abstract class for sampling media bytes
- */
-class MediaSampler {
- public:
-  virtual ~MediaSampler() = default;
-
-  /**
-   * Initializes the media sampler with parameters
-   */
-  virtual bool init(const SamplerParameters& params) = 0;
-
-  /**
-   * Samples media bytes.
-   * Returns an error code < 0, or >= 0 for success, indicating the number of
-   * bytes processed.
-   * Set @in to null to flush data.
-   */
-  virtual int sample(const ByteStorage* in, ByteStorage* out) = 0;
-
-  /**
-   * Releases resources
-   */
-  virtual void shutdown() = 0;
-
-  /*
-   * Returns media type
-   */
-  MediaType getMediaType() const {
-    return params_.type;
-  }
-  /*
-   * Returns formats
-   */
-  FormatUnion getInputFormat() const {
-    return params_.in;
-  }
-  FormatUnion getOutFormat() const {
-    return params_.out;
-  }
-
- protected:
-  SamplerParameters params_;
-};
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/gpu/README.rst b/torchvision/csrc/io/decoder/gpu/README.rst
deleted file mode 100644
index e4573d7fe75..00000000000
--- a/torchvision/csrc/io/decoder/gpu/README.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-GPU Decoder
-===========
-
-The GPU decoder depends on ffmpeg for demuxing, uses the NVDECODE APIs from the nvidia-video-codec sdk, and uses CUDA for processing on the GPU. To use it, follow these steps:
-
-* Download the latest `nvidia-video-codec-sdk `_
-* Extract the zipped file.
-* Set the TORCHVISION_INCLUDE environment variable to the location of the video codec headers (`nvcuvid.h` and `cuviddec.h`), which are under the `Interface` directory.
-* Set the TORCHVISION_LIBRARY environment variable to the location of the video codec library (`libnvcuvid.so`), which is under the `Lib/linux/stubs/x86_64` directory.
-* Install the latest ffmpeg from the `conda-forge` channel.
-
-.. code:: bash
-
-   conda install -c conda-forge ffmpeg
-
-* Set the CUDA_HOME environment variable to the CUDA root directory.
-* Build torchvision from source:
-
-.. code:: bash
-
-   pip install .
-v --no-build-isolation diff --git a/torchvision/csrc/io/decoder/gpu/decoder.cpp b/torchvision/csrc/io/decoder/gpu/decoder.cpp deleted file mode 100644 index f7377ede38b..00000000000 --- a/torchvision/csrc/io/decoder/gpu/decoder.cpp +++ /dev/null @@ -1,405 +0,0 @@ -#include "decoder.h" -#include -#include -#include -#include -#include - -static float chroma_height_factor(cudaVideoSurfaceFormat surface_format) { - return (surface_format == cudaVideoSurfaceFormat_YUV444 || - surface_format == cudaVideoSurfaceFormat_YUV444_16Bit) - ? 1.0 - : 0.5; -} - -static int chroma_plane_count(cudaVideoSurfaceFormat surface_format) { - return (surface_format == cudaVideoSurfaceFormat_YUV444 || - surface_format == cudaVideoSurfaceFormat_YUV444_16Bit) - ? 2 - : 1; -} - -/* Initialise cu_context and video_codec, create context lock and create parser - * object. - */ -void Decoder::init(CUcontext context, cudaVideoCodec codec) { - cu_context = context; - video_codec = codec; - check_for_cuda_errors( - cuvidCtxLockCreate(&ctx_lock, cu_context), __LINE__, __FILE__); - - CUVIDPARSERPARAMS parser_params = {}; - parser_params.CodecType = codec; - parser_params.ulMaxNumDecodeSurfaces = 1; - parser_params.ulClockRate = 1000; - parser_params.ulMaxDisplayDelay = 0u; - parser_params.pUserData = this; - parser_params.pfnSequenceCallback = video_sequence_handler; - parser_params.pfnDecodePicture = picture_decode_handler; - parser_params.pfnDisplayPicture = picture_display_handler; - parser_params.pfnGetOperatingPoint = operating_point_handler; - - check_for_cuda_errors( - cuvidCreateVideoParser(&parser, &parser_params), __LINE__, __FILE__); -} - -/* Destroy parser object and context lock. - */ -Decoder::~Decoder() { - if (parser) { - cuvidDestroyVideoParser(parser); - } - cuvidCtxLockDestroy(ctx_lock); -} - -/* Destroy CUvideodecoder object and free up all the unreturned decoded frames. - */ -void Decoder::release() { - cuCtxPushCurrent(cu_context); - if (decoder) { - cuvidDestroyDecoder(decoder); - } - cuCtxPopCurrent(nullptr); -} - -/* Trigger video decoding. - */ -void Decoder::decode(const uint8_t* data, unsigned long size) { - CUVIDSOURCEDATAPACKET pkt = {}; - pkt.flags = CUVID_PKT_TIMESTAMP; - pkt.payload_size = size; - pkt.payload = data; - pkt.timestamp = 0; - if (!data || size == 0) { - pkt.flags |= CUVID_PKT_ENDOFSTREAM; - } - check_for_cuda_errors(cuvidParseVideoData(parser, &pkt), __LINE__, __FILE__); - cuvidStream = 0; -} - -/* Fetch a decoded frame and remove it from the queue. - */ -torch::Tensor Decoder::fetch_frame() { - if (decoded_frames.empty()) { - auto options = - torch::TensorOptions().dtype(torch::kU8).device(torch::kCUDA); - return torch::zeros({0}, options); - } - torch::Tensor frame = decoded_frames.front(); - decoded_frames.pop(); - return frame; -} - -/* Called when a picture is ready to be decoded. - */ -int Decoder::handle_picture_decode(CUVIDPICPARAMS* pic_params) { - if (!decoder) { - TORCH_CHECK(false, "Uninitialised decoder"); - } - pic_num_in_decode_order[pic_params->CurrPicIdx] = decode_pic_count++; - check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__); - check_for_cuda_errors( - cuvidDecodePicture(decoder, pic_params), __LINE__, __FILE__); - check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); - return 1; -} - -/* Process the decoded data and copy it to a cuda memory location. 
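- * The mapped surface is NV12: a full-resolution luma (Y) plane followed by
- * an interleaved, half-resolution chroma (UV) plane. The second plane starts
- * source_pitch * surface_height bytes in (height rounded up to even), which
- * is what the source_arr offset below computes before NPP converts the two
- * planes to packed RGB.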
- */ -int Decoder::handle_picture_display(CUVIDPARSERDISPINFO* disp_info) { - CUVIDPROCPARAMS proc_params = {}; - proc_params.progressive_frame = disp_info->progressive_frame; - proc_params.second_field = disp_info->repeat_first_field + 1; - proc_params.top_field_first = disp_info->top_field_first; - proc_params.unpaired_field = disp_info->repeat_first_field < 0; - proc_params.output_stream = cuvidStream; - - CUdeviceptr source_frame = 0; - unsigned int source_pitch = 0; - check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__); - check_for_cuda_errors( - cuvidMapVideoFrame( - decoder, - disp_info->picture_index, - &source_frame, - &source_pitch, - &proc_params), - __LINE__, - __FILE__); - - CUVIDGETDECODESTATUS decode_status; - memset(&decode_status, 0, sizeof(decode_status)); - CUresult result = - cuvidGetDecodeStatus(decoder, disp_info->picture_index, &decode_status); - if (result == CUDA_SUCCESS && - (decode_status.decodeStatus == cuvidDecodeStatus_Error || - decode_status.decodeStatus == cuvidDecodeStatus_Error_Concealed)) { - VLOG(1) << "Decode Error occurred for picture " - << pic_num_in_decode_order[disp_info->picture_index]; - } - - auto options = torch::TensorOptions().dtype(torch::kU8).device(torch::kCUDA); - torch::Tensor decoded_frame = torch::empty({get_height(), width, 3}, options); - uint8_t* frame_ptr = decoded_frame.data_ptr(); - const uint8_t* const source_arr[] = { - (const uint8_t* const)source_frame, - (const uint8_t* const)(source_frame + - source_pitch * ((surface_height + 1) & ~1))}; - - auto err = nppiNV12ToRGB_709CSC_8u_P2C3R( - source_arr, - source_pitch, - frame_ptr, - width * 3, - {(int)decoded_frame.size(1), (int)decoded_frame.size(0)}); - - TORCH_CHECK( - err == NPP_NO_ERROR, - "Failed to convert from NV12 to RGB. Error code:", - err); - - check_for_cuda_errors(cuStreamSynchronize(cuvidStream), __LINE__, __FILE__); - decoded_frames.push(decoded_frame); - check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); - - check_for_cuda_errors( - cuvidUnmapVideoFrame(decoder, source_frame), __LINE__, __FILE__); - return 1; -} - -/* Query the capabilities of the underlying hardware video decoder and - * verify if the hardware supports decoding the passed video. 
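- * The checks below compare the stream against the reported decode_caps:
- * codec support, maximum width/height, and the macroblock count, computed as
- * (coded_width / 16) * (coded_height / 16) via the >> 4 shifts; finally the
- * requested output surface format, with NV12/P016/YUV444 fallbacks.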
- */ -void Decoder::query_hardware(CUVIDEOFORMAT* video_format) { - CUVIDDECODECAPS decode_caps = {}; - decode_caps.eCodecType = video_format->codec; - decode_caps.eChromaFormat = video_format->chroma_format; - decode_caps.nBitDepthMinus8 = video_format->bit_depth_luma_minus8; - - check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__); - check_for_cuda_errors(cuvidGetDecoderCaps(&decode_caps), __LINE__, __FILE__); - check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); - - if (!decode_caps.bIsSupported) { - TORCH_CHECK(false, "Codec not supported on this GPU"); - } - if ((video_format->coded_width > decode_caps.nMaxWidth) || - (video_format->coded_height > decode_caps.nMaxHeight)) { - TORCH_CHECK( - false, - "Resolution : ", - video_format->coded_width, - "x", - video_format->coded_height, - "\nMax Supported (wxh) : ", - decode_caps.nMaxWidth, - "x", - decode_caps.nMaxHeight, - "\nResolution not supported on this GPU"); - } - if ((video_format->coded_width >> 4) * (video_format->coded_height >> 4) > - decode_caps.nMaxMBCount) { - TORCH_CHECK( - false, - "MBCount : ", - (video_format->coded_width >> 4) * (video_format->coded_height >> 4), - "\nMax Supported mbcnt : ", - decode_caps.nMaxMBCount, - "\nMBCount not supported on this GPU"); - } - // Check if output format supported. If not, check fallback options - if (!(decode_caps.nOutputFormatMask & (1 << video_output_format))) { - if (decode_caps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12)) { - video_output_format = cudaVideoSurfaceFormat_NV12; - } else if ( - decode_caps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_P016)) { - video_output_format = cudaVideoSurfaceFormat_P016; - } else if ( - decode_caps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444)) { - video_output_format = cudaVideoSurfaceFormat_YUV444; - } else if ( - decode_caps.nOutputFormatMask & - (1 << cudaVideoSurfaceFormat_YUV444_16Bit)) { - video_output_format = cudaVideoSurfaceFormat_YUV444_16Bit; - } else { - TORCH_CHECK(false, "No supported output format found"); - } - } -} - -/* Called before decoding frames and/or whenever there is a configuration - * change. - */ -int Decoder::handle_video_sequence(CUVIDEOFORMAT* video_format) { - // video_codec has been set in init(). Here it's set - // again for potential correction. - video_codec = video_format->codec; - video_chroma_format = video_format->chroma_format; - bit_depth_minus8 = video_format->bit_depth_luma_minus8; - bytes_per_pixel = bit_depth_minus8 > 0 ? 2 : 1; - // Set the output surface format same as chroma format - switch (video_chroma_format) { - case cudaVideoChromaFormat_Monochrome: - case cudaVideoChromaFormat_420: - video_output_format = video_format->bit_depth_luma_minus8 - ? cudaVideoSurfaceFormat_P016 - : cudaVideoSurfaceFormat_NV12; - break; - case cudaVideoChromaFormat_444: - video_output_format = video_format->bit_depth_luma_minus8 - ? cudaVideoSurfaceFormat_YUV444_16Bit - : cudaVideoSurfaceFormat_YUV444; - break; - case cudaVideoChromaFormat_422: - video_output_format = cudaVideoSurfaceFormat_NV12; - } - - query_hardware(video_format); - - if (width && luma_height && chroma_height) { - // cuvidCreateDecoder() has been called before and now there's possible - // config change. 
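-    // reconfigure_decoder() below only handles resolution and display-area
-    // changes; bit-depth or chroma-format changes are rejected outright.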
- return reconfigure_decoder(video_format); - } - - cu_video_format = *video_format; - unsigned long decode_surface = video_format->min_num_decode_surfaces; - cudaVideoDeinterlaceMode deinterlace_mode = cudaVideoDeinterlaceMode_Adaptive; - - if (video_format->progressive_sequence) { - deinterlace_mode = cudaVideoDeinterlaceMode_Weave; - } - - CUVIDDECODECREATEINFO video_decode_create_info = {}; - video_decode_create_info.ulWidth = video_format->coded_width; - video_decode_create_info.ulHeight = video_format->coded_height; - video_decode_create_info.ulNumDecodeSurfaces = decode_surface; - video_decode_create_info.CodecType = video_format->codec; - video_decode_create_info.ChromaFormat = video_format->chroma_format; - // With PreferCUVID, JPEG is still decoded by CUDA while video is decoded - // by NVDEC hardware - video_decode_create_info.ulCreationFlags = cudaVideoCreate_PreferCUVID; - video_decode_create_info.bitDepthMinus8 = video_format->bit_depth_luma_minus8; - video_decode_create_info.OutputFormat = video_output_format; - video_decode_create_info.DeinterlaceMode = deinterlace_mode; - video_decode_create_info.ulNumOutputSurfaces = 2; - video_decode_create_info.vidLock = ctx_lock; - - // AV1 has max width/height of sequence in sequence header - if (video_format->codec == cudaVideoCodec_AV1 && - video_format->seqhdr_data_length > 0) { - CUVIDEOFORMATEX* video_format_ex = (CUVIDEOFORMATEX*)video_format; - max_width = video_format_ex->av1.max_width; - max_height = video_format_ex->av1.max_height; - } - if (max_width < video_format->coded_width) { - max_width = video_format->coded_width; - } - if (max_height < video_format->coded_height) { - max_height = video_format->coded_height; - } - video_decode_create_info.ulMaxWidth = max_width; - video_decode_create_info.ulMaxHeight = max_height; - width = video_format->display_area.right - video_format->display_area.left; - luma_height = - video_format->display_area.bottom - video_format->display_area.top; - video_decode_create_info.ulTargetWidth = video_format->coded_width; - video_decode_create_info.ulTargetHeight = video_format->coded_height; - chroma_height = - (int)(ceil(luma_height * chroma_height_factor(video_output_format))); - num_chroma_planes = chroma_plane_count(video_output_format); - surface_height = video_decode_create_info.ulTargetHeight; - surface_width = video_decode_create_info.ulTargetWidth; - display_rect.bottom = video_decode_create_info.display_area.bottom; - display_rect.top = video_decode_create_info.display_area.top; - display_rect.left = video_decode_create_info.display_area.left; - display_rect.right = video_decode_create_info.display_area.right; - - check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__); - check_for_cuda_errors( - cuvidCreateDecoder(&decoder, &video_decode_create_info), - __LINE__, - __FILE__); - check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); - return decode_surface; -} - -int Decoder::reconfigure_decoder(CUVIDEOFORMAT* video_format) { - if (video_format->bit_depth_luma_minus8 != - cu_video_format.bit_depth_luma_minus8 || - video_format->bit_depth_chroma_minus8 != - cu_video_format.bit_depth_chroma_minus8) { - TORCH_CHECK(false, "Reconfigure not supported for bit depth change"); - } - if (video_format->chroma_format != cu_video_format.chroma_format) { - TORCH_CHECK(false, "Reconfigure not supported for chroma format change"); - } - - bool decode_res_change = - !(video_format->coded_width == cu_video_format.coded_width && - video_format->coded_height == 
cu_video_format.coded_height); - bool display_rect_change = - !(video_format->display_area.bottom == - cu_video_format.display_area.bottom && - video_format->display_area.top == cu_video_format.display_area.top && - video_format->display_area.left == cu_video_format.display_area.left && - video_format->display_area.right == cu_video_format.display_area.right); - - unsigned int decode_surface = video_format->min_num_decode_surfaces; - - if ((video_format->coded_width > max_width) || - (video_format->coded_height > max_height)) { - // For VP9, let driver handle the change if new width/height > - // maxwidth/maxheight - if (video_codec != cudaVideoCodec_VP9) { - TORCH_CHECK( - false, - "Reconfigure not supported when width/height > maxwidth/maxheight"); - } - return 1; - } - - if (!decode_res_change) { - // If the coded_width/coded_height hasn't changed but display resolution has - // changed, then need to update width/height for correct output without - // cropping. Example : 1920x1080 vs 1920x1088. - if (display_rect_change) { - width = - video_format->display_area.right - video_format->display_area.left; - luma_height = - video_format->display_area.bottom - video_format->display_area.top; - chroma_height = - (int)ceil(luma_height * chroma_height_factor(video_output_format)); - num_chroma_planes = chroma_plane_count(video_output_format); - } - return 1; - } - cu_video_format.coded_width = video_format->coded_width; - cu_video_format.coded_height = video_format->coded_height; - CUVIDRECONFIGUREDECODERINFO reconfig_params = {}; - reconfig_params.ulWidth = video_format->coded_width; - reconfig_params.ulHeight = video_format->coded_height; - reconfig_params.ulTargetWidth = surface_width; - reconfig_params.ulTargetHeight = surface_height; - reconfig_params.ulNumDecodeSurfaces = decode_surface; - reconfig_params.display_area.bottom = display_rect.bottom; - reconfig_params.display_area.top = display_rect.top; - reconfig_params.display_area.left = display_rect.left; - reconfig_params.display_area.right = display_rect.right; - - check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__); - check_for_cuda_errors( - cuvidReconfigureDecoder(decoder, &reconfig_params), __LINE__, __FILE__); - check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); - - return decode_surface; -} - -/* Called from AV1 sequence header to get operating point of an AV1 bitstream. - */ -int Decoder::get_operating_point(CUVIDOPERATINGPOINTINFO* oper_point_info) { - return oper_point_info->codec == cudaVideoCodec_AV1 && - oper_point_info->av1.operating_points_cnt > 1 - ? 
0 - : -1; -} diff --git a/torchvision/csrc/io/decoder/gpu/decoder.h b/torchvision/csrc/io/decoder/gpu/decoder.h deleted file mode 100644 index 5ad685ec746..00000000000 --- a/torchvision/csrc/io/decoder/gpu/decoder.h +++ /dev/null @@ -1,89 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -static auto check_for_cuda_errors = - [](CUresult result, int line_num, std::string file_name) { - if (CUDA_SUCCESS != result) { - const char* error_name = nullptr; - - TORCH_CHECK( - CUDA_SUCCESS != cuGetErrorName(result, &error_name), - "CUDA error: ", - error_name, - " in ", - file_name, - " at line ", - line_num) - TORCH_CHECK( - false, "Error: ", result, " in ", file_name, " at line ", line_num); - } - }; - -struct Rect { - int left, top, right, bottom; -}; - -class Decoder { - public: - Decoder() {} - ~Decoder(); - void init(CUcontext, cudaVideoCodec); - void release(); - void decode(const uint8_t*, unsigned long); - torch::Tensor fetch_frame(); - int get_height() const { - return luma_height; - } - - private: - unsigned int width = 0, luma_height = 0, chroma_height = 0; - unsigned int surface_height = 0, surface_width = 0; - unsigned int max_width = 0, max_height = 0; - unsigned int num_chroma_planes = 0; - int bit_depth_minus8 = 0, bytes_per_pixel = 1; - int decode_pic_count = 0, pic_num_in_decode_order[32]; - std::queue decoded_frames; - CUcontext cu_context = NULL; - CUvideoctxlock ctx_lock; - CUvideoparser parser = NULL; - CUvideodecoder decoder = NULL; - CUstream cuvidStream = 0; - cudaVideoCodec video_codec = cudaVideoCodec_NumCodecs; - cudaVideoChromaFormat video_chroma_format = cudaVideoChromaFormat_420; - cudaVideoSurfaceFormat video_output_format = cudaVideoSurfaceFormat_NV12; - CUVIDEOFORMAT cu_video_format = {}; - Rect display_rect = {}; - - static int video_sequence_handler( - void* user_data, - CUVIDEOFORMAT* video_format) { - return ((Decoder*)user_data)->handle_video_sequence(video_format); - } - static int picture_decode_handler( - void* user_data, - CUVIDPICPARAMS* pic_params) { - return ((Decoder*)user_data)->handle_picture_decode(pic_params); - } - static int picture_display_handler( - void* user_data, - CUVIDPARSERDISPINFO* disp_info) { - return ((Decoder*)user_data)->handle_picture_display(disp_info); - } - static int operating_point_handler( - void* user_data, - CUVIDOPERATINGPOINTINFO* operating_info) { - return ((Decoder*)user_data)->get_operating_point(operating_info); - } - - void query_hardware(CUVIDEOFORMAT*); - int reconfigure_decoder(CUVIDEOFORMAT*); - int handle_video_sequence(CUVIDEOFORMAT*); - int handle_picture_decode(CUVIDPICPARAMS*); - int handle_picture_display(CUVIDPARSERDISPINFO*); - int get_operating_point(CUVIDOPERATINGPOINTINFO*); -}; diff --git a/torchvision/csrc/io/decoder/gpu/demuxer.h b/torchvision/csrc/io/decoder/gpu/demuxer.h deleted file mode 100644 index f6e72dceee1..00000000000 --- a/torchvision/csrc/io/decoder/gpu/demuxer.h +++ /dev/null @@ -1,257 +0,0 @@ -extern "C" { -#include -#include -#include -#include -} - -class Demuxer { - private: - AVFormatContext* fmtCtx = NULL; - AVBSFContext* bsfCtx = NULL; - AVPacket pkt, pktFiltered; - AVCodecID eVideoCodec; - uint8_t* dataWithHeader = NULL; - bool bMp4H264, bMp4HEVC, bMp4MPEG4; - unsigned int frameCount = 0; - int iVideoStream; - double timeBase = 0.0; - - public: - Demuxer(const char* filePath) { - avformat_network_init(); - TORCH_CHECK( - 0 <= avformat_open_input(&fmtCtx, filePath, NULL, NULL), - "avformat_open_input() failed at line ", - __LINE__, - " in 
demuxer.h\n"); - if (!fmtCtx) { - TORCH_CHECK( - false, - "Encountered NULL AVFormatContext at line ", - __LINE__, - " in demuxer.h\n"); - } - - TORCH_CHECK( - 0 <= avformat_find_stream_info(fmtCtx, NULL), - "avformat_find_stream_info() failed at line ", - __LINE__, - " in demuxer.h\n"); - iVideoStream = - av_find_best_stream(fmtCtx, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0); - if (iVideoStream < 0) { - TORCH_CHECK( - false, - "av_find_best_stream() failed at line ", - __LINE__, - " in demuxer.h\n"); - } - - eVideoCodec = fmtCtx->streams[iVideoStream]->codecpar->codec_id; - AVRational rTimeBase = fmtCtx->streams[iVideoStream]->time_base; - timeBase = av_q2d(rTimeBase); - - bMp4H264 = eVideoCodec == AV_CODEC_ID_H264 && - (!strcmp(fmtCtx->iformat->long_name, "QuickTime / MOV") || - !strcmp(fmtCtx->iformat->long_name, "FLV (Flash Video)") || - !strcmp(fmtCtx->iformat->long_name, "Matroska / WebM")); - bMp4HEVC = eVideoCodec == AV_CODEC_ID_HEVC && - (!strcmp(fmtCtx->iformat->long_name, "QuickTime / MOV") || - !strcmp(fmtCtx->iformat->long_name, "FLV (Flash Video)") || - !strcmp(fmtCtx->iformat->long_name, "Matroska / WebM")); - bMp4MPEG4 = eVideoCodec == AV_CODEC_ID_MPEG4 && - (!strcmp(fmtCtx->iformat->long_name, "QuickTime / MOV") || - !strcmp(fmtCtx->iformat->long_name, "FLV (Flash Video)") || - !strcmp(fmtCtx->iformat->long_name, "Matroska / WebM")); - - av_init_packet(&pkt); - pkt.data = NULL; - pkt.size = 0; - av_init_packet(&pktFiltered); - pktFiltered.data = NULL; - pktFiltered.size = 0; - - if (bMp4H264) { - const AVBitStreamFilter* bsf = av_bsf_get_by_name("h264_mp4toannexb"); - if (!bsf) { - TORCH_CHECK( - false, - "av_bsf_get_by_name() failed at line ", - __LINE__, - " in demuxer.h\n"); - } - TORCH_CHECK( - 0 <= av_bsf_alloc(bsf, &bsfCtx), - "av_bsf_alloc() failed at line ", - __LINE__, - " in demuxer.h\n"); - avcodec_parameters_copy( - bsfCtx->par_in, fmtCtx->streams[iVideoStream]->codecpar); - TORCH_CHECK( - 0 <= av_bsf_init(bsfCtx), - "av_bsf_init() failed at line ", - __LINE__, - " in demuxer.h\n"); - } - if (bMp4HEVC) { - const AVBitStreamFilter* bsf = av_bsf_get_by_name("hevc_mp4toannexb"); - if (!bsf) { - TORCH_CHECK( - false, - "av_bsf_get_by_name() failed at line ", - __LINE__, - " in demuxer.h\n"); - } - TORCH_CHECK( - 0 <= av_bsf_alloc(bsf, &bsfCtx), - "av_bsf_alloc() failed at line ", - __LINE__, - " in demuxer.h\n"); - avcodec_parameters_copy( - bsfCtx->par_in, fmtCtx->streams[iVideoStream]->codecpar); - TORCH_CHECK( - 0 <= av_bsf_init(bsfCtx), - "av_bsf_init() failed at line ", - __LINE__, - " in demuxer.h\n"); - } - } - - ~Demuxer() { - if (!fmtCtx) { - return; - } - if (pkt.data) { - av_packet_unref(&pkt); - } - if (pktFiltered.data) { - av_packet_unref(&pktFiltered); - } - if (bsfCtx) { - av_bsf_free(&bsfCtx); - } - avformat_close_input(&fmtCtx); - if (dataWithHeader) { - av_free(dataWithHeader); - } - } - - AVCodecID get_video_codec() { - return eVideoCodec; - } - - double get_duration() const { - return (double)fmtCtx->duration / AV_TIME_BASE; - } - - double get_fps() const { - return av_q2d(fmtCtx->streams[iVideoStream]->r_frame_rate); - } - - bool demux(uint8_t** video, unsigned long* videoBytes) { - if (!fmtCtx) { - return false; - } - *videoBytes = 0; - - if (pkt.data) { - av_packet_unref(&pkt); - } - int e = 0; - while ((e = av_read_frame(fmtCtx, &pkt)) >= 0 && - pkt.stream_index != iVideoStream) { - av_packet_unref(&pkt); - } - if (e < 0) { - return false; - } - - if (bMp4H264 || bMp4HEVC) { - if (pktFiltered.data) { - av_packet_unref(&pktFiltered); - } - 
TORCH_CHECK( - 0 <= av_bsf_send_packet(bsfCtx, &pkt), - "av_bsf_send_packet() failed at line ", - __LINE__, - " in demuxer.h\n"); - TORCH_CHECK( - 0 <= av_bsf_receive_packet(bsfCtx, &pktFiltered), - "av_bsf_receive_packet() failed at line ", - __LINE__, - " in demuxer.h\n"); - *video = pktFiltered.data; - *videoBytes = pktFiltered.size; - } else { - if (bMp4MPEG4 && (frameCount == 0)) { - int extraDataSize = - fmtCtx->streams[iVideoStream]->codecpar->extradata_size; - - if (extraDataSize > 0) { - dataWithHeader = (uint8_t*)av_malloc( - extraDataSize + pkt.size - 3 * sizeof(uint8_t)); - if (!dataWithHeader) { - TORCH_CHECK( - false, - "av_malloc() failed at line ", - __LINE__, - " in demuxer.h\n"); - } - memcpy( - dataWithHeader, - fmtCtx->streams[iVideoStream]->codecpar->extradata, - extraDataSize); - memcpy( - dataWithHeader + extraDataSize, - pkt.data + 3, - pkt.size - 3 * sizeof(uint8_t)); - *video = dataWithHeader; - *videoBytes = extraDataSize + pkt.size - 3 * sizeof(uint8_t); - } - } else { - *video = pkt.data; - *videoBytes = pkt.size; - } - } - frameCount++; - return true; - } - - void seek(double timestamp, int flag) { - int64_t time = timestamp * AV_TIME_BASE; - TORCH_CHECK( - 0 <= av_seek_frame(fmtCtx, -1, time, flag), - "av_seek_frame() failed at line ", - __LINE__, - " in demuxer.h\n"); - } -}; - -inline cudaVideoCodec ffmpeg_to_codec(AVCodecID id) { - switch (id) { - case AV_CODEC_ID_MPEG1VIDEO: - return cudaVideoCodec_MPEG1; - case AV_CODEC_ID_MPEG2VIDEO: - return cudaVideoCodec_MPEG2; - case AV_CODEC_ID_MPEG4: - return cudaVideoCodec_MPEG4; - case AV_CODEC_ID_WMV3: - case AV_CODEC_ID_VC1: - return cudaVideoCodec_VC1; - case AV_CODEC_ID_H264: - return cudaVideoCodec_H264; - case AV_CODEC_ID_HEVC: - return cudaVideoCodec_HEVC; - case AV_CODEC_ID_VP8: - return cudaVideoCodec_VP8; - case AV_CODEC_ID_VP9: - return cudaVideoCodec_VP9; - case AV_CODEC_ID_MJPEG: - return cudaVideoCodec_JPEG; - case AV_CODEC_ID_AV1: - return cudaVideoCodec_AV1; - default: - return cudaVideoCodec_NumCodecs; - } -} diff --git a/torchvision/csrc/io/decoder/gpu/gpu_decoder.cpp b/torchvision/csrc/io/decoder/gpu/gpu_decoder.cpp deleted file mode 100644 index 1fe3ec8ab7a..00000000000 --- a/torchvision/csrc/io/decoder/gpu/gpu_decoder.cpp +++ /dev/null @@ -1,65 +0,0 @@ -#include "gpu_decoder.h" -#include - -/* Set cuda device, create cuda context and initialise the demuxer and decoder. - */ -GPUDecoder::GPUDecoder(std::string src_file, torch::Device dev) - : demuxer(src_file.c_str()) { - at::cuda::CUDAGuard device_guard(dev); - device = device_guard.current_device().index(); - check_for_cuda_errors( - cuDevicePrimaryCtxRetain(&ctx, device), __LINE__, __FILE__); - decoder.init(ctx, ffmpeg_to_codec(demuxer.get_video_codec())); - initialised = true; -} - -GPUDecoder::~GPUDecoder() { - at::cuda::CUDAGuard device_guard(device); - decoder.release(); - if (initialised) { - check_for_cuda_errors( - cuDevicePrimaryCtxRelease(device), __LINE__, __FILE__); - } -} - -/* Fetch a decoded frame tensor after demuxing and decoding. - */ -torch::Tensor GPUDecoder::decode() { - torch::Tensor frameTensor; - unsigned long videoBytes = 0; - uint8_t* video = nullptr; - at::cuda::CUDAGuard device_guard(device); - torch::Tensor frame; - do { - demuxer.demux(&video, &videoBytes); - decoder.decode(video, videoBytes); - frame = decoder.fetch_frame(); - } while (frame.numel() == 0 && videoBytes > 0); - return frame; -} - -/* Seek to a passed timestamp. The second argument controls whether to seek to a - * keyframe. 
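- * With AVSEEK_FLAG_ANY the demuxer may land on a non-key frame, which NVDEC
- * can generally only reconstruct once the next keyframe has been decoded.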
- */ -void GPUDecoder::seek(double timestamp, bool keyframes_only) { - int flag = keyframes_only ? 0 : AVSEEK_FLAG_ANY; - demuxer.seek(timestamp, flag); -} - -c10::Dict> GPUDecoder:: - get_metadata() const { - c10::Dict> metadata; - c10::Dict video_metadata; - video_metadata.insert("duration", demuxer.get_duration()); - video_metadata.insert("fps", demuxer.get_fps()); - metadata.insert("video", video_metadata); - return metadata; -} - -TORCH_LIBRARY(torchvision, m) { - m.class_("GPUDecoder") - .def(torch::init()) - .def("seek", &GPUDecoder::seek) - .def("get_metadata", &GPUDecoder::get_metadata) - .def("next", &GPUDecoder::decode); -} diff --git a/torchvision/csrc/io/decoder/gpu/gpu_decoder.h b/torchvision/csrc/io/decoder/gpu/gpu_decoder.h deleted file mode 100644 index 22bf680a982..00000000000 --- a/torchvision/csrc/io/decoder/gpu/gpu_decoder.h +++ /dev/null @@ -1,20 +0,0 @@ -#include -#include -#include "decoder.h" -#include "demuxer.h" - -class GPUDecoder : public torch::CustomClassHolder { - public: - GPUDecoder(std::string, torch::Device); - ~GPUDecoder(); - torch::Tensor decode(); - void seek(double, bool); - c10::Dict> get_metadata() const; - - private: - Demuxer demuxer; - CUcontext ctx; - Decoder decoder; - int64_t device; - bool initialised = false; -}; diff --git a/torchvision/csrc/io/decoder/memory_buffer.cpp b/torchvision/csrc/io/decoder/memory_buffer.cpp deleted file mode 100644 index 4e420c3b3cd..00000000000 --- a/torchvision/csrc/io/decoder/memory_buffer.cpp +++ /dev/null @@ -1,71 +0,0 @@ -#include "memory_buffer.h" -#include - -namespace ffmpeg { - -MemoryBuffer::MemoryBuffer(const uint8_t* buffer, size_t size) - : buffer_(buffer), len_(size) {} - -int MemoryBuffer::read(uint8_t* buf, int size) { - if (pos_ < len_) { - auto available = std::min(int(len_ - pos_), size); - memcpy(buf, buffer_ + pos_, available); - pos_ += available; - return available; - } - - return 0; -} - -int64_t MemoryBuffer::seek(int64_t offset, int whence) { - if (whence & AVSEEK_SIZE) { - return len_; - } - - // remove force flag - whence &= ~AVSEEK_FORCE; - - switch (whence) { - case SEEK_SET: - if (offset >= 0 && offset <= len_) { - pos_ = offset; - } - break; - case SEEK_END: - if (len_ + offset >= 0 && len_ + offset <= len_) { - pos_ = len_ + offset; - } - break; - case SEEK_CUR: - if (pos_ + offset > 0 && pos_ + offset <= len_) { - pos_ += offset; - } - break; - default: - LOG(ERROR) << "Unknown whence flag gets provided: " << whence; - } - return pos_; -} - -/* static */ -DecoderInCallback MemoryBuffer::getCallback( - const uint8_t* buffer, - size_t size) { - MemoryBuffer object(buffer, size); - return - [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable - -> int { - if (out) { // see defs.h file - // read mode - return object.read(out, size); - } - // seek mode - if (!timeoutMs) { - // seek capability, yes - supported - return 0; - } - return object.seek(size, whence); - }; -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/memory_buffer.h b/torchvision/csrc/io/decoder/memory_buffer.h deleted file mode 100644 index 909626d3cae..00000000000 --- a/torchvision/csrc/io/decoder/memory_buffer.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * Class uses external memory buffer and implements a seekable interface. 
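- *
- * A minimal usage sketch, mirroring sync_decoder_test.cpp:
- *
- *   SyncDecoder decoder;
- *   std::vector<DecoderMetadata> metadata;
- *   decoder.init(
- *       params,
- *       MemoryBuffer::getCallback(buffer.data(), buffer.size()),
- *       &metadata);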
- */
-class MemoryBuffer {
- public:
-  explicit MemoryBuffer(const uint8_t* buffer, size_t size);
-  int64_t seek(int64_t offset, int whence);
-  int read(uint8_t* buf, int size);
-
-  // static constructor for decoder callback.
-  static DecoderInCallback getCallback(const uint8_t* buffer, size_t size);
-
- private:
-  const uint8_t* buffer_; // set at construction time
-  long pos_{0}; // current position
-  long len_{0}; // bytes in buffer
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/seekable_buffer.cpp b/torchvision/csrc/io/decoder/seekable_buffer.cpp
deleted file mode 100644
index 41e3e689c7b..00000000000
--- a/torchvision/csrc/io/decoder/seekable_buffer.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-#include "seekable_buffer.h"
-#include
-#include
-#include "memory_buffer.h"
-
-namespace ffmpeg {
-
-int SeekableBuffer::init(
-    DecoderInCallback&& in,
-    uint64_t timeoutMs,
-    size_t maxSeekableBytes,
-    ImageType* type) {
-  shutdown();
-  isSeekable_ = in(nullptr, 0, 0, 0) == 0;
-  if (isSeekable_) { // seekable
-    if (type) {
-      if (!readBytes(in, 8, timeoutMs)) {
-        return -1;
-      }
-      setImageType(type);
-      end_ = 0;
-      eof_ = false;
-      std::vector<uint8_t>().swap(buffer_);
-      // reset callback
-      if (in(nullptr, 0, SEEK_SET, timeoutMs)) {
-        return -1;
-      }
-    }
-    inCallback_ = std::forward<DecoderInCallback>(in);
-    return 1;
-  }
-
-  if (!readBytes(in, maxSeekableBytes + (type ? 8 : 0), timeoutMs)) {
-    return -1;
-  }
-
-  if (type) {
-    setImageType(type);
-  }
-
-  if (eof_) {
-    end_ = 0;
-    eof_ = false;
-    // reuse MemoryBuffer functionality
-    inCallback_ = MemoryBuffer::getCallback(buffer_.data(), buffer_.size());
-    isSeekable_ = true;
-    return 1;
-  }
-  inCallback_ = std::forward<DecoderInCallback>(in);
-  return 0;
-}
-
-bool SeekableBuffer::readBytes(
-    DecoderInCallback& in,
-    size_t maxBytes,
-    uint64_t timeoutMs) {
-  // Resize to the minimum 4K page or less
-  buffer_.resize(std::min(maxBytes, size_t(4 * 1024UL)));
-  end_ = 0;
-  eof_ = false;
-
-  auto end =
-      std::chrono::steady_clock::now() + std::chrono::milliseconds(timeoutMs);
-  auto watcher = [end]() -> bool {
-    return std::chrono::steady_clock::now() <= end;
-  };
-
-  bool hasTime = true;
-  while (!eof_ && end_ < maxBytes && (hasTime = watcher())) {
-    // let's read all bytes into the available buffer
-    auto res = in(buffer_.data() + end_, buffer_.size() - end_, 0, timeoutMs);
-    if (res > 0) {
-      end_ += res;
-      if (end_ == buffer_.size()) {
-        buffer_.resize(std::min(size_t(end_ * 4UL), maxBytes));
-      }
-    } else if (res == 0) {
-      eof_ = true;
-    } else {
-      // error
-      return false;
-    }
-  }
-
-  buffer_.resize(end_);
-
-  return hasTime;
-}
-
-void SeekableBuffer::setImageType(ImageType* type) {
-  if (buffer_.size() > 2 && buffer_[0] == 0xFF && buffer_[1] == 0xD8 &&
-      buffer_[2] == 0xFF) {
-    *type = ImageType::JPEG;
-  } else if (
-      buffer_.size() > 3 && buffer_[1] == 'P' && buffer_[2] == 'N' &&
-      buffer_[3] == 'G') {
-    *type = ImageType::PNG;
-  } else if (
-      buffer_.size() > 1 &&
-      ((buffer_[0] == 0x49 && buffer_[1] == 0x49) ||
-       (buffer_[0] == 0x4D && buffer_[1] == 0x4D))) {
-    *type = ImageType::TIFF;
-  } else {
-    *type = ImageType::UNKNOWN;
-  }
-}
-
-int SeekableBuffer::read(uint8_t* buf, int size, uint64_t timeoutMs) {
-  if (isSeekable_) {
-    return inCallback_(buf, size, 0, timeoutMs);
-  }
-  if (pos_ < end_) {
-    // read cached bytes for non-seekable callback
-    auto available = std::min(int(end_ - pos_), size);
-    memcpy(buf, buffer_.data() + pos_, available);
-    pos_ += available;
-    return available;
-  } else if (!eof_) {
-    // normal sequential read (see defs.h file), i.e. @buf != null
-    auto res = inCallback_(buf, size, 0, timeoutMs); // read through
-    eof_ = res == 0;
-    return res;
-  } else {
-    return 0;
-  }
-}
-
-int64_t SeekableBuffer::seek(int64_t offset, int whence, uint64_t timeoutMs) {
-  return inCallback_(nullptr, offset, whence, timeoutMs);
-}
-
-void SeekableBuffer::shutdown() {
-  pos_ = end_ = 0;
-  eof_ = false;
-  std::vector<uint8_t>().swap(buffer_);
-  inCallback_ = nullptr;
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/seekable_buffer.h b/torchvision/csrc/io/decoder/seekable_buffer.h
deleted file mode 100644
index 9d5729f5306..00000000000
--- a/torchvision/csrc/io/decoder/seekable_buffer.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#pragma once
-
-#include "defs.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses an internal buffer to store the initial bytes as a seekable
- * cache from the media provider, and lets ffmpeg seek and read bytes from the
- * cache and beyond, reading bytes directly from the media provider
- */
-enum class ImageType {
-  UNKNOWN = 0,
-  JPEG = 1,
-  PNG = 2,
-  TIFF = 3,
-};
-
-class SeekableBuffer {
- public:
-  // @type is optional; pass a non-null pointer only if image detection is
-  // required
-  // \returns 1 if the buffer is seekable, 0 if not seekable, < 0 on error
-  int init(
-      DecoderInCallback&& in,
-      uint64_t timeoutMs,
-      size_t maxSeekableBytes,
-      ImageType* type);
-  int read(uint8_t* buf, int size, uint64_t timeoutMs);
-  int64_t seek(int64_t offset, int whence, uint64_t timeoutMs);
-  void shutdown();
-
- private:
-  bool readBytes(DecoderInCallback& in, size_t maxBytes, uint64_t timeoutMs);
-  void setImageType(ImageType* type);
-
- private:
-  DecoderInCallback inCallback_;
-  std::vector<uint8_t> buffer_; // resized at init time
-  long pos_{0}; // current position (SEEK_CUR iff pos_ < end_)
-  long end_{0}; // current buffer size
-  bool eof_{0}; // indicates the EOF
-  bool isSeekable_{false}; // is callback seekable
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/stream.cpp b/torchvision/csrc/io/decoder/stream.cpp
deleted file mode 100644
index 7969741e72c..00000000000
--- a/torchvision/csrc/io/decoder/stream.cpp
+++ /dev/null
@@ -1,288 +0,0 @@
-#include "stream.h"
-#include
-#include
-#include "util.h"
-
-namespace ffmpeg {
-const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE};
-
-Stream::Stream(
-    AVFormatContext* inputCtx,
-    MediaFormat format,
-    bool convertPtsToWallTime,
-    int64_t loggingUuid)
-    : inputCtx_(inputCtx),
-      format_(format),
-      convertPtsToWallTime_(convertPtsToWallTime),
-      loggingUuid_(loggingUuid) {}
-
-Stream::~Stream() {
-  if (frame_) {
-    av_free(frame_);
-  }
-  if (codecCtx_) {
-    avcodec_free_context(&codecCtx_);
-  }
-}
-
-// look up the proper codec by querying avcodec_find_decoder
-AVCodec* Stream::findCodec(AVCodecParameters* params) {
-  return (AVCodec*)avcodec_find_decoder(params->codec_id);
-}
-
-// Allocate memory for the AVCodecContext, which will hold the context for
-// the decode/encode process. Then fill this codec context with CODEC
-// parameters defined in stream parameters.
Open the codec, and allocate the global frame -// defined in the header file -int Stream::openCodec(std::vector* metadata, int num_threads) { - AVStream* steam = inputCtx_->streams[format_.stream]; - - AVCodec* codec = findCodec(steam->codecpar); - if (!codec) { - LOG(ERROR) << "LoggingUuid #" << loggingUuid_ - << ", avcodec_find_decoder failed for codec_id: " - << int(steam->codecpar->codec_id); - return AVERROR(EINVAL); - } - - if (!(codecCtx_ = avcodec_alloc_context3(codec))) { - LOG(ERROR) << "LoggingUuid #" << loggingUuid_ - << ", avcodec_alloc_context3 failed"; - return AVERROR(ENOMEM); - } - // multithreading heuristics - // if user defined, - if (num_threads > max_threads) { - num_threads = max_threads; - } - - if (num_threads > 0) { - // if user defined, respect that - // note that default thread_type will be used - codecCtx_->thread_count = num_threads; - } else { - // otherwise set sensible defaults - codecCtx_->thread_count = 8; - codecCtx_->thread_type = FF_THREAD_SLICE; - } - - int ret; - // Copy codec parameters from input stream to output codec context - if ((ret = avcodec_parameters_to_context(codecCtx_, steam->codecpar)) < 0) { - LOG(ERROR) << "LoggingUuid #" << loggingUuid_ - << ", avcodec_parameters_to_context failed"; - return ret; - } - - // after avcodec_open2, value of codecCtx_->time_base is NOT meaningful - if ((ret = avcodec_open2(codecCtx_, codec, nullptr)) < 0) { - LOG(ERROR) << "LoggingUuid #" << loggingUuid_ - << ", avcodec_open2 failed: " << Util::generateErrorDesc(ret); - avcodec_free_context(&codecCtx_); - codecCtx_ = nullptr; - return ret; - } - - frame_ = av_frame_alloc(); - - switch (format_.type) { - case TYPE_VIDEO: - fps_ = av_q2d(av_guess_frame_rate(inputCtx_, steam, nullptr)); - break; - case TYPE_AUDIO: - fps_ = codecCtx_->sample_rate; - break; - default: - fps_ = 30.0; - } - - if ((ret = initFormat())) { - LOG(ERROR) << "initFormat failed, type: " << format_.type; - } - - if (metadata) { - DecoderMetadata header; - header.format = format_; - header.fps = fps_; - header.num = steam->time_base.num; - header.den = steam->time_base.den; - header.duration = - av_rescale_q(steam->duration, steam->time_base, timeBaseQ); - metadata->push_back(header); - } - - return ret; -} - -// send the raw data packet (compressed frame) to the decoder, through the codec -// context and receive the raw data frame (uncompressed frame) from the -// decoder, through the same codec context -int Stream::analyzePacket(const AVPacket* packet, bool* gotFrame) { - int consumed = 0; - int result = avcodec_send_packet(codecCtx_, packet); - if (result == AVERROR(EAGAIN)) { - *gotFrame = false; // no bytes get consumed, fetch frame - } else if (result == AVERROR_EOF) { - *gotFrame = false; // more than one flush packet - if (packet) { - // got packet after flush, this is an error - return result; - } - } else if (result < 0) { - LOG(ERROR) << "avcodec_send_packet failed, err: " - << Util::generateErrorDesc(result); - return result; // error - } else { - consumed = packet ? 
packet->size : 0; // all bytes get consumed
-  }
-
-  result = avcodec_receive_frame(codecCtx_, frame_);
-
-  if (result >= 0) {
-    *gotFrame = true; // frame is available
-  } else if (result == AVERROR(EAGAIN)) {
-    *gotFrame = false; // no frames at this time, needs more packets
-    if (!consumed) {
-      // precaution, if no packets got consumed and no frames are available
-      return result;
-    }
-  } else if (result == AVERROR_EOF) {
-    *gotFrame = false; // the last frame has been flushed
-    // precaution, if no more frames are available assume we consumed all bytes
-    consumed = 0;
-  } else { // error
-    LOG(ERROR) << "avcodec_receive_frame failed, err: "
-               << Util::generateErrorDesc(result);
-    return result;
-  }
-  return consumed;
-}
-
-// General decoding function:
-// given the packet, analyse the metadata, and write the
-// metadata and the buffer to the DecoderOutputMessage.
-int Stream::decodePacket(
-    const AVPacket* packet,
-    DecoderOutputMessage* out,
-    bool headerOnly,
-    bool* hasMsg) {
-  int consumed;
-  bool gotFrame = false;
-  *hasMsg = false;
-  if ((consumed = analyzePacket(packet, &gotFrame)) >= 0 &&
-      (packet == nullptr || gotFrame)) {
-    int result;
-    if ((result = getMessage(out, !gotFrame, headerOnly)) < 0) {
-      return result; // report error
-    }
-    *hasMsg = result > 0;
-  }
-  return consumed;
-}
-
-int Stream::flush(DecoderOutputMessage* out, bool headerOnly) {
-  bool hasMsg = false;
-  int result = decodePacket(nullptr, out, headerOnly, &hasMsg);
-  if (result < 0) {
-    avcodec_flush_buffers(codecCtx_);
-    return result;
-  }
-  if (!hasMsg) {
-    avcodec_flush_buffers(codecCtx_);
-    return 0;
-  }
-  return 1;
-}
-
-// Sets the header and payload via Stream::setHeader and copyFrameBytes,
-// functions that are defined in the typed Stream subclasses (VideoStream,
-// AudioStream, ...)
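-// Returns a positive value when a message was produced, 0 when there is
-// nothing to emit, and < 0 on error. On flush, audio is drained chunk by
-// chunk until the sampler runs dry; other media types emit at most one
-// message per call.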
-int Stream::getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly) {
-  if (flush) {
-    // only a flush of audio frames makes sense
-    if (format_.type == TYPE_AUDIO) {
-      int processed = 0;
-      size_t total = 0;
-      // grab all audio bytes by chunks
-      do {
-        if ((processed = copyFrameBytes(out->payload.get(), flush)) < 0) {
-          return processed;
-        }
-        total += processed;
-      } while (processed);
-
-      if (total) {
-        // set header if message bytes are available
-        setHeader(&out->header, flush);
-        return 1;
-      }
-    }
-    return 0;
-  } else {
-    if (format_.type == TYPE_AUDIO) {
-      int processed = 0;
-      if ((processed = copyFrameBytes(out->payload.get(), flush)) < 0) {
-        return processed;
-      }
-      if (processed) {
-        // set header if message bytes are available
-        setHeader(&out->header, flush);
-        return 1;
-      }
-      return 0;
-    } else {
-      // set header
-      setHeader(&out->header, flush);
-
-      if (headerOnly) {
-        // Only the header is requested
-        return 1;
-      }
-
-      return copyFrameBytes(out->payload.get(), flush);
-    }
-  }
-}
-
-void Stream::setHeader(DecoderHeader* header, bool flush) {
-  header->seqno = numGenerator_++;
-
-  setFramePts(header, flush);
-
-  if (convertPtsToWallTime_) {
-    keeper_.adjust(header->pts);
-  }
-
-  header->format = format_;
-  header->keyFrame = 0;
-  header->fps = std::numeric_limits<double>::quiet_NaN();
-}
-
-void Stream::setFramePts(DecoderHeader* header, bool flush) {
-  if (flush) {
-    header->pts = nextPts_; // already in us
-  } else {
-    header->pts = frame_->best_effort_timestamp;
-    if (header->pts == AV_NOPTS_VALUE) {
-      header->pts = nextPts_;
-    } else {
-      header->pts = av_rescale_q(
-          header->pts,
-          inputCtx_->streams[format_.stream]->time_base,
-          timeBaseQ);
-    }
-
-    switch (format_.type) {
-      case TYPE_AUDIO:
-        nextPts_ = header->pts + frame_->nb_samples * AV_TIME_BASE / fps_;
-        break;
-      case TYPE_VIDEO:
-        nextPts_ = header->pts + AV_TIME_BASE / fps_;
-        break;
-      default:
-        nextPts_ = header->pts;
-    }
-  }
-}
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/stream.h b/torchvision/csrc/io/decoder/stream.h
deleted file mode 100644
index 6250dd9ecd2..00000000000
--- a/torchvision/csrc/io/decoder/stream.h
+++ /dev/null
@@ -1,80 +0,0 @@
-#pragma once
-
-#include
-#include "defs.h"
-#include "time_keeper.h"
-
-namespace ffmpeg {
-
-/**
- * Class uses FFMPEG library to decode one media stream (audio or video).
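- * openCodec() accepts a num_threads hint: positive values are clamped to
- * max_threads (12) and used as the codec thread_count, while 0 falls back to
- * the built-in default of 8 slice threads (see the heuristics in stream.cpp).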
- */ - -class Stream { - public: - Stream( - AVFormatContext* inputCtx, - MediaFormat format, - bool convertPtsToWallTime, - int64_t loggingUuid); - virtual ~Stream(); - - // returns 0 - on success or negative error - // num_threads sets up the codec context for multithreading if needed - // default is set to single thread in order to not break BC - int openCodec(std::vector* metadata, int num_threads = 1); - // returns 1 - if packet got consumed, 0 - if it's not, and < 0 on error - int decodePacket( - const AVPacket* packet, - DecoderOutputMessage* out, - bool headerOnly, - bool* hasMsg); - // returns stream index - int getIndex() const { - return format_.stream; - } - // returns 1 - if message got a payload, 0 - if it's not, and < 0 on error - int flush(DecoderOutputMessage* out, bool headerOnly); - // return media format - MediaFormat getMediaFormat() const { - return format_; - } - - protected: - virtual int initFormat() = 0; - // returns number processed bytes from packet, or negative error - virtual int analyzePacket(const AVPacket* packet, bool* gotFrame); - // returns number processed bytes from packet, or negative error - virtual int copyFrameBytes(ByteStorage* out, bool flush) = 0; - // sets output format - virtual void setHeader(DecoderHeader* header, bool flush); - // set frame pts - virtual void setFramePts(DecoderHeader* header, bool flush); - // finds codec - virtual AVCodec* findCodec(AVCodecParameters* params); - - private: - // returns 1 - if message got a payload, 0 - if it's not, and < 0 on error - int getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly); - - protected: - AVFormatContext* const inputCtx_; - MediaFormat format_; - const bool convertPtsToWallTime_; - int64_t loggingUuid_; - - AVCodecContext* codecCtx_{nullptr}; - AVFrame* frame_{nullptr}; - - std::atomic numGenerator_{0}; - TimeKeeper keeper_; - // estimated next frame pts for flushing the last frame - int64_t nextPts_{0}; - double fps_{30.}; - // this is a dumb conservative limit; ideally we'd use - // int max_threads = at::get_num_threads(); but this would cause - // fb sync to fail as it would add dependency to ATen to the decoder API - const int max_threads = 12; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/subtitle_sampler.cpp b/torchvision/csrc/io/decoder/subtitle_sampler.cpp deleted file mode 100644 index d0df24d3e35..00000000000 --- a/torchvision/csrc/io/decoder/subtitle_sampler.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include "subtitle_sampler.h" -#include -#include "util.h" - -namespace ffmpeg { - -SubtitleSampler::~SubtitleSampler() { - cleanUp(); -} - -void SubtitleSampler::shutdown() { - cleanUp(); -} - -bool SubtitleSampler::init(const SamplerParameters& params) { - cleanUp(); - // set formats - params_ = params; - return true; -} - -int SubtitleSampler::sample(AVSubtitle* sub, ByteStorage* out) { - if (!sub || !out) { - return 0; // flush - } - - out->ensure(Util::size(*sub)); - - return Util::serialize(*sub, out); -} - -int SubtitleSampler::sample(const ByteStorage* in, ByteStorage* out) { - if (in && out) { - // Get a writable copy - if (size_t len = in->length()) { - out->ensure(len); - memcpy(out->writableTail(), in->data(), len); - } - return out->length(); - } - return 0; -} - -void SubtitleSampler::cleanUp() {} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/subtitle_sampler.h b/torchvision/csrc/io/decoder/subtitle_sampler.h deleted file mode 100644 index 4aee811ed56..00000000000 --- a/torchvision/csrc/io/decoder/subtitle_sampler.h 
+++ /dev/null
@@ -1,32 +0,0 @@
-#pragma once
-
-#include "defs.h"
-
-namespace ffmpeg {
-
-/**
- * Class transcodes subtitle frames from one format into another
- */
-
-class SubtitleSampler : public MediaSampler {
- public:
-  SubtitleSampler() = default;
-  ~SubtitleSampler() override;
-
-  bool init(const SamplerParameters& params) override;
-  int sample(const ByteStorage* in, ByteStorage* out) override;
-  void shutdown() override;
-
-  // returns the number of processed/scaled bytes
-  int sample(AVSubtitle* sub, ByteStorage* out);
-
-  // helper serialization/deserialization methods
-  static void serialize(const AVSubtitle& sub, ByteStorage* out);
-  static bool deserialize(const ByteStorage& buf, AVSubtitle* sub);
-
- private:
-  // close resources
-  void cleanUp();
-};
-
-} // namespace ffmpeg
diff --git a/torchvision/csrc/io/decoder/subtitle_stream.cpp b/torchvision/csrc/io/decoder/subtitle_stream.cpp
deleted file mode 100644
index 3416f702d7e..00000000000
--- a/torchvision/csrc/io/decoder/subtitle_stream.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-#include "subtitle_stream.h"
-#include
-#include "util.h"
-
-namespace ffmpeg {
-const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE};
-
-SubtitleStream::SubtitleStream(
-    AVFormatContext* inputCtx,
-    int index,
-    bool convertPtsToWallTime,
-    const SubtitleFormat& format)
-    : Stream(
-          inputCtx,
-          MediaFormat::makeMediaFormat(format, index),
-          convertPtsToWallTime,
-          0) {
-  memset(&sub_, 0, sizeof(sub_));
-}
-
-void SubtitleStream::releaseSubtitle() {
-  if (sub_.release) {
-    avsubtitle_free(&sub_);
-    memset(&sub_, 0, sizeof(sub_));
-  }
-}
-
-SubtitleStream::~SubtitleStream() {
-  releaseSubtitle();
-  sampler_.shutdown();
-}
-
-int SubtitleStream::initFormat() {
-  if (!codecCtx_->subtitle_header) {
-    LOG(ERROR) << "No subtitle header found";
-  } else {
-    VLOG(1) << "Subtitle header found!";
-  }
-  return 0;
-}
-
-int SubtitleStream::analyzePacket(const AVPacket* packet, bool* gotFrame) {
-  // clean-up
-  releaseSubtitle();
-
-  // FIXME: should this even be created?
-  AVPacket* avPacket;
-  avPacket = av_packet_alloc();
-  if (avPacket == nullptr) {
-    LOG(ERROR)
-        << "decoder was not able to allocate the subtitle-specific packet.";
-    // alternative to ENOMEM
-    return AVERROR_BUFFER_TOO_SMALL;
-  }
-  avPacket->data = nullptr;
-  avPacket->size = 0;
-  // check flush packet
-  auto pkt = packet ? packet : avPacket;
-
-  int gotFramePtr = 0;
-  // is there a better way than casting away const?
-  int result =
-      avcodec_decode_subtitle2(codecCtx_, &sub_, &gotFramePtr, (AVPacket*)pkt);
-
-  if (result < 0) {
-    LOG(ERROR) << "avcodec_decode_subtitle2 failed, err: "
-               << Util::generateErrorDesc(result);
-    // free the packet we've created
-    av_packet_free(&avPacket);
-    return result;
-  } else if (result == 0) {
-    result = pkt->size; // discard the rest of the packet
-  }
-
-  sub_.release = gotFramePtr;
-  *gotFrame = gotFramePtr > 0;
-
-  // set proper pts in us
-  if (gotFramePtr) {
-    sub_.pts = av_rescale_q(
-        pkt->pts, inputCtx_->streams[format_.stream]->time_base, timeBaseQ);
-  }
-
-  av_packet_free(&avPacket);
-  return result;
-}
-
-int SubtitleStream::copyFrameBytes(ByteStorage* out, bool flush) {
-  return sampler_.sample(flush ?
nullptr : &sub_, out); -} - -void SubtitleStream::setFramePts(DecoderHeader* header, bool) { - header->pts = sub_.pts; // already in us -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/subtitle_stream.h b/torchvision/csrc/io/decoder/subtitle_stream.h deleted file mode 100644 index 6c366e11f50..00000000000 --- a/torchvision/csrc/io/decoder/subtitle_stream.h +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once - -#include "stream.h" -#include "subtitle_sampler.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode one subtitle stream. - */ -struct AVSubtitleKeeper : AVSubtitle { - int64_t release{0}; -}; - -class SubtitleStream : public Stream { - public: - SubtitleStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const SubtitleFormat& format); - ~SubtitleStream() override; - - protected: - void setFramePts(DecoderHeader* header, bool flush) override; - - private: - int initFormat() override; - int analyzePacket(const AVPacket* packet, bool* gotFrame) override; - int copyFrameBytes(ByteStorage* out, bool flush) override; - void releaseSubtitle(); - - private: - SubtitleSampler sampler_; - AVSubtitleKeeper sub_; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/sync_decoder.cpp b/torchvision/csrc/io/decoder/sync_decoder.cpp deleted file mode 100644 index 1f03ef8eb95..00000000000 --- a/torchvision/csrc/io/decoder/sync_decoder.cpp +++ /dev/null @@ -1,97 +0,0 @@ -#include "sync_decoder.h" -#include - -namespace ffmpeg { - -SyncDecoder::AVByteStorage::AVByteStorage(size_t n) { - ensure(n); -} - -SyncDecoder::AVByteStorage::~AVByteStorage() { - av_free(buffer_); -} - -void SyncDecoder::AVByteStorage::ensure(size_t n) { - if (tail() < n) { - capacity_ = offset_ + length_ + n; - buffer_ = static_cast(av_realloc(buffer_, capacity_)); - } -} - -uint8_t* SyncDecoder::AVByteStorage::writableTail() { - TORCH_CHECK_LE(offset_ + length_, capacity_); - return buffer_ + offset_ + length_; -} - -void SyncDecoder::AVByteStorage::append(size_t n) { - TORCH_CHECK_LE(n, tail()); - length_ += n; -} - -void SyncDecoder::AVByteStorage::trim(size_t n) { - TORCH_CHECK_LE(n, length_); - offset_ += n; - length_ -= n; -} - -const uint8_t* SyncDecoder::AVByteStorage::data() const { - return buffer_ + offset_; -} - -size_t SyncDecoder::AVByteStorage::length() const { - return length_; -} - -size_t SyncDecoder::AVByteStorage::tail() const { - TORCH_CHECK_LE(offset_ + length_, capacity_); - return capacity_ - offset_ - length_; -} - -void SyncDecoder::AVByteStorage::clear() { - offset_ = 0; - length_ = 0; -} - -std::unique_ptr SyncDecoder::createByteStorage(size_t n) { - return std::make_unique(n); -} - -void SyncDecoder::onInit() { - eof_ = false; - queue_.clear(); -} - -int SyncDecoder::decode(DecoderOutputMessage* out, uint64_t timeoutMs) { - if (eof_ && queue_.empty()) { - return ENODATA; - } - - if (queue_.empty()) { - int result = getFrame(timeoutMs); - // assign EOF - eof_ = result == ENODATA; - // check unrecoverable error, any error but ENODATA - if (result && result != ENODATA) { - return result; - } - - // still empty - if (queue_.empty()) { - if (eof_) { - return ENODATA; - } else { - LOG(INFO) << "Queue is empty"; - return ETIMEDOUT; - } - } - } - - *out = std::move(queue_.front()); - queue_.pop_front(); - return 0; -} - -void SyncDecoder::push(DecoderOutputMessage&& buffer) { - queue_.push_back(std::move(buffer)); -} -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/sync_decoder.h 
b/torchvision/csrc/io/decoder/sync_decoder.h deleted file mode 100644 index b7cf7b625ac..00000000000 --- a/torchvision/csrc/io/decoder/sync_decoder.h +++ /dev/null @@ -1,48 +0,0 @@ -#pragma once - -#include -#include "decoder.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode media streams. - * Media bytes can be explicitly provided through read-callback - * or fetched internally by FFMPEG library - */ -class SyncDecoder : public Decoder { - public: - // Allocation of memory must be done with a proper alignment. - class AVByteStorage : public ByteStorage { - public: - explicit AVByteStorage(size_t n); - ~AVByteStorage() override; - void ensure(size_t n) override; - uint8_t* writableTail() override; - void append(size_t n) override; - void trim(size_t n) override; - const uint8_t* data() const override; - size_t length() const override; - size_t tail() const override; - void clear() override; - - private: - size_t offset_{0}; - size_t length_{0}; - size_t capacity_{0}; - uint8_t* buffer_{nullptr}; - }; - - public: - int decode(DecoderOutputMessage* out, uint64_t timeoutMs) override; - - private: - void push(DecoderOutputMessage&& buffer) override; - void onInit() override; - std::unique_ptr createByteStorage(size_t n) override; - - private: - std::list queue_; - bool eof_{false}; -}; -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/sync_decoder_test.cpp b/torchvision/csrc/io/decoder/sync_decoder_test.cpp deleted file mode 100644 index 085966ce687..00000000000 --- a/torchvision/csrc/io/decoder/sync_decoder_test.cpp +++ /dev/null @@ -1,416 +0,0 @@ -#include -#include -#include -#include "memory_buffer.h" -#include "sync_decoder.h" -#include "util.h" - -using namespace ffmpeg; - -namespace { -struct VideoFileStats { - std::string name; - size_t durationPts{0}; - int num{0}; - int den{0}; - int fps{0}; -}; - -void gotAllTestFiles( - const std::string& folder, - std::vector* stats) { - DIR* d = opendir(folder.c_str()); - CHECK(d); - struct dirent* dir; - while ((dir = readdir(d))) { - if (dir->d_type != DT_DIR && 0 != strcmp(dir->d_name, "README")) { - VideoFileStats item; - item.name = folder + '/' + dir->d_name; - LOG(INFO) << "Found video file: " << item.name; - stats->push_back(std::move(item)); - } - } - closedir(d); -} - -void gotFilesStats(std::vector& stats) { - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.formats = {MediaFormat(0)}; - params.headerOnly = true; - params.preventStaleness = false; - size_t avgProvUs = 0; - const size_t rounds = 100; - for (auto& item : stats) { - LOG(INFO) << "Decoding video file in memory: " << item.name; - FILE* f = fopen(item.name.c_str(), "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - - for (size_t i = 0; i < rounds; ++i) { - SyncDecoder decoder; - std::vector metadata; - const auto now = std::chrono::steady_clock::now(); - CHECK(decoder.init( - params, - MemoryBuffer::getCallback(buffer.data(), buffer.size()), - &metadata)); - const auto then = std::chrono::steady_clock::now(); - decoder.shutdown(); - avgProvUs += - std::chrono::duration_cast(then - now) - .count(); - TORCH_CHECK_EQ(metadata.size(), 1); - item.num = metadata[0].num; - item.den = metadata[0].den; - item.fps = metadata[0].fps; - item.durationPts = - av_rescale_q(metadata[0].duration, AV_TIME_BASE_Q, {1, 
item.fps}); - } - } - LOG(INFO) << "Probing (us) " << avgProvUs / stats.size() / rounds; -} - -size_t measurePerformanceUs( - const std::vector<VideoFileStats>& stats, - size_t rounds, - size_t num, - size_t stride) { - size_t avgClipDecodingUs = 0; - std::srand(time(nullptr)); - for (const auto& item : stats) { - FILE* f = fopen(item.name.c_str(), "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector<uint8_t> buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - - for (size_t i = 0; i < rounds; ++i) { - // randomly select a clip - size_t rOffset = std::rand(); - size_t fOffset = rOffset % item.durationPts; - size_t clipFrames = num + (num - 1) * stride; - if (fOffset + clipFrames > item.durationPts) { - fOffset = item.durationPts - clipFrames; - } - - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.preventStaleness = false; - - for (size_t n = 0; n < num; ++n) { - std::list<DecoderOutputMessage> msgs; - - params.startOffset = - av_rescale_q(fOffset, {1, item.fps}, AV_TIME_BASE_Q); - params.endOffset = params.startOffset + 100; - - auto now = std::chrono::steady_clock::now(); - SyncDecoder decoder; - CHECK(decoder.init( - params, - MemoryBuffer::getCallback(buffer.data(), buffer.size()), - nullptr)); - DecoderOutputMessage out; - while (0 == decoder.decode(&out, params.timeoutMs)) { - msgs.push_back(std::move(out)); - } - - decoder.shutdown(); - - const auto then = std::chrono::steady_clock::now(); - - fOffset += 1 + stride; - - avgClipDecodingUs += - std::chrono::duration_cast<std::chrono::microseconds>(then - now) - .count(); - } - } - } - - return avgClipDecodingUs / rounds / num / stats.size(); -} - -void runDecoder(SyncDecoder& decoder) { - DecoderOutputMessage out; - size_t audioFrames = 0, videoFrames = 0, totalBytes = 0; - while (0 == decoder.decode(&out, 10000)) { - if (out.header.format.type == TYPE_AUDIO) { - ++audioFrames; - } else if (out.header.format.type == TYPE_VIDEO) { - ++videoFrames; - } else if (out.header.format.type == TYPE_SUBTITLE && out.payload) { - // deserialize - LOG(INFO) << "Deserializing subtitle"; - AVSubtitle sub; - memset(&sub, 0, sizeof(sub)); - EXPECT_TRUE(Util::deserialize(*out.payload, &sub)); - LOG(INFO) << "Found subtitles" << ", num rects: " << sub.num_rects; - for (int i = 0; i < sub.num_rects; ++i) { - std::string text = "picture"; - if (sub.rects[i]->type == SUBTITLE_TEXT) { - text = sub.rects[i]->text; - } else if (sub.rects[i]->type == SUBTITLE_ASS) { - text = sub.rects[i]->ass; - } - - LOG(INFO) << "Rect num: " << i << ", type:" << sub.rects[i]->type - << ", text: " << text; - } - - avsubtitle_free(&sub); - } - if (out.payload) { - totalBytes += out.payload->length(); - } - } - LOG(INFO) << "Decoded audio frames: " << audioFrames - << ", video frames: " << videoFrames - << ", total bytes: " << totalBytes; -} -} // namespace - -TEST(SyncDecoder, TestSyncDecoderPerformance) { - // Measure the average time of decoding per clip - // 1. list the videos in the testing directory - // 2. for each video get the number of frames with timestamps - // 3. randomly select a frame offset - // 4. adjust the offset for the number of frames and strides, - // if it's beyond the upper boundary - // 5. repeat multiple times, measuring and accumulating decoding time - // per clip (see the timing sketch below).
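The five steps above reduce to a standard wall-clock averaging loop. A minimal, self-contained sketch of that timing pattern, where decodeClip is a hypothetical stand-in for the SyncDecoder init/decode/shutdown sequence used in measurePerformanceUs above:

#include <chrono>
#include <cstdint>
#include <functional>

// Average the wall-clock cost of one clip decode over `rounds` attempts,
// mirroring the steady_clock/duration_cast pattern used in this test.
int64_t averageDecodeUs(const std::function<void()>& decodeClip, size_t rounds) {
  int64_t totalUs = 0;
  for (size_t i = 0; i < rounds; ++i) {
    const auto start = std::chrono::steady_clock::now();
    decodeClip();  // hypothetical: init the decoder, drain all frames, shut down
    const auto stop = std::chrono::steady_clock::now();
    totalUs +=
        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
            .count();
  }
  return rounds > 0 ? totalUs / static_cast<int64_t>(rounds) : 0;
}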
- /* - 1) 4 x 2 - 2) 8 x 8 - 3) 16 x 8 - 4) 32 x 4 - */ - const std::string kFolder = "pytorch/vision/test/assets/videos"; - std::vector stats; - gotAllTestFiles(kFolder, &stats); - gotFilesStats(stats); - - const size_t kRounds = 10; - - auto new4x2 = measurePerformanceUs(stats, kRounds, 4, 2); - auto new8x8 = measurePerformanceUs(stats, kRounds, 8, 8); - auto new16x8 = measurePerformanceUs(stats, kRounds, 16, 8); - auto new32x4 = measurePerformanceUs(stats, kRounds, 32, 4); - LOG(INFO) << "Clip decoding (us)" << ", new(4x2): " << new4x2 - << ", new(8x8): " << new8x8 << ", new(16x8): " << new16x8 - << ", new(32x4): " << new32x4; -} - -TEST(SyncDecoder, Test) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestSubtitles) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - params.uri = "vue/synergy/data/robotsub.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestHeadersOnly) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.headerOnly = true; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - - params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); - - params.uri = "pytorch/vision/test/assets/videos/SOX5yA1l24A.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); - - params.uri = "pytorch/vision/test/assets/videos/WUzgd7C1pWA.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestHeadersOnlyDownSampling) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.headerOnly = true; - MediaFormat format; - format.type = TYPE_AUDIO; - format.format.audio.samples = 8000; - params.formats.insert(format); - - format.type = TYPE_VIDEO; - format.format.video.width = 224; - format.format.video.height = 224; - params.formats.insert(format); - - params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); - - params.uri = "pytorch/vision/test/assets/videos/SOX5yA1l24A.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); - - params.uri = "pytorch/vision/test/assets/videos/WUzgd7C1pWA.mp4"; - CHECK(decoder.init(params, nullptr, nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestInitOnlyNoShutdown) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.seekAccuracy = 100000; - params.headerOnly = false; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4"; - std::vector metadata; - CHECK(decoder.init(params, nullptr, 
&metadata)); -} - -TEST(SyncDecoder, TestMemoryBuffer) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.endOffset = 9000000; - params.seekAccuracy = 10000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - - FILE* f = fopen( - "pytorch/vision/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi", - "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - CHECK(decoder.init( - params, - MemoryBuffer::getCallback(buffer.data(), buffer.size()), - nullptr)); - LOG(INFO) << "Decoding from memory bytes: " << buffer.size(); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestMemoryBufferNoSeekableWithFullRead) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.endOffset = 9000000; - params.seekAccuracy = 10000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - - FILE* f = fopen("pytorch/vision/test/assets/videos/R6llTwEh07w.mp4", "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - - params.maxSeekableBytes = buffer.size() + 1; - MemoryBuffer object(buffer.data(), buffer.size()); - CHECK(decoder.init( - params, - [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable - -> int { - if (out) { // see defs.h file - // read mode - return object.read(out, size); - } - // seek mode - if (!timeoutMs) { - // seek capability, yes - no - return -1; - } - return object.seek(size, whence); - }, - nullptr)); - runDecoder(decoder); - decoder.shutdown(); -} - -TEST(SyncDecoder, TestMemoryBufferNoSeekableWithPartialRead) { - SyncDecoder decoder; - DecoderParameters params; - params.timeoutMs = 10000; - params.startOffset = 1000000; - params.endOffset = 9000000; - params.seekAccuracy = 10000; - params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; - - FILE* f = fopen("pytorch/vision/test/assets/videos/R6llTwEh07w.mp4", "rb"); - CHECK(f != nullptr); - fseek(f, 0, SEEK_END); - std::vector buffer(ftell(f)); - rewind(f); - size_t s = fread(buffer.data(), 1, buffer.size(), f); - TORCH_CHECK_EQ(buffer.size(), s); - fclose(f); - - params.maxSeekableBytes = buffer.size() / 2; - MemoryBuffer object(buffer.data(), buffer.size()); - CHECK(!decoder.init( - params, - [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable - -> int { - if (out) { // see defs.h file - // read mode - return object.read(out, size); - } - // seek mode - if (!timeoutMs) { - // seek capability, yes - no - return -1; - } - return object.seek(size, whence); - }, - nullptr)); -} diff --git a/torchvision/csrc/io/decoder/time_keeper.cpp b/torchvision/csrc/io/decoder/time_keeper.cpp deleted file mode 100644 index 845c76cddc8..00000000000 --- a/torchvision/csrc/io/decoder/time_keeper.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include "time_keeper.h" -#include "defs.h" - -namespace ffmpeg { - -namespace { -const long kMaxTimeBaseDiference = 10; -} - -long TimeKeeper::adjust(long& decoderTimestamp) { - const long now = std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()) - .count(); - - if (startTime_ == 0) { - startTime_ = now; - } - if (streamTimestamp_ == 0) { - streamTimestamp_ 
= decoderTimestamp; - } - - const auto runOut = startTime_ + decoderTimestamp - streamTimestamp_; - - if (std::labs((now - runOut) / AV_TIME_BASE) > kMaxTimeBaseDiference) { - streamTimestamp_ = startTime_ - now + decoderTimestamp; - } - - const auto sleepAdvised = runOut - now; - - decoderTimestamp += startTime_ - streamTimestamp_; - - return sleepAdvised > 0 ? sleepAdvised : 0; -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/time_keeper.h b/torchvision/csrc/io/decoder/time_keeper.h deleted file mode 100644 index e4d4718c705..00000000000 --- a/torchvision/csrc/io/decoder/time_keeper.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include -#include - -namespace ffmpeg { - -/** - * Class keeps track of the decoded timestamps (us) for media streams. - */ - -class TimeKeeper { - public: - TimeKeeper() = default; - - // adjusts the provided @timestamp to the corrected value - // returns the advised sleep time before next frame processing in (us) - long adjust(long& decoderTimestamp); - - private: - long startTime_{0}; - long streamTimestamp_{0}; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/util.cpp b/torchvision/csrc/io/decoder/util.cpp deleted file mode 100644 index 7198d2174ed..00000000000 --- a/torchvision/csrc/io/decoder/util.cpp +++ /dev/null @@ -1,401 +0,0 @@ -#include "util.h" -#include - -namespace ffmpeg { - -namespace Serializer { - -// fixed size types -template <typename T> -inline size_t getSize(const T& x) { - return sizeof(x); -} - -template <typename T> -inline bool serializeItem( - uint8_t* dest, - size_t len, - size_t& pos, - const T& src) { - VLOG(6) << "Generic serializeItem"; - const auto required = sizeof(src); - if (len < pos + required) { - return false; - } - memcpy(dest + pos, &src, required); - pos += required; - return true; -} - -template <typename T> -inline bool deserializeItem( - const uint8_t* src, - size_t len, - size_t& pos, - T& dest) { - const auto required = sizeof(dest); - if (len < pos + required) { - return false; - } - memcpy(&dest, src + pos, required); - pos += required; - return true; -} - -// AVSubtitleRect specialization -inline size_t getSize(const AVSubtitleRect& x) { - auto rectBytes = [](const AVSubtitleRect& y) -> size_t { - size_t s = 0; - switch (y.type) { - case SUBTITLE_BITMAP: - for (int i = 0; i < y.nb_colors; ++i) { - s += sizeof(y.linesize[i]); - s += y.linesize[i]; - } - break; - case SUBTITLE_TEXT: - s += sizeof(size_t); - s += strlen(y.text); - break; - case SUBTITLE_ASS: - s += sizeof(size_t); - s += strlen(y.ass); - break; - default: - break; - } - return s; - }; - return getSize(x.x) + getSize(x.y) + getSize(x.w) + getSize(x.h) + - getSize(x.nb_colors) + getSize(x.type) + getSize(x.flags) + rectBytes(x); -} - -// AVSubtitle specialization -inline size_t getSize(const AVSubtitle& x) { - auto rectBytes = [](const AVSubtitle& y) -> size_t { - size_t s = getSize(y.num_rects); - for (unsigned i = 0; i < y.num_rects; ++i) { - s += getSize(*y.rects[i]); - } - return s; - }; - return getSize(x.format) + getSize(x.start_display_time) + - getSize(x.end_display_time) + getSize(x.pts) + rectBytes(x); -} - -inline bool serializeItem( - uint8_t* dest, - size_t len, - size_t& pos, - const AVSubtitleRect& src) { - auto rectSerialize = - [](uint8_t* d, size_t l, size_t& p, const AVSubtitleRect& x) -> size_t { - switch (x.type) { - case SUBTITLE_BITMAP: - for (int i = 0; i < x.nb_colors; ++i) { - if (!serializeItem(d, l, p, x.linesize[i])) { - return false; - } - if (p + x.linesize[i] > l) { - return false; - } - memcpy(d + p, x.data[i],
x.linesize[i]); - p += x.linesize[i]; - } - return true; - case SUBTITLE_TEXT: { - const size_t s = strlen(x.text); - if (!serializeItem(d, l, p, s)) { - return false; - } - if (p + s > l) { - return false; - } - memcpy(d + p, x.text, s); - p += s; - return true; - } - case SUBTITLE_ASS: { - const size_t s = strlen(x.ass); - if (!serializeItem(d, l, p, s)) { - return false; - } - if (p + s > l) { - return false; - } - memcpy(d + p, x.ass, s); - p += s; - return true; - } - default: - return true; - } - }; - return serializeItem(dest, len, pos, src.x) && - serializeItem(dest, len, pos, src.y) && - serializeItem(dest, len, pos, src.w) && - serializeItem(dest, len, pos, src.h) && - serializeItem(dest, len, pos, src.nb_colors) && - serializeItem(dest, len, pos, src.type) && - serializeItem(dest, len, pos, src.flags) && - rectSerialize(dest, len, pos, src); -} - -inline bool serializeItem( - uint8_t* dest, - size_t len, - size_t& pos, - const AVSubtitle& src) { - auto rectSerialize = - [](uint8_t* d, size_t l, size_t& p, const AVSubtitle& x) -> bool { - bool res = serializeItem(d, l, p, x.num_rects); - for (unsigned i = 0; res && i < x.num_rects; ++i) { - res = serializeItem(d, l, p, *(x.rects[i])); - } - return res; - }; - VLOG(6) << "AVSubtitle serializeItem"; - return serializeItem(dest, len, pos, src.format) && - serializeItem(dest, len, pos, src.start_display_time) && - serializeItem(dest, len, pos, src.end_display_time) && - serializeItem(dest, len, pos, src.pts) && - rectSerialize(dest, len, pos, src); -} - -inline bool deserializeItem( - const uint8_t* src, - size_t len, - size_t& pos, - AVSubtitleRect& dest) { - auto rectDeserialize = - [](const uint8_t* y, size_t l, size_t& p, AVSubtitleRect& x) -> bool { - switch (x.type) { - case SUBTITLE_BITMAP: - for (int i = 0; i < x.nb_colors; ++i) { - if (!deserializeItem(y, l, p, x.linesize[i])) { - return false; - } - if (p + x.linesize[i] > l) { - return false; - } - x.data[i] = (uint8_t*)av_malloc(x.linesize[i]); - memcpy(x.data[i], y + p, x.linesize[i]); - p += x.linesize[i]; - } - return true; - case SUBTITLE_TEXT: { - size_t s = 0; - if (!deserializeItem(y, l, p, s)) { - return false; - } - if (p + s > l) { - return false; - } - x.text = (char*)av_malloc(s + 1); - memcpy(x.text, y + p, s); - x.text[s] = 0; - p += s; - return true; - } - case SUBTITLE_ASS: { - size_t s = 0; - if (!deserializeItem(y, l, p, s)) { - return false; - } - if (p + s > l) { - return false; - } - x.ass = (char*)av_malloc(s + 1); - memcpy(x.ass, y + p, s); - x.ass[s] = 0; - p += s; - return true; - } - default: - return true; - } - }; - - return deserializeItem(src, len, pos, dest.x) && - deserializeItem(src, len, pos, dest.y) && - deserializeItem(src, len, pos, dest.w) && - deserializeItem(src, len, pos, dest.h) && - deserializeItem(src, len, pos, dest.nb_colors) && - deserializeItem(src, len, pos, dest.type) && - deserializeItem(src, len, pos, dest.flags) && - rectDeserialize(src, len, pos, dest); -} - -inline bool deserializeItem( - const uint8_t* src, - size_t len, - size_t& pos, - AVSubtitle& dest) { - auto rectDeserialize = - [](const uint8_t* y, size_t l, size_t& p, AVSubtitle& x) -> bool { - bool res = deserializeItem(y, l, p, x.num_rects); - if (res && x.num_rects) { - x.rects = - (AVSubtitleRect**)av_malloc(x.num_rects * sizeof(AVSubtitleRect*)); - } - for (unsigned i = 0; res && i < x.num_rects; ++i) { - x.rects[i] = (AVSubtitleRect*)av_malloc(sizeof(AVSubtitleRect)); - memset(x.rects[i], 0, sizeof(AVSubtitleRect)); - res = deserializeItem(y, l, p, 
*x.rects[i]); - } - return res; - }; - return deserializeItem(src, len, pos, dest.format) && - deserializeItem(src, len, pos, dest.start_display_time) && - deserializeItem(src, len, pos, dest.end_display_time) && - deserializeItem(src, len, pos, dest.pts) && - rectDeserialize(src, len, pos, dest); -} -} // namespace Serializer - -namespace Util { -std::string generateErrorDesc(int errorCode) { - std::array buffer; - if (av_strerror(errorCode, buffer.data(), buffer.size()) < 0) { - return std::string("Unknown error code: ") + std::to_string(errorCode); - } - buffer.back() = 0; - return std::string(buffer.data()); -} - -size_t serialize(const AVSubtitle& sub, ByteStorage* out) { - const auto len = size(sub); - size_t pos = 0; - if (!Serializer::serializeItem(out->writableTail(), len, pos, sub)) { - return 0; - } - out->append(len); - return len; -} - -bool deserialize(const ByteStorage& buf, AVSubtitle* sub) { - size_t pos = 0; - return Serializer::deserializeItem(buf.data(), buf.length(), pos, *sub); -} - -size_t size(const AVSubtitle& sub) { - return Serializer::getSize(sub); -} - -bool validateVideoFormat(const VideoFormat& f) { - // clang-format off - /* - Valid parameters values for decoder - ____________________________________________________________________________________ - | W | H | minDimension | maxDimension | cropImage | algorithm | - |__________________________________________________________________________________| - | 0 | 0 | 0 | 0 | N/A | original | - |__________________________________________________________________________________| - | >0 | 0 | N/A | N/A | N/A | scale keeping W | - |__________________________________________________________________________________| - | 0 | >0 | N/A | N/A | N/A | scale keeping H | - |__________________________________________________________________________________| - | >0 | >0 | N/A | N/A | 0 | stretch/scale | - |__________________________________________________________________________________| - | >0 | >0 | N/A | N/A | >0 | scale/crop | - |__________________________________________________________________________________| - | 0 | 0 | >0 | 0 | N/A |scale to min dimension | - |__________________________________________________________________________________| - | 0 | 0 | 0 | >0 | N/A |scale to max dimension | - |__________________________________________________________________________________| - | 0 | 0 | >0 | >0 | N/A |stretch to min/max dimension| - |_____|_____|______________|______________|___________|____________________________| - - */ - // clang-format on - return (f.width == 0 && // #1, #6, #7 and #8 - f.height == 0 && f.cropImage == 0) || - (f.width != 0 && // #4 and #5 - f.height != 0 && f.minDimension == 0 && f.maxDimension == 0) || - (((f.width != 0 && // #2 - f.height == 0) || - (f.width == 0 && // #3 - f.height != 0)) && - f.minDimension == 0 && f.maxDimension == 0 && f.cropImage == 0); -} - -void setFormatDimensions( - size_t& destW, - size_t& destH, - size_t userW, - size_t userH, - size_t srcW, - size_t srcH, - size_t minDimension, - size_t maxDimension, - size_t cropImage) { - // rounding rules - // int -> double -> round up - // if fraction is >= 0.5 or round down if fraction is < 0.5 - // int result = double(value) + 0.5 - // here we rounding double to int according to the above rule - - // #1, #6, #7 and #8 - if (userW == 0 && userH == 0) { - if (minDimension > 0 && maxDimension == 0) { // #6 - if (srcW > srcH) { - // landscape - destH = minDimension; - destW = round(double(srcW * minDimension) / srcH); - } else { - 
// portrait - destW = minDimension; - destH = round(double(srcH * minDimension) / srcW); - } - } else if (minDimension == 0 && maxDimension > 0) { // #7 - if (srcW > srcH) { - // landscape - destW = maxDimension; - destH = round(double(srcH * maxDimension) / srcW); - } else { - // portrait - destH = maxDimension; - destW = round(double(srcW * maxDimension) / srcH); - } - } else if (minDimension > 0 && maxDimension > 0) { // #8 - if (srcW > srcH) { - // landscape - destW = maxDimension; - destH = minDimension; - } else { - // portrait - destW = minDimension; - destH = maxDimension; - } - } else { // #1 - destW = srcW; - destH = srcH; - } - } else if (userW != 0 && userH == 0) { // #2 - destW = userW; - destH = round(double(srcH * userW) / srcW); - } else if (userW == 0 && userH != 0) { // #3 - destW = round(double(srcW * userH) / srcH); - destH = userH; - } else { // userW != 0 && userH != 0 - if (cropImage == 0) { // #4 - destW = userW; - destH = userH; - } else { // #5 - double userSlope = double(userH) / userW; - double srcSlope = double(srcH) / srcW; - if (srcSlope < userSlope) { - destW = round(double(srcW * userH) / srcH); - destH = userH; - } else { - destW = userW; - destH = round(double(srcH * userW) / srcW); - } - } - } - // prevent zeros - destW = std::max(destW, size_t(1UL)); - destH = std::max(destH, size_t(1UL)); -} -} // namespace Util -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/util.h b/torchvision/csrc/io/decoder/util.h deleted file mode 100644 index 01b550e5bbc..00000000000 --- a/torchvision/csrc/io/decoder/util.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * FFMPEG library utility functions. - */ - -namespace Util { -std::string generateErrorDesc(int errorCode); -size_t serialize(const AVSubtitle& sub, ByteStorage* out); -bool deserialize(const ByteStorage& buf, AVSubtitle* sub); -size_t size(const AVSubtitle& sub); -void setFormatDimensions( - size_t& destW, - size_t& destH, - size_t userW, - size_t userH, - size_t srcW, - size_t srcH, - size_t minDimension, - size_t maxDimension, - size_t cropImage); -bool validateVideoFormat(const VideoFormat& format); -} // namespace Util -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/util_test.cpp b/torchvision/csrc/io/decoder/util_test.cpp deleted file mode 100644 index 0a093d9561b..00000000000 --- a/torchvision/csrc/io/decoder/util_test.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include -#include -#include "util.h" - -TEST(Util, TestSetFormatDimensions) { - // clang-format off - const size_t test_cases[][9] = { - // (userW, userH, srcW, srcH, minDimension, maxDimension, cropImage, destW, destH) - {0, 0, 172, 128, 0, 0, 0, 172, 128}, // #1 - {86, 0, 172, 128, 0, 0, 0, 86, 64}, // #2 - {64, 0, 128, 172, 0, 0, 0, 64, 86}, // #2 - {0, 32, 172, 128, 0, 0, 0, 43, 32}, // #3 - {32, 0, 128, 172, 0, 0, 0, 32, 43}, // #3 - {60, 50, 172, 128, 0, 0, 0, 60, 50}, // #4 - {50, 60, 128, 172, 0, 0, 0, 50, 60}, // #4 - {86, 40, 172, 128, 0, 0, 1, 86, 64}, // #5 - {86, 92, 172, 128, 0, 0, 1, 124, 92}, // #5 - {0, 0, 172, 128, 256, 0, 0, 344, 256}, // #6 - {0, 0, 128, 172, 256, 0, 0, 256, 344}, // #6 - {0, 0, 128, 172, 0, 344, 0, 256, 344}, // #7 - {0, 0, 172, 128, 0, 344, 0, 344, 256}, // #7 - {0, 0, 172, 128, 100, 344, 0, 344, 100},// #8 - {0, 0, 128, 172, 100, 344, 0, 100, 344} // #8 - }; - // clang-format on - - for (const auto& tc : test_cases) { - size_t destW = 0; - size_t destH = 0; - ffmpeg::Util::setFormatDimensions(destW, destH, tc[0], tc[1], tc[2], tc[3],
tc[4], tc[5], tc[6]); - CHECK(destW == tc[7]); - CHECK(destH == tc[8]); - } -} diff --git a/torchvision/csrc/io/decoder/video_sampler.cpp b/torchvision/csrc/io/decoder/video_sampler.cpp deleted file mode 100644 index 8b712609e34..00000000000 --- a/torchvision/csrc/io/decoder/video_sampler.cpp +++ /dev/null @@ -1,337 +0,0 @@ -#include "video_sampler.h" -#include -#include "util.h" - -// www.ffmpeg.org/doxygen/0.5/swscale-example_8c-source.html - -namespace ffmpeg { - -namespace { - -// Set up the data pointers and linesizes based on the specified image -// parameters and the provided array. This sets up "planes" to point to a -// "buffer" -// NOTE: this is most likely the culprit behind #3534 - // - // Args: - // fmt: desired output video format - // buffer: source constant image buffer (in different format) that will contain - // the final image after SWScale - // planes: destination data pointer to be filled - // lineSize: target destination linesize (always {0}) -int preparePlanes( - const VideoFormat& fmt, - const uint8_t* buffer, - uint8_t** planes, - int* lineSize) { - int result; - - // NOTE: 1 at the end of av_image_fill_arrays is the value used for alignment - if ((result = av_image_fill_arrays( - planes, - lineSize, - buffer, - (AVPixelFormat)fmt.format, - fmt.width, - fmt.height, - 1)) < 0) { - LOG(ERROR) << "av_image_fill_arrays failed, err: " - << Util::generateErrorDesc(result); - } - return result; -} - -// Scale (and crop) the image slice in srcSlice and put the resulting scaled -// slice into the `planes` buffer, which is mapped to `out` via preparePlanes as -// `sws_scale` cannot access buffers directly. -// -// Args: -// context: SWSContext allocated on line 119 (if crop, optional) or 163 (if -// scale) - // srcSlice: frame data in YUV420P - // srcStride: the array containing the - // strides for each plane of the source - // image (from AVFrame->linesize[0]) -// out: destination buffer -// planes: indirect destination buffer (mapped to "out" via preparePlanes) -// lines: destination linesize; constant {0} -int transformImage( - SwsContext* context, - const uint8_t* const srcSlice[], - int srcStride[], - VideoFormat inFormat, - VideoFormat outFormat, - uint8_t* out, - uint8_t* planes[], - int lines[]) { - int result; - if ((result = preparePlanes(outFormat, out, planes, lines)) < 0) { - return result; - } - if (context) { - // NOTE: the srcSliceY argument is always 0: the slice starts at the top of the image - if ((result = sws_scale( - context, srcSlice, srcStride, 0, inFormat.height, planes, lines)) < - 0) { - LOG(ERROR) << "sws_scale failed, err: " - << Util::generateErrorDesc(result); - return result; - } - } else if ( - inFormat.width == outFormat.width && - inFormat.height == outFormat.height && - inFormat.format == outFormat.format) { - // Copy planes without using sws_scale if sws_getContext failed.
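- // (av_image_copy is a plain per-plane copy; it is only valid on this path - // because the branch condition has just checked that input and output - // share the same width, height, and pixel format.)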
- av_image_copy( - planes, - lines, - (const uint8_t**)srcSlice, - srcStride, - (AVPixelFormat)inFormat.format, - inFormat.width, - inFormat.height); - } else { - LOG(ERROR) << "Invalid scale context format " << inFormat.format; - return AVERROR(EINVAL); - } - return 0; -} -} // namespace - -VideoSampler::VideoSampler(int swsFlags, int64_t loggingUuid) - : swsFlags_(swsFlags), loggingUuid_(loggingUuid) {} - -VideoSampler::~VideoSampler() { - cleanUp(); -} - -void VideoSampler::shutdown() { - cleanUp(); -} - -bool VideoSampler::init(const SamplerParameters& params) { - cleanUp(); - - if (params.out.video.cropImage != 0) { - if (!Util::validateVideoFormat(params.out.video)) { - LOG(ERROR) << "Invalid video format" - << ", width: " << params.out.video.width - << ", height: " << params.out.video.height - << ", format: " << params.out.video.format - << ", minDimension: " << params.out.video.minDimension - << ", crop: " << params.out.video.cropImage; - - return false; - } - - scaleFormat_.format = params.out.video.format; - Util::setFormatDimensions( - scaleFormat_.width, - scaleFormat_.height, - params.out.video.width, - params.out.video.height, - params.in.video.width, - params.in.video.height, - 0, - 0, - 1); - - if (!(scaleFormat_ == params_.out.video)) { // crop required - cropContext_ = sws_getContext( - params.out.video.width, - params.out.video.height, - (AVPixelFormat)params.out.video.format, - params.out.video.width, - params.out.video.height, - (AVPixelFormat)params.out.video.format, - swsFlags_, - nullptr, - nullptr, - nullptr); - - if (!cropContext_) { - LOG(ERROR) << "sws_getContext failed for crop context"; - return false; - } - - const auto scaleImageSize = av_image_get_buffer_size( - (AVPixelFormat)scaleFormat_.format, - scaleFormat_.width, - scaleFormat_.height, - 1); - scaleBuffer_.resize(scaleImageSize); - } - } else { - scaleFormat_ = params.out.video; - } - - VLOG(1) << "Input format #" << loggingUuid_ << ", width " - << params.in.video.width << ", height " << params.in.video.height - << ", format " << params.in.video.format << ", minDimension " - << params.in.video.minDimension << ", cropImage " - << params.in.video.cropImage; - VLOG(1) << "Scale format #" << loggingUuid_ << ", width " - << scaleFormat_.width << ", height " << scaleFormat_.height - << ", format " << scaleFormat_.format << ", minDimension " - << scaleFormat_.minDimension << ", cropImage " - << scaleFormat_.cropImage; - VLOG(1) << "Crop format #" << loggingUuid_ << ", width " - << params.out.video.width << ", height " << params.out.video.height - << ", format " << params.out.video.format << ", minDimension " - << params.out.video.minDimension << ", cropImage " - << params.out.video.cropImage; - - // set output format - params_ = params; - - if (params.in.video.format == AV_PIX_FMT_YUV420P) { - /* When the video width and height are not multiples of 8, - * and there is no size change in the conversion, - * a blurry screen will appear on the right side - * This problem was discovered in 2012 and - * continues to exist in version 4.1.3 in 2019 - * This problem can be avoided by increasing SWS_ACCURATE_RND - * details https://trac.ffmpeg.org/ticket/1582 - */ - if ((params.in.video.width & 0x7) || (params.in.video.height & 0x7)) { - VLOG(1) << "The width " << params.in.video.width << " and height " - << params.in.video.height << " the image is not a multiple of 8, " - << "the decoding speed may be reduced"; - swsFlags_ |= SWS_ACCURATE_RND; - } - } - - scaleContext_ = sws_getContext( - params.in.video.width, - 
params.in.video.height, - (AVPixelFormat)params.in.video.format, - scaleFormat_.width, - scaleFormat_.height, - (AVPixelFormat)scaleFormat_.format, - swsFlags_, - nullptr, - nullptr, - nullptr); - // sws_getContext might fail if in/out format == AV_PIX_FMT_PAL8 (png format) - // Return true if input and output formats/width/height are identical - // Check scaleContext_ for nullptr in transformImage to copy planes directly - - if (params.in.video.width == scaleFormat_.width && - params.in.video.height == scaleFormat_.height && - params.in.video.format == scaleFormat_.format) { - return true; - } - return scaleContext_ != nullptr; -} - -// Main body of the sample function called from one of the overloads below -// -// Args: -// srcSlice: decoded AVFrame->data prepared buffer -// srcStride: linesize (usually obtained from AVFrame->linesize) -// out: return buffer (ByteStorage*) -int VideoSampler::sample( - const uint8_t* const srcSlice[], - int srcStride[], - ByteStorage* out) { - int result; - // scaled and cropped image - int outImageSize = av_image_get_buffer_size( - (AVPixelFormat)params_.out.video.format, - params_.out.video.width, - params_.out.video.height, - 1); - - out->ensure(outImageSize); - - uint8_t* scalePlanes[4] = {nullptr}; - int scaleLines[4] = {0}; - // perform scale first - if ((result = transformImage( - scaleContext_, - srcSlice, - srcStride, - params_.in.video, - scaleFormat_, - // for crop use internal buffer - cropContext_ ? scaleBuffer_.data() : out->writableTail(), - scalePlanes, - scaleLines))) { - return result; - } - - // is crop required? - if (cropContext_) { - uint8_t* cropPlanes[4] = {nullptr}; - int cropLines[4] = {0}; - - if (params_.out.video.height < scaleFormat_.height) { - // Destination image is wider than the source image: cut top and bottom - for (size_t i = 0; i < 4 && scalePlanes[i] != nullptr; ++i) { - scalePlanes[i] += scaleLines[i] * - (scaleFormat_.height - params_.out.video.height) / 2; - } - } else { - // Source image is wider than the destination image: cut sides - for (size_t i = 0; i < 4 && scalePlanes[i] != nullptr; ++i) { - scalePlanes[i] += scaleLines[i] * - (scaleFormat_.width - params_.out.video.width) / 2 / - scaleFormat_.width; - } - } - - // crop image - if ((result = transformImage( - cropContext_, - scalePlanes, - scaleLines, - params_.out.video, - params_.out.video, - out->writableTail(), - cropPlanes, - cropLines))) { - return result; - } - } - - out->append(outImageSize); - return outImageSize; -} - -// Call from `video_stream.cpp::114` - occurs during file reads -int VideoSampler::sample(AVFrame* frame, ByteStorage* out) { - if (!frame) { - return 0; // no flush for videos - } - - return sample(frame->data, frame->linesize, out); -} - -// Call from `video_stream.cpp::114` - not sure when this occurs -int VideoSampler::sample(const ByteStorage* in, ByteStorage* out) { - if (!in) { - return 0; // no flush for videos - } - - int result; - uint8_t* inPlanes[4] = {nullptr}; - int inLineSize[4] = {0}; - - if ((result = preparePlanes( - params_.in.video, in->data(), inPlanes, inLineSize)) < 0) { - return result; - } - - return sample(inPlanes, inLineSize, out); -} - -void VideoSampler::cleanUp() { - if (scaleContext_) { - sws_freeContext(scaleContext_); - scaleContext_ = nullptr; - } - if (cropContext_) { - sws_freeContext(cropContext_); - cropContext_ = nullptr; - scaleBuffer_.clear(); - } -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/video_sampler.h b/torchvision/csrc/io/decoder/video_sampler.h deleted file mode 100644
index 47247f2c0c5..00000000000 --- a/torchvision/csrc/io/decoder/video_sampler.h +++ /dev/null @@ -1,44 +0,0 @@ -#pragma once - -#include "defs.h" - -namespace ffmpeg { - -/** - * Class transcodes video frames from one format into another - */ - -class VideoSampler : public MediaSampler { - public: - VideoSampler(int swsFlags = SWS_AREA, int64_t loggingUuid = 0); - - ~VideoSampler() override; - - // MediaSampler overrides - bool init(const SamplerParameters& params) override; - int sample(const ByteStorage* in, ByteStorage* out) override; - void shutdown() override; - - // returns the number of processed/scaled bytes - int sample(AVFrame* frame, ByteStorage* out); - int getImageBytes() const; - - private: - // close resources - void cleanUp(); - // helper functions for rescaling, cropping, etc. - int sample( - const uint8_t* const srcSlice[], - int srcStride[], - ByteStorage* out); - - private: - VideoFormat scaleFormat_; - SwsContext* scaleContext_{nullptr}; - SwsContext* cropContext_{nullptr}; - int swsFlags_{SWS_AREA}; - std::vector<uint8_t> scaleBuffer_; - int64_t loggingUuid_{0}; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/video_stream.cpp b/torchvision/csrc/io/decoder/video_stream.cpp deleted file mode 100644 index fa08c65cac1..00000000000 --- a/torchvision/csrc/io/decoder/video_stream.cpp +++ /dev/null @@ -1,131 +0,0 @@ -#include "video_stream.h" -#include -#include "util.h" - -namespace ffmpeg { - -namespace { -bool operator==(const VideoFormat& x, const AVFrame& y) { - return x.width == static_cast<size_t>(y.width) && - x.height == static_cast<size_t>(y.height) && x.format == y.format; -} - -bool operator==(const VideoFormat& x, const AVCodecContext& y) { - return x.width == static_cast<size_t>(y.width) && - x.height == static_cast<size_t>(y.height) && x.format == y.pix_fmt; -} - -VideoFormat& toVideoFormat(VideoFormat& x, const AVFrame& y) { - x.width = y.width; - x.height = y.height; - x.format = y.format; - return x; -} - -VideoFormat& toVideoFormat(VideoFormat& x, const AVCodecContext& y) { - x.width = y.width; - x.height = y.height; - x.format = y.pix_fmt; - return x; -} -} // namespace - -VideoStream::VideoStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const VideoFormat& format, - int64_t loggingUuid) - : Stream( - inputCtx, - MediaFormat::makeMediaFormat(format, index), - convertPtsToWallTime, - loggingUuid) {} - -VideoStream::~VideoStream() { - if (sampler_) { - sampler_->shutdown(); - sampler_.reset(); - } -} - -int VideoStream::initFormat() { - // set output format - if (!Util::validateVideoFormat(format_.format.video)) { - LOG(ERROR) << "Invalid video format" - << ", width: " << format_.format.video.width - << ", height: " << format_.format.video.height - << ", format: " << format_.format.video.format - << ", minDimension: " << format_.format.video.minDimension - << ", crop: " << format_.format.video.cropImage; - return -1; - } - - // keep aspect ratio - Util::setFormatDimensions( - format_.format.video.width, - format_.format.video.height, - format_.format.video.width, - format_.format.video.height, - codecCtx_->width, - codecCtx_->height, - format_.format.video.minDimension, - format_.format.video.maxDimension, - 0); - - if (format_.format.video.format == AV_PIX_FMT_NONE) { - format_.format.video.format = codecCtx_->pix_fmt; - } - return format_.format.video.width != 0 && format_.format.video.height != 0 && - format_.format.video.format != AV_PIX_FMT_NONE - ? 
0 - : -1; -} - -// copies frame bytes via sws_scale call in video_sampler.cpp -int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) { - if (!sampler_) { - sampler_ = std::make_unique(SWS_AREA, loggingUuid_); - } - - // check if input format gets changed - if (flush ? !(sampler_->getInputFormat().video == *codecCtx_) - : !(sampler_->getInputFormat().video == *frame_)) { - // - reinit sampler - SamplerParameters params; - params.type = format_.type; - params.out = format_.format; - params.in = FormatUnion(0); - flush ? toVideoFormat(params.in.video, *codecCtx_) - : toVideoFormat(params.in.video, *frame_); - if (!sampler_->init(params)) { - return -1; - } - - VLOG(1) << "Set input video sampler format" - << ", width: " << params.in.video.width - << ", height: " << params.in.video.height - << ", format: " << params.in.video.format - << " : output video sampler format" - << ", width: " << format_.format.video.width - << ", height: " << format_.format.video.height - << ", format: " << format_.format.video.format - << ", minDimension: " << format_.format.video.minDimension - << ", crop: " << format_.format.video.cropImage; - } - // calls to a sampler that converts the frame from YUV422 to RGB24, and - // optionally crops and resizes the frame. Frame bytes are copied from - // frame_->data to out buffer - return sampler_->sample(flush ? nullptr : frame_, out); -} - -void VideoStream::setHeader(DecoderHeader* header, bool flush) { - Stream::setHeader(header, flush); - if (!flush) { // no frames for video flush - header->keyFrame = frame_->key_frame; - header->fps = av_q2d(av_guess_frame_rate( - inputCtx_, inputCtx_->streams[format_.stream], nullptr)); - } -} - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/video_stream.h b/torchvision/csrc/io/decoder/video_stream.h deleted file mode 100644 index e6a8bf02b65..00000000000 --- a/torchvision/csrc/io/decoder/video_stream.h +++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -#include "stream.h" -#include "video_sampler.h" - -namespace ffmpeg { - -/** - * Class uses FFMPEG library to decode one video stream. - */ - -class VideoStream : public Stream { - public: - VideoStream( - AVFormatContext* inputCtx, - int index, - bool convertPtsToWallTime, - const VideoFormat& format, - int64_t loggingUuid); - ~VideoStream() override; - - private: - int initFormat() override; - int copyFrameBytes(ByteStorage* out, bool flush) override; - void setHeader(DecoderHeader* header, bool flush) override; - - private: - std::unique_ptr sampler_; -}; - -} // namespace ffmpeg diff --git a/torchvision/csrc/io/video/video.cpp b/torchvision/csrc/io/video/video.cpp deleted file mode 100644 index 8f1fb3fb5b9..00000000000 --- a/torchvision/csrc/io/video/video.cpp +++ /dev/null @@ -1,387 +0,0 @@ -#include "video.h" - -#include - -using namespace ffmpeg; - -namespace vision { -namespace video { - -namespace { - -const size_t decoderTimeoutMs = 600000; -const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24; - -// returns number of written bytes -template -size_t fillTensorList(DecoderOutputMessage& msgs, torch::Tensor& frame) { - const auto& msg = msgs; - T* frameData = frame.numel() > 0 ? 
frame.data_ptr() : nullptr; - if (frameData) { - auto sizeInBytes = msg.payload->length(); - memcpy(frameData, msg.payload->data(), sizeInBytes); - } - return sizeof(T); -} - -size_t fillVideoTensor(DecoderOutputMessage& msgs, torch::Tensor& videoFrame) { - return fillTensorList(msgs, videoFrame); -} - -size_t fillAudioTensor(DecoderOutputMessage& msgs, torch::Tensor& audioFrame) { - return fillTensorList(msgs, audioFrame); -} - -std::array, 4>::const_iterator -_parse_type(const std::string& stream_string) { - static const std::array, 4> types = {{ - {"video", TYPE_VIDEO}, - {"audio", TYPE_AUDIO}, - {"subtitle", TYPE_SUBTITLE}, - {"cc", TYPE_CC}, - }}; - auto device = std::find_if( - types.begin(), - types.end(), - [stream_string](const std::pair& p) { - return p.first == stream_string; - }); - if (device != types.end()) { - return device; - } - TORCH_CHECK( - false, "Expected one of [audio, video, subtitle, cc] ", stream_string); -} - -std::string parse_type_to_string(const std::string& stream_string) { - auto device = _parse_type(stream_string); - return device->first; -} - -MediaType parse_type_to_mt(const std::string& stream_string) { - auto device = _parse_type(stream_string); - return device->second; -} - -std::tuple _parseStream(const std::string& streamString) { - TORCH_CHECK(!streamString.empty(), "Stream string must not be empty"); - static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?"); - std::smatch match; - - TORCH_CHECK( - std::regex_match(streamString, match, regex), - "Invalid stream string: '", - streamString, - "'"); - - std::string type_ = "video"; - type_ = parse_type_to_string(match[1].str()); - long index_ = -1; - if (match[2].matched) { - try { - index_ = std::stoi(match[2].str()); - } catch (const std::exception&) { - TORCH_CHECK( - false, - "Could not parse device index '", - match[2].str(), - "' in device string '", - streamString, - "'"); - } - } - return std::make_tuple(type_, index_); -} - -} // namespace - -void Video::_getDecoderParams( - double videoStartS, - int64_t getPtsOnly, - std::string stream, - long stream_id = -1, - bool fastSeek = true, - bool all_streams = false, - int64_t num_threads = 1, - double seekFrameMarginUs = 10) { - int64_t videoStartUs = int64_t(videoStartS * 1e6); - - params.timeoutMs = decoderTimeoutMs; - params.startOffset = videoStartUs; - params.seekAccuracy = seekFrameMarginUs; - params.fastSeek = fastSeek; - params.headerOnly = false; - params.numThreads = num_threads; - - params.preventStaleness = false; // not sure what this is about - - if (all_streams == true) { - MediaFormat format; - format.stream = -2; - format.type = TYPE_AUDIO; - params.formats.insert(format); - - format.type = TYPE_VIDEO; - format.stream = -2; - format.format.video.width = 0; - format.format.video.height = 0; - format.format.video.cropImage = 0; - format.format.video.format = defaultVideoPixelFormat; - params.formats.insert(format); - - format.type = TYPE_SUBTITLE; - format.stream = -2; - params.formats.insert(format); - - format.type = TYPE_CC; - format.stream = -2; - params.formats.insert(format); - } else { - // parse stream type - MediaType stream_type = parse_type_to_mt(stream); - - // TODO: reset params.formats - std::set formats; - params.formats = formats; - // Define new format - MediaFormat format; - format.type = stream_type; - format.stream = stream_id; - if (stream_type == TYPE_VIDEO) { - format.format.video.width = 0; - format.format.video.height = 0; - format.format.video.cropImage = 0; - format.format.video.format = 
defaultVideoPixelFormat; - } - params.formats.insert(format); - } - -} // _get decoder params - -void Video::initFromFile( - std::string videoPath, - std::string stream, - int64_t numThreads) { - TORCH_CHECK(!initialized, "Video object can only be initialized once"); - initialized = true; - params.uri = videoPath; - _init(stream, numThreads); -} - -void Video::initFromMemory( - torch::Tensor videoTensor, - std::string stream, - int64_t numThreads) { - TORCH_CHECK(!initialized, "Video object can only be initialized once"); - initialized = true; - callback = MemoryBuffer::getCallback( - videoTensor.data_ptr(), videoTensor.size(0)); - _init(stream, numThreads); -} - -void Video::_init(std::string stream, int64_t numThreads) { - // set number of threads global - numThreads_ = numThreads; - // parse stream information - current_stream = _parseStream(stream); - // note that in the initial call we want to get all streams - _getDecoderParams( - 0, // video start - 0, // headerOnly - std::get<0>(current_stream), // stream info - remove that - long(-1), // stream_id parsed from info above change to -2 - false, // fastseek: we're using the default param here - true, // read all streams - numThreads_ // global number of Threads for decoding - ); - - std::string logMessage, logType; - - // locals - std::vector audioFPS, videoFPS; - std::vector audioDuration, videoDuration, ccDuration, subsDuration; - std::vector audioTB, videoTB, ccTB, subsTB; - c10::Dict> audioMetadata; - c10::Dict> videoMetadata; - c10::Dict> ccMetadata; - c10::Dict> subsMetadata; - - // callback and metadata defined in struct - DecoderInCallback tmp_callback = callback; - succeeded = decoder.init(params, std::move(tmp_callback), &metadata); - if (succeeded) { - for (const auto& header : metadata) { - double fps = double(header.fps); - double duration = double(header.duration) * 1e-6; // * timeBase; - - if (header.format.type == TYPE_VIDEO) { - videoFPS.push_back(fps); - videoDuration.push_back(duration); - } else if (header.format.type == TYPE_AUDIO) { - audioFPS.push_back(fps); - audioDuration.push_back(duration); - } else if (header.format.type == TYPE_CC) { - ccDuration.push_back(duration); - } else if (header.format.type == TYPE_SUBTITLE) { - subsDuration.push_back(duration); - }; - } - } - // audio - audioMetadata.insert("duration", audioDuration); - audioMetadata.insert("framerate", audioFPS); - // video - videoMetadata.insert("duration", videoDuration); - videoMetadata.insert("fps", videoFPS); - // subs - subsMetadata.insert("duration", subsDuration); - // cc - ccMetadata.insert("duration", ccDuration); - // put all to a data - streamsMetadata.insert("video", videoMetadata); - streamsMetadata.insert("audio", audioMetadata); - streamsMetadata.insert("subtitles", subsMetadata); - streamsMetadata.insert("cc", ccMetadata); - - succeeded = setCurrentStream(stream); - if (std::get<1>(current_stream) != -1) { - LOG(INFO) - << "Stream index set to " << std::get<1>(current_stream) - << ". If you encounter trouble, consider switching it to automatic stream discovery. 
\n"; -} -} - -Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { - C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video"); - if (!videoPath.empty()) { - initFromFile(videoPath, stream, numThreads); - } -} // video - -bool Video::setCurrentStream(std::string stream = "video") { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - if ((!stream.empty()) && (_parseStream(stream) != current_stream)) { - current_stream = _parseStream(stream); - } - - double ts = 0; - if (seekTS > 0) { - ts = seekTS; - } - - _getDecoderParams( - ts, // video start - 0, // headerOnly - std::get<0>(current_stream), // stream - long(std::get<1>( - current_stream)), // stream_id parsed from info above change to -2 - false, // fastseek param set to 0 false by default (changed in seek) - false, // read all streams - numThreads_ // global number of threads - ); - - // callback and metadata defined in Video.h - DecoderInCallback tmp_callback = callback; - return (decoder.init(params, std::move(tmp_callback), &metadata)); -} - -std::tuple<std::string, long> Video::getCurrentStream() const { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - return current_stream; -} - -c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>> Video:: - getStreamMetadata() const { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - return streamsMetadata; -} - -void Video::Seek(double ts, bool fastSeek = false) { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - // initialize the class variables used for seeking and return - _getDecoderParams( - ts, // video start - 0, // headerOnly - std::get<0>(current_stream), // stream - long(std::get<1>( - current_stream)), // stream_id parsed from info above change to -2 - fastSeek, // fastseek - false, // read all streams - numThreads_ // global number of threads - ); - - // callback and metadata defined in Video.h - DecoderInCallback tmp_callback = callback; - succeeded = decoder.init(params, std::move(tmp_callback), &metadata); -} - -std::tuple<torch::Tensor, double> Video::Next() { - TORCH_CHECK(initialized, "Video object has to be initialized first"); - // if failing to decode simply return a null tensor (note, should we - // raise an exception?)
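- // Note: both the ENODATA (end of stream) and hard-error paths below fall - // through to the same empty tensor, so callers can only distinguish them - // via the logs (and frame_pts_s is returned uninitialized in that case).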
- double frame_pts_s; - torch::Tensor outFrame = torch::zeros({0}, torch::kByte); - - // decode single frame - DecoderOutputMessage out; - int64_t res = decoder.decode(&out, decoderTimeoutMs); - // if successful - if (res == 0) { - frame_pts_s = double(double(out.header.pts) * 1e-6); - - auto header = out.header; - const auto& format = header.format; - - // initialize the output variables based on type - - if (format.type == TYPE_VIDEO) { - // note: this can potentially be optimized - // by having the global tensor that we fill at decode time - // (would avoid allocations) - int outHeight = format.format.video.height; - int outWidth = format.format.video.width; - int numChannels = 3; - outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte); - fillVideoTensor(out, outFrame); - outFrame = outFrame.permute({2, 0, 1}); - - } else if (format.type == TYPE_AUDIO) { - int outAudioChannels = format.format.audio.channels; - int bytesPerSample = av_get_bytes_per_sample( - static_cast(format.format.audio.format)); - int frameSizeTotal = out.payload->length(); - - TORCH_CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0); - int numAudioSamples = - frameSizeTotal / (outAudioChannels * bytesPerSample); - - outFrame = - torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat); - - fillAudioTensor(out, outFrame); - } - // currently not supporting other formats (will do soon) - - out.payload.reset(); - } else if (res == ENODATA) { - LOG(INFO) << "Decoder ran out of frames (ENODATA)\n"; - } else { - LOG(ERROR) << "Decoder failed with ERROR_CODE " << res; - } - - return std::make_tuple(outFrame, frame_pts_s); -} - -static auto registerVideo = - torch::class_
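For context on what this removal takes away, here is a minimal sketch of how the Video API deleted above was typically driven; the file name is hypothetical, and the frame layout follows the permute in Video::Next:

#include <torch/torch.h>

#include "video.h"  // the torchvision header removed by this diff

int main() {
  // Open a container and select the default video stream, decoding on a
  // single thread (see Video::initFromFile above).
  vision::video::Video video("input.mp4", "video", /*numThreads=*/1);

  // Accurate (non-fast) seek back to the start of the stream.
  video.Seek(0.0, /*fastSeek=*/false);

  while (true) {
    // Next() returns the decoded frame and its presentation timestamp in
    // seconds; on EOF or decode error it returns an empty (0-element) tensor.
    auto [frame, ptsSeconds] = video.Next();
    if (frame.numel() == 0) {
      break;
    }
    // For video streams, frame is a CHW uint8 RGB tensor (see the permute in
    // Video::Next); audio streams yield a [samples, channels] float tensor.
  }
  return 0;
}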