Remove caffe2 image and video (#125045)

This PR splits a smaller piece out of https://github.com/pytorch/pytorch/pull/122527: the Caffe2 image and video folders are removed, along with the related CMake code.
Note that this work was inspired by, and co-developed with, @r-barnes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125045
Approved by: https://github.com/eqy, https://github.com/albanD
Author: cyy
Date: 2024-04-30 17:31:57 +00:00
Committed by: PyTorch MergeBot
Parent: a03b9a2189
Commit: 04c6424fbf
21 changed files with 0 additions and 4762 deletions


@@ -228,7 +228,6 @@ option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
option(USE_KINETO "Use Kineto profiling library" ON)
option(USE_CUPTI_SO "Use CUPTI as a shared library" ON)
option(USE_FAKELOWP "Use FakeLowp operators" OFF)
option(USE_FFMPEG "Use ffmpeg" OFF)
option(USE_GFLAGS "Use GFLAGS" OFF)
option(USE_GLOG "Use GLOG" OFF)
option(USE_LEVELDB "Use LEVELDB" OFF)
@@ -264,7 +263,6 @@ cmake_dependent_option(
option(USE_NUMPY "Use NumPy" ON)
option(USE_OBSERVERS "Use observers module." OFF)
option(USE_OPENCL "Use OpenCL" OFF)
option(USE_OPENCV "Use OpenCV" OFF)
option(USE_OPENMP "Use OpenMP for parallel code" ON)
option(USE_PRECOMPILED_HEADERS "Use pre-compiled headers to accelerate build." OFF)


@@ -125,8 +125,6 @@ if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
add_subdirectory(db)
add_subdirectory(distributed)
add_subdirectory(ideep)
add_subdirectory(image)
add_subdirectory(video)
add_subdirectory(mobile)
add_subdirectory(mpi)
add_subdirectory(observers)


@@ -1,57 +0,0 @@
if(USE_OPENCV AND OpenCV_FOUND)
message(STATUS "Including image processing operators")
# ---[ GPU files
# ------[ general GPU
file(GLOB tmp *_gpu.cc)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
# ------[ CUDA sources
file(GLOB tmp *.cu)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
# exclude test files
file(GLOB tmp *_test.cc)
exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp})
# ---[ HIP files
# ------[ general HIP
file(GLOB tmp hip/*.cc)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
# ------[ HIP sources
file(GLOB tmp hip/*.hip)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
# exclude test files
file(GLOB tmp hip/*_test.cc)
exclude(Caffe2_HIP_SRCS "${Caffe2_HIP_SRCS}" ${tmp})
# ---[ CPU files.
file(GLOB tmp *.cc)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp})
# exclude test files and gpu files
file(GLOB tmp *_test.cc)
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS})
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_HIP_SRCS})
# ---[ GPU test files
file(GLOB tmp *_gpu_test.cc)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} ${tmp})
# ---[ HIP test files
file(GLOB tmp hip/*_test.cc)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} ${tmp})
# ---[ CPU test files
file(GLOB tmp *_test.cc)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp})
exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_GPU_TEST_SRCS})
exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_HIP_TEST_SRCS})
# ---[ Send the lists to the parent scope.
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)
else()
message(STATUS "Excluding image processing operators due to no opencv")
endif()


@@ -1,167 +0,0 @@
#include "caffe2/image/image_input_op.h"
#ifdef USE_MKLDNN
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
#include <caffe2/ideep/utils/ideep_operator.h>
#endif
namespace caffe2 {
template <>
bool ImageInputOp<CPUContext>::ApplyTransformOnGPU(
const std::vector<std::int64_t>&,
const c10::Device&) {
return false;
}
REGISTER_CPU_OPERATOR(ImageInput, ImageInputOp<CPUContext>);
OPERATOR_SCHEMA(ImageInput)
.NumInputs(0, 1)
.NumOutputs(2, INT_MAX)
.TensorInferenceFunction([](const OperatorDef& def,
const vector<TensorShape>& /* unused */) {
vector<TensorShape> out(2);
ArgumentHelper helper(def);
int batch_size = helper.GetSingleArgument<int>("batch_size", 0);
int crop = helper.GetSingleArgument<int>("crop", -1);
int color = helper.GetSingleArgument<int>("color", 1);
TORCH_CHECK_GT(crop, 0);
out[0] = CreateTensorShape(
vector<int>{batch_size, crop, crop, color ? 3 : 1},
TensorProto::FLOAT);
out[1] =
CreateTensorShape(vector<int>{1, batch_size}, TensorProto::INT32);
return out;
})
.SetDoc(R"DOC(
Imports and processes images from a database. For each run of the operator,
batch_size images will be processed. GPUs can optionally be used for
part of the processing.
The following transformations are applied to the image
- A bounding box is applied to the initial image (optional)
- The image is rescaled either up or down (with the scale argument) or
just up (with the minsize argument)
- The image is randomly cropped (crop size is passed as an argument but
the location of the crop is random except if is_test is passed in which case
the image in cropped at the center)
- The image is normalized. Each of its color channels can have separate
normalization values
The dimension of the output image will always be cropxcrop
)DOC")
.Arg(
"batch_size",
"Number of images to output for each run of the operator"
". Must be 1 or greater")
.Arg("color", "Number of color channels (1 or 3). Defaults to 1")
.Arg("color_jitter", "Whether or not to do color jitter. Defaults to 0")
.Arg(
"img_saturation",
"Image saturation scale used in color jittering. "
"Defaults to 0.4")
.Arg(
"img_brightness",
"Image brightness scale used in color jittering. "
"Defaults to 0.4")
.Arg(
"img_contrast",
"Image contrast scale used in color jittering. "
"Defaults to 0.4")
.Arg(
"color_lighting",
"Whether or not to do color lighting."
" Defaults to 0")
.Arg(
"color_lighting_std",
"Std of normal distribution where color lighting"
" scaling factor is sampled. Defaults to 0.1")
.Arg(
"scale_jitter_type",
"Type 0: No scale jittering "
"Type 1: Inception-style scale jittering")
.Arg(
"label_type",
"Type 0: single integer label for multi-class "
"classification. Type 1: sparse active label indices for multi-label "
"classification. Type 2: dense label embedding vector for label "
"embedding regression")
.Arg(
"scale",
"Scale the size of the smallest dimension of the image to"
" this. Scale and minsize are mutually exclusive."
" Must be larger than crop")
.Arg(
"minsize",
"Scale the size of the smallest dimension of the image to"
" this only if the size is initially smaller. Scale and minsize are"
" mutually exclusive. Must be larger than crop.")
.Arg(
"warp",
"If 1, both dimensions of the image will be set to minsize or"
" scale; otherwise, the other dimension is proportionally scaled."
" Defaults to 0")
.Arg("crop", "Size to crop the image to. Must be provided")
.Arg("mirror", "Whether or not to mirror the image. Defaults to 0")
.Arg(
"mean",
"Mean by which to normalize color channels."
" Defaults to 0.")
.Arg(
"mean_per_channel",
"Vector of means per color channel "
" (1 or 3 elements). Defaults to mean argument. Channel order BGR")
.Arg(
"std",
"Standard deviation by which to normalize color channels."
" Defaults to 1.")
.Arg(
"std_per_channel",
"Vector of standard dev. per color channel "
" (1 or 3 elements). Defaults to std argument. Channel order is BGR")
.Arg("bounding_ymin", "Bounding box coordinate. Defaults to -1 (none)")
.Arg("bounding_xmin", "Bounding box coordinate. Defaults to -1 (none)")
.Arg("bounding_height", "Bounding box coordinate. Defaults to -1 (none)")
.Arg("bounding_width", "Bounding box coordinate. Defaults to -1 (none)")
.ArgIsTest("Set to 1 to do deterministic cropping. Defaults to 0")
.Arg("use_caffe_datum", "1 if the input is in Caffe format. Defaults to 0")
.Arg(
"use_gpu_transform",
"1 if GPU acceleration should be used."
" Defaults to 0. Can only be 1 in a CUDAContext")
.Arg(
"decode_threads",
"Number of CPU decode/transform threads."
" Defaults to 4")
.Arg("output_type", "If gpu_transform, can set to FLOAT or FLOAT16.")
.Arg("db", "Name of the database (if not passed as input)")
.Arg(
"db_type",
"Type of database (if not passed as input)."
" Defaults to leveldb")
.Arg(
"output_sizes",
"The sizes of any outputs besides the data and label "
"(should have a number of elements equal to the number of additional "
"outputs)")
.Arg(
"random_scale",
"[min, max] shortest-side desired for image resize. "
"Defaults to [-1, -1] or no random resize desired.")
.Input(0, "reader", "The input reader (a db::DBReader)")
.Output(0, "data", "Tensor containing the images")
.Output(1, "label", "Tensor containing the labels")
.Output(
2,
"additional outputs",
"Any outputs after the first 2 will be "
"Tensors read from the input TensorProtos");
NO_GRADIENT(ImageInput);
#ifdef USE_MKLDNN
REGISTER_IDEEP_OPERATOR(ImageInput, IDEEPFallbackOp<ImageInputOp<CPUContext>>);
#endif
} // namespace caffe2

File diff suppressed because it is too large.


@@ -1,38 +0,0 @@
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/image/image_input_op.h"
namespace caffe2 {
template <>
bool ImageInputOp<CUDAContext>::ApplyTransformOnGPU(
const std::vector<std::int64_t>& dims,
const c10::Device& type) {
// GPU transform kernel allows explicitly setting output type
if (output_type_ == TensorProto_DataType_FLOAT) {
auto* image_output =
OperatorBase::OutputTensor(0, dims, at::dtype<float>().device(type));
TransformOnGPU<uint8_t, float, CUDAContext>(
prefetched_image_on_device_,
image_output,
mean_gpu_,
std_gpu_,
&context_);
} else if (output_type_ == TensorProto_DataType_FLOAT16) {
auto* image_output =
OperatorBase::OutputTensor(0, dims, at::dtype<at::Half>().device(type));
TransformOnGPU<uint8_t, at::Half, CUDAContext>(
prefetched_image_on_device_,
image_output,
mean_gpu_,
std_gpu_,
&context_);
} else {
return false;
}
return true;
}
REGISTER_CUDA_OPERATOR(ImageInput, ImageInputOp<CUDAContext>);
} // namespace caffe2


@@ -1,85 +0,0 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/image/transform_gpu.h"
#include "caffe2/utils/conversions.h"
/**
*
* Copyright (c) 2016, NVIDIA CORPORATION, All rights reserved
* Distributed under 2-clause BSD license; see accompanying LICENSE file
*
**/
namespace caffe2 {
namespace {
// input in (int8, NHWC), output in (fp32, NCHW)
template <typename In, typename Out>
__global__ void transform_kernel(
const int C,
const int H,
const int W,
const float* mean,
const float* std,
const In* in,
Out* out) {
const auto n = blockIdx.x;
const auto nStride = C*H*W;
// pointers to data for this image
const In *const input_ptr = &in[n*nStride];
Out *const output_ptr = &out[n*nStride];
// either read or write uncoalesced - try reading
for (int c=0; c < C; ++c) {
for (int h=threadIdx.y; h < H; h += blockDim.y) {
for (int w=threadIdx.x; w < W; w += blockDim.x) {
const int in_idx = c + C*w + C*W*h; // HWC
const int out_idx = c*H*W + h*W + w; // CHW
output_ptr[out_idx] = convert::To<float,Out>(
(convert::To<In,float>(input_ptr[in_idx])-mean[c]) * std[c]);
}
}
}
}
}
template <typename T_IN, typename T_OUT, class Context>
bool TransformOnGPU(
Tensor& X,
Tensor* Y,
Tensor& mean,
Tensor& std,
Context* context) {
const int N = X.dim32(0), C = X.dim32(3), H = X.dim32(1), W = X.dim32(2);
auto* input_data = X.template data<T_IN>();
auto* output_data = Y->template mutable_data<T_OUT>();
transform_kernel<
T_IN, T_OUT><<<N, dim3(16, 16), 0, context->cuda_stream()>>>(
C, H, W, mean.template data<float>(), std.template data<float>(),
input_data, output_data);
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
};
template bool TransformOnGPU<uint8_t, float, CUDAContext>(
Tensor& X,
Tensor* Y,
Tensor& mean,
Tensor& std,
CUDAContext* context);
template bool TransformOnGPU<uint8_t, at::Half, CUDAContext>(
Tensor& X,
Tensor* Y,
Tensor& mean,
Tensor& std,
CUDAContext* context);
} // namespace caffe2


@@ -1,43 +0,0 @@
#ifndef CAFFE2_IMAGE_TRANSFORM_GPU_H_
#define CAFFE2_IMAGE_TRANSFORM_GPU_H_
/**
*
* Copyright (c) 2016, NVIDIA CORPORATION, All rights reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**/
#include "caffe2/core/context.h"
namespace caffe2 {
template <typename T_IN, typename T_OUT, class Context>
bool TransformOnGPU(
Tensor& X,
Tensor* Y,
Tensor& mean,
Tensor& std,
Context* context);
} // namespace caffe2
#endif


@@ -1,59 +0,0 @@
if(USE_OPENCV AND OpenCV_FOUND AND USE_FFMPEG AND FFMPEG_FOUND)
message(STATUS "Including video processing operators")
# ---[ GPU files
# ------[ general GPU
file(GLOB tmp *_gpu.cc)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
# ------[ CUDA sources
file(GLOB tmp *.cu)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
# exclude test files
file(GLOB tmp *_test.cc)
exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp})
# ---[ HIP files
# ------[ general HIP
file(GLOB tmp hip/*.cc)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
# ------[ HIP sources
file(GLOB tmp hip/*.hip)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
# exclude test files
file(GLOB tmp hip/*_test.cc)
exclude(Caffe2_HIP_SRCS "${Caffe2_HIP_SRCS}" ${tmp})
# ---[ CPU files.
file(GLOB tmp *.cc)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp})
# exclude test files and gpu files
file(GLOB tmp *_test.cc)
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS})
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_HIP_SRCS})
# ---[ GPU test files
file(GLOB tmp *_gpu_test.cc)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} ${tmp})
# ---[ HIP test files
file(GLOB tmp hip/*_test.cc)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} ${tmp})
# ---[ CPU test files
file(GLOB tmp *_test.cc)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp})
exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}"
${Caffe2_GPU_TEST_SRCS})
exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}"
${Caffe2_GPU_TEST_SRCS})
# ---[ Send the lists to the parent scope.
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)
else()
message(STATUS "Excluding video processing operators due to no opencv")
endif()


@@ -1,85 +0,0 @@
#include <caffe2/video/optical_flow.h>
namespace caffe2 {
void OpticalFlowExtractor(
const cv::Mat& prev_gray,
const cv::Mat& curr_gray,
const int flow_alg_type,
cv::Mat& flow) {
#if CV_MAJOR_VERSION >= 4
cv::Ptr<cv::DISOpticalFlow> tvl1 = cv::DISOpticalFlow::create();
#else
cv::Ptr<cv::DualTVL1OpticalFlow> tvl1 = cv::DualTVL1OpticalFlow::create();
#endif
switch (flow_alg_type) {
case FLowAlgType::FarnebackOpticalFlow:
cv::calcOpticalFlowFarneback(
prev_gray,
curr_gray,
flow,
std::sqrt(2) / 2.0,
5,
10,
2,
7,
1.5,
cv::OPTFLOW_FARNEBACK_GAUSSIAN);
break;
case FLowAlgType::DensePyrLKOpticalFlow:
LOG(ERROR) << "DensePyrLKOpticalFlow only has sparse version on CPU";
break;
case FLowAlgType::BroxOpticalFlow:
LOG(ERROR) << "BroxOpticalFlow on CPU is not available";
break;
case FLowAlgType::OpticalFlowDual_TVL1:
tvl1->calc(prev_gray, curr_gray, flow);
break;
default:
LOG(ERROR) << "Unsupported optical flow type " << flow_alg_type;
break;
}
}
void MergeOpticalFlow(cv::Mat& prev_flow, const cv::Mat& curr_flow) {
const int rows = prev_flow.rows;
const int cols = prev_flow.cols;
// merge two optical flows into one
for (int y = 0; y < rows; y++) {
for (int x = 0; x < cols; x++) {
cv::Point2f u = prev_flow.at<cv::Point2f>(y, x);
// get the new location
int x_new = std::min(cols - 1, std::max(0, cvRound(u.x + x)));
int y_new = std::min(rows - 1, std::max(0, cvRound(u.y + y)));
cv::Point2f u_new = curr_flow.at<cv::Point2f>(y_new, x_new);
// update the flow
prev_flow.at<cv::Point2f>(y, x) += u_new;
}
}
}
void MultiFrameOpticalFlowExtractor(
const std::vector<cv::Mat>& grays,
const int optical_flow_alg_type,
cv::Mat& flow) {
int num_frames = grays.size();
CAFFE_ENFORCE_GE(num_frames, 2, "need at least 2 frames!");
// compute optical flow for every two frames
std::vector<cv::Mat> flows;
for (int i = 0; i < num_frames - 1; i++) {
cv::Mat tmp;
OpticalFlowExtractor(grays[i], grays[i + 1], optical_flow_alg_type, tmp);
flows.push_back(tmp);
}
flows[0].copyTo(flow);
// aggregate optical flow across multiple frame
for (int i = 1; i < num_frames - 1; i++) {
MergeOpticalFlow(flow, flows[i]);
}
}
} // namespace caffe2


@@ -1,50 +0,0 @@
#ifndef CAFFE2_VIDEO_OPTICAL_FLOW_H_
#define CAFFE2_VIDEO_OPTICAL_FLOW_H_
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/opencv.hpp>
#include <opencv2/video.hpp>
#include <caffe2/core/logging.h>
namespace caffe2 {
// Four different types of optical flow algorithms supported;
// BroxOpticalFlow doesn't have a CPU version;
// DensePyrLKOpticalFlow only has sparse CPU version;
enum FLowAlgType {
FarnebackOpticalFlow = 0,
DensePyrLKOpticalFlow = 1,
BroxOpticalFlow = 2,
OpticalFlowDual_TVL1 = 3,
};
// Define different types of optical flow data type
// 0: original two channel optical flow
// 1: three channel optical flow with magnitude as the third channel
// 2: two channel optical flow + one channel gray
// 3: two channel optical flow + three channel rgb
enum FlowDataType {
Flow2C = 0,
Flow3C = 1,
FlowWithGray = 2,
FlowWithRGB = 3,
};
void OpticalFlowExtractor(
const cv::Mat& prev_gray,
const cv::Mat& curr_gray,
const int optical_flow_alg_type,
cv::Mat& flow);
void MergeOpticalFlow(cv::Mat& prev_flow, const cv::Mat& curr_flow);
void MultiFrameOpticalFlowExtractor(
const std::vector<cv::Mat>& grays,
const int optical_flow_alg_type,
cv::Mat& flow);
} // namespace caffe2
#endif // CAFFE2_VIDEO_OPTICAL_FLOW_H_


@@ -1,800 +0,0 @@
#include <assert.h>
#include <caffe2/core/logging.h>
#include <caffe2/video/video_decoder.h>
#include <array>
#include <mutex>
#include <random>
namespace caffe2 {
VideoDecoder::VideoDecoder() {
static bool gInitialized = false;
static std::mutex gMutex;
std::unique_lock<std::mutex> lock(gMutex);
if (!gInitialized) {
av_register_all();
avcodec_register_all();
avformat_network_init();
gInitialized = true;
}
}
void VideoDecoder::getAudioSample(
AVPacket& packet,
AVCodecContext* audioCodecContext_,
AVFrame* audioStreamFrame_,
SwrContext* convertCtx_,
Callback& callback,
const Params& params) {
int frame_finished = 0;
auto result = avcodec_decode_audio4(
audioCodecContext_, audioStreamFrame_, &frame_finished, &packet);
if (frame_finished) {
// from
// https://www.ffmpeg.org/doxygen/2.3/decoding_encoding_8c-example.html#a57
auto c = audioCodecContext_;
int data_size = av_samples_get_buffer_size(
nullptr, c->channels, audioStreamFrame_->nb_samples, c->sample_fmt, 1);
if (data_size < 0) {
// This should not occur, checking just for paranoia
LOG(ERROR) << "Failed to calculate data size";
}
// from https://www.ffmpeg.org/doxygen/2.1/group__lswr.html#details
uint8_t* output;
auto swr = convertCtx_;
auto inrate = audioCodecContext_->sample_rate;
auto in_samples = audioStreamFrame_->nb_samples;
int out_samples = av_rescale_rnd(
swr_get_delay(swr, inrate) + in_samples,
params.outrate_,
inrate,
AV_ROUND_UP);
if (out_samples > 0) {
auto input = (const uint8_t**)&audioStreamFrame_->data[0];
av_samples_alloc(
&output,
nullptr,
c->channels,
out_samples,
(AVSampleFormat)params.outfmt_,
0);
// resample the audio data
out_samples = swr_convert(swr, &output, out_samples, input, in_samples);
auto sample_size = out_samples * c->channels * sizeof(float);
auto buffer = std::make_unique<float[]>(sample_size);
memcpy(buffer.get(), output, sample_size);
av_freep(&output);
unique_ptr<DecodedAudio> audio_sample = make_unique<DecodedAudio>();
audio_sample->dataSize_ = data_size;
audio_sample->outSampleSize_ = out_samples * c->channels;
audio_sample->audio_data_ = std::move(buffer);
callback.audioDecoded(std::move(audio_sample));
}
} else {
result = packet.size;
}
packet.size -= result;
packet.data += result;
}
void VideoDecoder::ResizeAndKeepAspectRatio(
const int origWidth,
const int origHeight,
const int short_edge,
const int long_edge,
int& outWidth,
int& outHeight) {
if (origWidth < origHeight) {
// dominant height
if (short_edge > 0) {
// use short_edge for rescale
float ratio = short_edge / float(origWidth);
outWidth = short_edge;
outHeight = (int)round(ratio * origHeight);
} else {
// use long_edge for rescale
float ratio = long_edge / float(origHeight);
outHeight = long_edge;
outWidth = (int)round(ratio * origWidth);
}
} else {
// dominant width
if (short_edge > 0) {
// use short_edge for rescale
float ratio = short_edge / float(origHeight);
outHeight = short_edge;
outWidth = (int)round(ratio * origWidth);
} else {
// use long_edge for rescale
float ratio = long_edge / float(origWidth);
outWidth = long_edge;
outHeight = (int)round(ratio * origHeight);
}
}
}
void VideoDecoder::decodeLoop(
const string& videoName,
VideoIOContext& ioctx,
const Params& params,
const int start_frm,
Callback& callback) {
AVPixelFormat pixFormat = params.pixelFormat_;
AVFormatContext* inputContext = avformat_alloc_context();
AVStream* videoStream_ = nullptr;
AVCodecContext* videoCodecContext_ = nullptr;
AVCodecContext* audioCodecContext_ = nullptr;
AVFrame* videoStreamFrame_ = nullptr;
AVFrame* audioStreamFrame_ = nullptr;
SwrContext* convertCtx_ = nullptr;
AVPacket packet;
av_init_packet(&packet); // init packet
SwsContext* scaleContext_ = nullptr;
try {
inputContext->pb = ioctx.get_avio();
inputContext->flags |= AVFMT_FLAG_CUSTOM_IO;
int ret = 0;
// Determining the input format:
int probeSz = 1 * 1024 + AVPROBE_PADDING_SIZE;
DecodedFrame::AvDataPtr probe((uint8_t*)av_malloc(probeSz));
memset(probe.get(), 0, probeSz);
int len = ioctx.read(probe.get(), probeSz - AVPROBE_PADDING_SIZE);
if (len < probeSz - AVPROBE_PADDING_SIZE) {
LOG(ERROR) << "Insufficient data to determine video format";
return;
}
// seek back to start of stream
ioctx.seek(0, SEEK_SET);
unique_ptr<AVProbeData> probeData(new AVProbeData());
probeData->buf = probe.get();
probeData->buf_size = len;
probeData->filename = "";
// Determine the input-format:
inputContext->iformat = av_probe_input_format(probeData.get(), 1);
// this is to avoid the double-free error
if (inputContext->iformat == nullptr) {
LOG(ERROR) << "inputContext iformat is nullptr!";
return;
}
ret = avformat_open_input(&inputContext, "", nullptr, nullptr);
if (ret < 0) {
LOG(ERROR) << "Unable to open stream : " << ffmpegErrorStr(ret);
return;
}
ret = avformat_find_stream_info(inputContext, nullptr);
if (ret < 0) {
LOG(ERROR) << "Unable to find stream info in " << videoName << " "
<< ffmpegErrorStr(ret);
return;
}
// Decode the first video stream
int videoStreamIndex_ = params.streamIndex_;
int audioStreamIndex_ = params.streamIndex_;
if (params.streamIndex_ == -1) {
for (int i = 0; i < inputContext->nb_streams; i++) {
auto stream = inputContext->streams[i];
if (stream->codec->codec_type == AVMEDIA_TYPE_VIDEO &&
videoStreamIndex_ == -1) {
videoStreamIndex_ = i;
videoStream_ = stream;
} else if (
stream->codec->codec_type == AVMEDIA_TYPE_AUDIO &&
audioStreamIndex_ == -1) {
audioStreamIndex_ = i;
}
if (videoStreamIndex_ != -1 && audioStreamIndex_ != -1) {
break;
}
}
}
if (videoStream_ == nullptr) {
LOG(ERROR) << "Unable to find video stream in " << videoName << " "
<< ffmpegErrorStr(ret);
return;
}
// Initialize codec
AVDictionary* opts = nullptr;
videoCodecContext_ = videoStream_->codec;
try {
ret = avcodec_open2(
videoCodecContext_,
avcodec_find_decoder(videoCodecContext_->codec_id),
&opts);
} catch (const std::exception&) {
LOG(ERROR) << "Exception during open video codec";
return;
}
if (ret < 0) {
LOG(ERROR) << "Cannot open video codec : "
<< videoCodecContext_->codec->name;
return;
}
if (params.getAudio_ && audioStreamIndex_ >= 0) {
// see e.g. ridge/decoder/StreamDecoder.cpp
audioCodecContext_ = inputContext->streams[audioStreamIndex_]->codec;
ret = avcodec_open2(
audioCodecContext_,
avcodec_find_decoder(audioCodecContext_->codec_id),
nullptr);
if (ret < 0) {
LOG(ERROR) << "Cannot open audio codec : "
<< audioCodecContext_->codec->name;
return;
}
convertCtx_ = swr_alloc_set_opts(
nullptr,
params.outlayout_,
(AVSampleFormat)params.outfmt_,
params.outrate_,
audioCodecContext_->channel_layout,
audioCodecContext_->sample_fmt,
audioCodecContext_->sample_rate,
0,
nullptr);
if (convertCtx_ == nullptr) {
LOG(ERROR) << "Cannot setup sample format converter.";
return;
}
if (swr_init(convertCtx_) < 0) {
LOG(ERROR) << "Cannot init sample format converter.";
return;
}
}
// Calculate if we need to rescale the frames
const int origWidth = videoCodecContext_->width;
const int origHeight = videoCodecContext_->height;
int outWidth = origWidth;
int outHeight = origHeight;
if (params.video_res_type_ == VideoResType::ORIGINAL_RES) {
// if the original resolution is too low,
// make it at least the same size as crop_size_
if (params.crop_size_ > origWidth || params.crop_size_ > origHeight) {
ResizeAndKeepAspectRatio(
origWidth, origHeight, params.crop_size_, -1, outWidth, outHeight);
}
} else if (params.video_res_type_ == VideoResType::USE_SHORT_EDGE) {
// resize the image to the predefined
// short_edge_ resolution while keep the aspect ratio
ResizeAndKeepAspectRatio(
origWidth, origHeight, params.short_edge_, -1, outWidth, outHeight);
} else if (params.video_res_type_ == VideoResType::USE_WIDTH_HEIGHT) {
// resize the image to the predefined
// resolution and ignore the aspect ratio
outWidth = params.outputWidth_;
outHeight = params.outputHeight_;
} else {
LOG(ERROR) << "Unknown VideoResType: " << params.video_res_type_;
return;
}
// Make sure that we have a valid format
if (videoCodecContext_->pix_fmt == AV_PIX_FMT_NONE) {
LOG(ERROR) << "pixel format is not valid.";
return;
}
// Create a scale context
scaleContext_ = sws_getContext(
videoCodecContext_->width,
videoCodecContext_->height,
videoCodecContext_->pix_fmt,
outWidth,
outHeight,
pixFormat,
SWS_FAST_BILINEAR,
nullptr,
nullptr,
nullptr);
// Getting video meta data
VideoMeta videoMeta;
videoMeta.codec_type = videoCodecContext_->codec_type;
videoMeta.width = outWidth;
videoMeta.height = outHeight;
videoMeta.pixFormat = pixFormat;
// avoid division by zero, code adapted from
// https://www.ffmpeg.org/doxygen/0.6/rational_8h-source.html
if (videoStream_->avg_frame_rate.num == 0 ||
videoStream_->avg_frame_rate.den == 0) {
LOG(ERROR) << "Frame rate is wrong. No data found.";
return;
}
videoMeta.fps = av_q2d(videoStream_->avg_frame_rate);
callback.videoDecodingStarted(videoMeta);
if (params.intervals_.size() == 0) {
LOG(ERROR) << "Empty sampling intervals.";
return;
}
std::vector<SampleInterval>::const_iterator itvlIter =
params.intervals_.begin();
if (itvlIter->timestamp != 0) {
LOG(ERROR) << "Sampling interval starting timestamp is not zero.";
return;
}
double currFps = itvlIter->fps;
if (currFps < 0 && currFps != SpecialFps::SAMPLE_ALL_FRAMES &&
currFps != SpecialFps::SAMPLE_TIMESTAMP_ONLY) {
// fps must be 0, -1, -2 or > 0
LOG(ERROR) << "Invalid sampling fps.";
return;
}
double prevTimestamp = itvlIter->timestamp;
itvlIter++;
if (itvlIter != params.intervals_.end() &&
prevTimestamp >= itvlIter->timestamp) {
LOG(ERROR) << "Sampling interval timestamps must be strictly ascending.";
return;
}
double lastFrameTimestamp = -1.0;
double timestamp = -1.0;
// Initialize frame and packet.
// These will be reused across calls.
videoStreamFrame_ = av_frame_alloc();
audioStreamFrame_ = av_frame_alloc();
// frame index in video stream
int frameIndex = -1;
// frame index of outputed frames
int outputFrameIndex = -1;
/* identify the starting point from where we must start decoding */
std::mt19937 meta_randgen(time(nullptr));
long int start_ts = -1;
bool mustDecodeAll = false;
if (videoStream_->duration > 0 && videoStream_->nb_frames > 0) {
/* we have a valid duration and nb_frames. We can safely
* detect an intermediate timestamp to start decoding from. */
// leave a margin of 10 frames to take in to account the error
// from av_seek_frame
long int margin =
int(ceil((10 * videoStream_->duration) / (videoStream_->nb_frames)));
// if we need to do temporal jittering
if (params.decode_type_ == DecodeType::DO_TMP_JITTER) {
/* estimate the average duration for the required # of frames */
double maxFramesDuration =
(videoStream_->duration * params.num_of_required_frame_) /
(videoStream_->nb_frames);
int ts1 = 0;
int ts2 = videoStream_->duration - int(ceil(maxFramesDuration));
ts2 = ts2 > 0 ? ts2 : 0;
// pick a random timestamp between ts1 and ts2. ts2 is selected such
// that you have enough frames to satisfy the required # of frames.
start_ts = std::uniform_int_distribution<>(ts1, ts2)(meta_randgen);
// seek a frame at start_ts
ret = av_seek_frame(
inputContext,
videoStreamIndex_,
0 > (start_ts - margin) ? 0 : (start_ts - margin),
AVSEEK_FLAG_BACKWARD);
// if we need to decode from the start_frm
} else if (params.decode_type_ == DecodeType::USE_START_FRM) {
if (videoStream_ == nullptr) {
LOG(ERROR) << "Nullptr found at videoStream_";
return;
}
start_ts = int(floor(
(videoStream_->duration * start_frm) / (videoStream_->nb_frames)));
// seek a frame at start_ts
ret = av_seek_frame(
inputContext,
videoStreamIndex_,
0 > (start_ts - margin) ? 0 : (start_ts - margin),
AVSEEK_FLAG_BACKWARD);
} else {
mustDecodeAll = true;
}
if (ret < 0) {
LOG(INFO) << "Unable to decode from a random start point";
/* fall back to default decoding of all frames from start */
av_seek_frame(inputContext, videoStreamIndex_, 0, AVSEEK_FLAG_BACKWARD);
mustDecodeAll = true;
}
} else {
mustDecodeAll = true;
}
int gotPicture = 0;
int eof = 0;
int selectiveDecodedFrames = 0;
int maxFrames = (params.decode_type_ == DecodeType::DO_UNIFORM_SMP)
? MAX_DECODING_FRAMES
: params.num_of_required_frame_;
// There is a delay between reading packets from the
// transport and getting decoded frames back.
// Therefore, after EOF, continue going while
// the decoder is still giving us frames.
while ((!eof || gotPicture) &&
/* either you must decode all frames or decode up to maxFrames
* based on status of the mustDecodeAll flag */
(mustDecodeAll || (selectiveDecodedFrames < maxFrames)) &&
/* If on the last interval and not autodecoding keyframes and a
* SpecialFps indicates no more frames are needed, stop decoding */
!((itvlIter == params.intervals_.end() &&
(currFps == SpecialFps::SAMPLE_TIMESTAMP_ONLY ||
currFps == SpecialFps::SAMPLE_NO_FRAME)) &&
!params.keyFrames_)) {
try {
if (!eof) {
ret = av_read_frame(inputContext, &packet);
if (ret == AVERROR_EOF) {
eof = 1;
av_free_packet(&packet);
packet.data = nullptr;
packet.size = 0;
// stay in the while loop to flush frames
} else if (ret == AVERROR(EAGAIN)) {
av_free_packet(&packet);
continue;
} else if (ret < 0) {
LOG(ERROR) << "Error reading packet : " << ffmpegErrorStr(ret);
return;
}
auto si = packet.stream_index;
if (params.getAudio_ && audioStreamIndex_ >= 0 &&
si == audioStreamIndex_) {
// Audio packets can have multiple audio frames in a single packet
while (packet.size > 0) {
assert(audioCodecContext_ != nullptr);
assert(convertCtx_ != nullptr);
getAudioSample(
packet,
audioCodecContext_,
audioStreamFrame_,
convertCtx_,
callback,
params);
}
}
if (si != videoStreamIndex_) {
av_free_packet(&packet);
continue;
}
}
ret = avcodec_decode_video2(
videoCodecContext_, videoStreamFrame_, &gotPicture, &packet);
if (ret < 0) {
LOG(ERROR) << "Error decoding video frame : " << ffmpegErrorStr(ret);
return;
}
try {
// Nothing to do without a picture
if (!gotPicture) {
av_free_packet(&packet);
continue;
}
frameIndex++;
long int frame_ts =
av_frame_get_best_effort_timestamp(videoStreamFrame_);
timestamp = frame_ts * av_q2d(videoStream_->time_base);
if ((frame_ts >= start_ts && !mustDecodeAll) || mustDecodeAll) {
/* process current frame if:
* 1) We are not doing selective decoding and mustDecodeAll
* OR
* 2) We are doing selective decoding and current frame
* timestamp is >= start_ts from where we start selective
* decoding*/
// if reaching the next interval, update the current fps
// and reset lastFrameTimestamp so the current frame could be
// sampled (unless fps == SpecialFps::SAMPLE_NO_FRAME)
if (itvlIter != params.intervals_.end() &&
timestamp >= itvlIter->timestamp) {
lastFrameTimestamp = -1.0;
currFps = itvlIter->fps;
prevTimestamp = itvlIter->timestamp;
itvlIter++;
if (itvlIter != params.intervals_.end() &&
prevTimestamp >= itvlIter->timestamp) {
LOG(ERROR)
<< "Sampling interval timestamps must be strictly ascending.";
return;
}
}
// keyFrame will bypass all checks on fps sampling settings
bool keyFrame = params.keyFrames_ && videoStreamFrame_->key_frame;
if (!keyFrame) {
// if fps == SpecialFps::SAMPLE_NO_FRAME (0), don't sample at all
if (currFps == SpecialFps::SAMPLE_NO_FRAME) {
av_free_packet(&packet);
continue;
}
// fps is considered reached in the following cases:
// 1. lastFrameTimestamp < 0 - start of a new interval
// (or first frame)
// 2. currFps == SpecialFps::SAMPLE_ALL_FRAMES (-1) - sample every
// frame
// 3. timestamp - lastFrameTimestamp has reached target fps and
// currFps > 0 (not special fps setting)
// different modes for fps:
// SpecialFps::SAMPLE_NO_FRAMES (0):
// disable fps sampling, no frame sampled at all
// SpecialFps::SAMPLE_ALL_FRAMES (-1):
// unlimited fps sampling, will sample at native video fps
// SpecialFps::SAMPLE_TIMESTAMP_ONLY (-2):
// disable fps sampling, but will get the frame at specific
// timestamp
// others (> 0): decoding at the specified fps
bool fpsReached = lastFrameTimestamp < 0 ||
currFps == SpecialFps::SAMPLE_ALL_FRAMES ||
(currFps > 0 &&
timestamp >= lastFrameTimestamp + (1 / currFps));
if (!fpsReached) {
av_free_packet(&packet);
continue;
}
}
lastFrameTimestamp = timestamp;
outputFrameIndex++;
if (params.maximumOutputFrames_ != -1 &&
outputFrameIndex >= params.maximumOutputFrames_) {
// enough frames
av_free_packet(&packet);
break;
}
AVFrame* rgbFrame = av_frame_alloc();
if (!rgbFrame) {
LOG(ERROR) << "Error allocating AVframe";
return;
}
try {
// Determine required buffer size and allocate buffer
int numBytes = avpicture_get_size(pixFormat, outWidth, outHeight);
DecodedFrame::AvDataPtr buffer(
(uint8_t*)av_malloc(numBytes * sizeof(uint8_t)));
int size = avpicture_fill(
(AVPicture*)rgbFrame,
buffer.get(),
pixFormat,
outWidth,
outHeight);
sws_scale(
scaleContext_,
videoStreamFrame_->data,
videoStreamFrame_->linesize,
0,
videoCodecContext_->height,
rgbFrame->data,
rgbFrame->linesize);
unique_ptr<DecodedFrame> frame = make_unique<DecodedFrame>();
frame->width_ = outWidth;
frame->height_ = outHeight;
frame->data_ = std::move(buffer);
frame->size_ = size;
frame->index_ = frameIndex;
frame->outputFrameIndex_ = outputFrameIndex;
frame->timestamp_ = timestamp;
frame->keyFrame_ = videoStreamFrame_->key_frame;
callback.frameDecoded(std::move(frame));
selectiveDecodedFrames++;
av_frame_free(&rgbFrame);
} catch (const std::exception&) {
av_frame_free(&rgbFrame);
}
}
av_frame_unref(videoStreamFrame_);
av_frame_unref(audioStreamFrame_);
} catch (const std::exception&) {
av_frame_unref(videoStreamFrame_);
av_frame_unref(audioStreamFrame_);
}
av_free_packet(&packet);
} catch (const std::exception&) {
av_free_packet(&packet);
}
} // of while loop
callback.videoDecodingEnded(timestamp);
// free all stuffs
sws_freeContext(scaleContext_);
swr_free(&convertCtx_);
av_packet_unref(&packet);
av_frame_free(&videoStreamFrame_);
av_frame_free(&audioStreamFrame_);
avcodec_close(videoCodecContext_);
if (audioCodecContext_ != nullptr) {
avcodec_close(audioCodecContext_);
}
avformat_close_input(&inputContext);
avformat_free_context(inputContext);
} catch (const std::exception&) {
// In case of decoding error
// free all stuffs
sws_freeContext(scaleContext_);
swr_free(&convertCtx_);
av_packet_unref(&packet);
av_frame_free(&videoStreamFrame_);
av_frame_free(&audioStreamFrame_);
avcodec_close(videoCodecContext_);
avcodec_close(audioCodecContext_);
avformat_close_input(&inputContext);
avformat_free_context(inputContext);
}
}
void VideoDecoder::decodeMemory(
const string& videoName,
const char* buffer,
const int size,
const Params& params,
const int start_frm,
Callback& callback) {
VideoIOContext ioctx(buffer, size);
decodeLoop(videoName, ioctx, params, start_frm, callback);
}
void VideoDecoder::decodeFile(
const string& file,
const Params& params,
const int start_frm,
Callback& callback) {
VideoIOContext ioctx(file);
decodeLoop(file, ioctx, params, start_frm, callback);
}
string VideoDecoder::ffmpegErrorStr(int result) {
std::array<char, 128> buf;
av_strerror(result, buf.data(), buf.size());
return string(buf.data());
}
void FreeDecodedData(
std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames,
std::vector<std::unique_ptr<DecodedAudio>>& sampledAudio) {
// free the sampledFrames and sampledAudio
for (int i = 0; i < sampledFrames.size(); i++) {
DecodedFrame* p = sampledFrames[i].release();
delete p;
}
for (int i = 0; i < sampledAudio.size(); i++) {
DecodedAudio* p = sampledAudio[i].release();
delete p;
}
sampledFrames.clear();
sampledAudio.clear();
}
bool DecodeMultipleClipsFromVideo(
const char* video_buffer,
const std::string& video_filename,
const int encoded_size,
const Params& params,
const int start_frm,
const int clip_per_video,
const std::vector<int>& clip_start_positions,
const bool use_local_file,
int& height,
int& width,
std::vector<unsigned char*>& buffer_rgb) {
std::vector<std::unique_ptr<DecodedFrame>> sampledFrames;
std::vector<std::unique_ptr<DecodedAudio>> sampledAudio;
VideoDecoder decoder;
CallbackImpl callback;
// decoding from buffer or file
if (!use_local_file) {
decoder.decodeMemory(
string("Memory Buffer"),
video_buffer,
encoded_size,
params,
start_frm,
callback);
} else {
decoder.decodeFile(video_filename, params, start_frm, callback);
}
for (auto& frame : callback.frames) {
sampledFrames.push_back(std::move(frame));
}
for (auto& audio_sample : callback.audio_samples) {
sampledAudio.push_back(std::move(audio_sample));
}
for (int i = 0; i < buffer_rgb.size(); i++) {
unsigned char* buff = buffer_rgb[i];
delete[] buff;
}
buffer_rgb.clear();
if (sampledFrames.size() < params.num_of_required_frame_) {
LOG(ERROR)
<< "The video seems faulty and we could not decode enough frames: "
<< sampledFrames.size() << " VS " << params.num_of_required_frame_;
FreeDecodedData(sampledFrames, sampledAudio);
return true;
}
if (sampledFrames.size() == 0) {
LOG(ERROR) << "The samples frames have size 0, no frame to process";
FreeDecodedData(sampledFrames, sampledAudio);
return true;
}
height = sampledFrames[0]->height_;
width = sampledFrames[0]->width_;
float sample_stepsz = (clip_per_video <= 1)
? 0
: (float(sampledFrames.size() - params.num_of_required_frame_) /
(clip_per_video - 1));
int image_size = 3 * height * width;
int clip_size = params.num_of_required_frame_ * image_size;
// get the RGB frames for each clip
if (clip_start_positions.size() > 0) {
for (int i = 0; i < clip_start_positions.size(); i++) {
unsigned char* buffer_rgb_ptr = new unsigned char[clip_size];
int clip_start = clip_start_positions[i];
for (int j = 0; j < params.num_of_required_frame_; j++) {
memcpy(
buffer_rgb_ptr + j * image_size,
(unsigned char*)sampledFrames[j + clip_start]->data_.get(),
image_size * sizeof(unsigned char));
}
buffer_rgb.push_back(buffer_rgb_ptr);
}
} else {
for (int i = 0; i < clip_per_video; i++) {
unsigned char* buffer_rgb_ptr = new unsigned char[clip_size];
int clip_start = floor(i * sample_stepsz);
for (int j = 0; j < params.num_of_required_frame_; j++) {
memcpy(
buffer_rgb_ptr + j * image_size,
(unsigned char*)sampledFrames[j + clip_start]->data_.get(),
image_size * sizeof(unsigned char));
}
buffer_rgb.push_back(buffer_rgb_ptr);
}
}
FreeDecodedData(sampledFrames, sampledAudio);
return true;
}
} // namespace caffe2


@@ -1,525 +0,0 @@
#ifndef CAFFE2_VIDEO_VIDEO_DECODER_H_
#define CAFFE2_VIDEO_VIDEO_DECODER_H_
#include <caffe2/core/logging.h>
#include <stdio.h>
#include <memory>
#include <string>
#include <vector>
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavformat/avio.h>
#include <libavutil/log.h>
#include <libavutil/motion_vector.h>
#include <libswresample/swresample.h>
#include <libswscale/swscale.h>
}
namespace caffe2 {
#define VIO_BUFFER_SZ 32768
#define MAX_DECODING_FRAMES 10000
// enum to specify 3 special fps sampling behaviors:
// 0: disable fps sampling, no frame sampled at all
// -1: unlimited fps sampling, will sample at native video fps
// -2: disable fps sampling, but will get the frame at specific timestamp
enum SpecialFps {
SAMPLE_NO_FRAME = 0,
SAMPLE_ALL_FRAMES = -1,
SAMPLE_TIMESTAMP_ONLY = -2,
};
// three different types of resolution when decoding the video
// 0: resize to width x height and ignore the aspect ratio;
// 1: resize to short_edge and keep the aspect ratio;
// 2: using the original resolution of the video; if resolution
// is smaller than crop_size x crop_size, resize to crop_size
// and keep the aspect ratio;
// 3: for xray video service
enum VideoResType {
USE_WIDTH_HEIGHT = 0,
USE_SHORT_EDGE = 1,
ORIGINAL_RES = 2,
};
// three different types of decoding behavior are supported
// 0: do temporal jittering to sample a random clip from the video
// 1: uniformly sample multiple clips from the video;
// 2: sample a clip from a given starting frame
// 3: for xray video service
enum DecodeType {
DO_TMP_JITTER = 0,
DO_UNIFORM_SMP = 1,
USE_START_FRM = 2,
};
// sampling interval for fps starting at specified timestamp
// use enum SpecialFps to set special fps decoding behavior
// note sampled fps will not always accurately follow the target fps,
// because sampled frame has to snap to actual frame timestamp,
// e.g. video fps = 25, sample fps = 4 will sample every 0.28s, not 0.25
// video fps = 25, sample fps = 5 will sample every 0.24s, not 0.2,
// because of floating-point division accuracy (1 / 5.0 is not exactly 0.2)
struct SampleInterval {
double timestamp;
double fps;
SampleInterval() : timestamp(-1), fps(SpecialFps::SAMPLE_ALL_FRAMES) {}
SampleInterval(double ts, double f) : timestamp(ts), fps(f) {}
bool operator<(const SampleInterval& itvl) const {
return (timestamp < itvl.timestamp);
}
};
class Params {
public:
// return all key-frames regardless of specified fps
bool keyFrames_ = false;
// return audio data while decoding the video
bool getAudio_ = false;
// for sampling audio data
int outrate_ = 22000;
int outfmt_ = AV_SAMPLE_FMT_FLT;
int64_t outlayout_ = AV_CH_LAYOUT_MONO;
// Output image pixel format
AVPixelFormat pixelFormat_ = AVPixelFormat::AV_PIX_FMT_RGB24;
// Index of stream to decode.
// -1 will automatically decode the first video stream.
int streamIndex_ = -1;
// How many frames to output at most from the video
// -1 no limit
int maximumOutputFrames_ = -1;
// params for video resolution
int video_res_type_ = VideoResType::USE_WIDTH_HEIGHT;
int crop_size_ = -1;
int short_edge_ = -1;
// Output video size, -1 to preserve origianl dimension
int outputWidth_ = -1;
int outputHeight_ = -1;
// max output dimension, -1 to preserve original size
// the larger dimension of the video will be scaled to this size,
// and the second dimension will be scaled to preserve aspect ratio
int maxOutputDimension_ = -1;
// params for decoding behavior
int decode_type_ = DecodeType::DO_TMP_JITTER;
int num_of_required_frame_ = -1;
// intervals_ control variable sampling fps between different timestamps
// intervals_ must be ordered strictly ascending by timestamps
// the first interval must have a timestamp of zero
// fps must be either the 3 special fps defined in SpecialFps, or > 0
std::vector<SampleInterval> intervals_ = {{0, SpecialFps::SAMPLE_ALL_FRAMES}};
Params() {}
/**
* FPS of output frames
* setting here will reset intervals_ and force decoding at target FPS
* This can be used if user just want to decode at a steady fps
*/
Params& fps(float v) {
intervals_.clear();
intervals_.emplace_back(0, v);
return *this;
}
/**
* Sample output frames at a specified list of timestamps
* Timestamps must be in increasing order, and timestamps past the end of the
* video will be ignored
* Setting here will reset intervals_
*/
Params& setSampleTimestamps(const std::vector<double>& timestamps) {
intervals_.clear();
// insert an interval per desired frame.
for (auto& timestamp : timestamps) {
intervals_.emplace_back(timestamp, SpecialFps::SAMPLE_TIMESTAMP_ONLY);
}
return *this;
}
/**
* Pixel format of output buffer, default PIX_FMT_RGB24
*/
Params& pixelFormat(AVPixelFormat pixelFormat) {
pixelFormat_ = pixelFormat;
return *this;
}
/**
* Return all key-frames
*/
Params& keyFrames(bool keyFrames) {
keyFrames_ = keyFrames;
return *this;
}
/**
* Index of video stream to process, defaults to the first video stream
*/
Params& streamIndex(int index) {
streamIndex_ = index;
return *this;
}
/**
* Only output this many frames, default to no limit
*/
Params& maxOutputFrames(int count) {
maximumOutputFrames_ = count;
return *this;
}
/**
* Output frame width, default to video width
*/
Params& outputWidth(int width) {
outputWidth_ = width;
return *this;
}
/**
* Output frame height, default to video height
*/
Params& outputHeight(int height) {
outputHeight_ = height;
return *this;
}
/**
* Max dimension of either width or height, if any is bigger
* it will be scaled down to this and econd dimension
* will be scaled down to maintain aspect ratio.
*/
Params& maxOutputDimension(int size) {
maxOutputDimension_ = size;
return *this;
}
};
// data structure for storing decoded video frames
class DecodedFrame {
public:
struct avDeleter {
void operator()(unsigned char* p) const {
av_free(p);
}
};
using AvDataPtr = std::unique_ptr<uint8_t, avDeleter>;
// decoded data buffer
AvDataPtr data_;
// size in bytes
int size_ = 0;
// frame dimensions
int width_ = 0;
int height_ = 0;
// timestamp in seconds since beginning of video
double timestamp_ = 0;
// true if this is a key frame.
bool keyFrame_ = false;
// index of frame in video
int index_ = -1;
// Sequential number of outputted frame
int outputFrameIndex_ = -1;
};
// data structure for storing decoded audio data
struct DecodedAudio {
int dataSize_;
int outSampleSize_;
std::unique_ptr<float[]> audio_data_;
explicit DecodedAudio(
int dataSize = 0,
int outSampleSize = 0,
std::unique_ptr<float[]> audio_data = nullptr)
: dataSize_(dataSize),
outSampleSize_(outSampleSize),
audio_data_(std::move(audio_data)) {}
};
class VideoIOContext {
public:
explicit VideoIOContext(const std::string& fname)
: workBuffersize_(VIO_BUFFER_SZ),
workBuffer_((uint8_t*)av_malloc(workBuffersize_)),
inputFile_(nullptr),
inputBuffer_(nullptr),
inputBufferSize_(0) {
inputFile_ = fopen(fname.c_str(), "rb");
if (inputFile_ == nullptr) {
LOG(ERROR) << "Error opening video file " << fname;
return;
}
ctx_ = avio_alloc_context(
static_cast<unsigned char*>(workBuffer_.get()),
workBuffersize_,
0,
this,
&VideoIOContext::readFile,
nullptr, // no write function
&VideoIOContext::seekFile);
}
explicit VideoIOContext(const char* buffer, int size)
: workBuffersize_(VIO_BUFFER_SZ),
workBuffer_((uint8_t*)av_malloc(workBuffersize_)),
inputFile_(nullptr),
inputBuffer_(buffer),
inputBufferSize_(size) {
ctx_ = avio_alloc_context(
static_cast<unsigned char*>(workBuffer_.get()),
workBuffersize_,
0,
this,
&VideoIOContext::readMemory,
nullptr, // no write function
&VideoIOContext::seekMemory);
}
~VideoIOContext() {
av_free(ctx_);
if (inputFile_) {
fclose(inputFile_);
}
}
int read(unsigned char* buf, int buf_size) {
if (inputBuffer_) {
return readMemory(this, buf, buf_size);
} else if (inputFile_) {
return readFile(this, buf, buf_size);
} else {
return -1;
}
}
int64_t seek(int64_t offset, int whence) {
if (inputBuffer_) {
return seekMemory(this, offset, whence);
} else if (inputFile_) {
return seekFile(this, offset, whence);
} else {
return -1;
}
}
static int readFile(void* opaque, unsigned char* buf, int buf_size) {
VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
if (feof(h->inputFile_)) {
return AVERROR_EOF;
}
size_t ret = fread(buf, 1, buf_size, h->inputFile_);
if (ret < buf_size) {
if (ferror(h->inputFile_)) {
return -1;
}
}
return ret;
}
static int64_t seekFile(void* opaque, int64_t offset, int whence) {
VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
switch (whence) {
case SEEK_CUR: // from current position
case SEEK_END: // from eof
case SEEK_SET: // from beginning of file
return fseek(h->inputFile_, static_cast<long>(offset), whence);
break;
case AVSEEK_SIZE:
int64_t cur = ftell(h->inputFile_);
fseek(h->inputFile_, 0L, SEEK_END);
int64_t size = ftell(h->inputFile_);
fseek(h->inputFile_, cur, SEEK_SET);
return size;
}
return -1;
}
static int readMemory(void* opaque, unsigned char* buf, int buf_size) {
VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
if (buf_size < 0) {
return -1;
}
int reminder = h->inputBufferSize_ - h->offset_;
int r = buf_size < reminder ? buf_size : reminder;
if (r < 0) {
return AVERROR_EOF;
}
memcpy(buf, h->inputBuffer_ + h->offset_, r);
h->offset_ += r;
return r;
}
static int64_t seekMemory(void* opaque, int64_t offset, int whence) {
VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
switch (whence) {
case SEEK_CUR: // from current position
h->offset_ += offset;
break;
case SEEK_END: // from eof
h->offset_ = h->inputBufferSize_ + offset;
break;
case SEEK_SET: // from beginning of file
h->offset_ = offset;
break;
case AVSEEK_SIZE:
return h->inputBufferSize_;
}
return h->offset_;
}
AVIOContext* get_avio() {
return ctx_;
}
private:
int workBuffersize_;
DecodedFrame::AvDataPtr workBuffer_;
// for file mode
FILE* inputFile_;
// for memory mode
const char* inputBuffer_;
int inputBufferSize_;
int offset_ = 0;
AVIOContext* ctx_;
};
struct VideoMeta {
double fps;
int width;
int height;
enum AVMediaType codec_type;
AVPixelFormat pixFormat;
VideoMeta()
: fps(-1),
width(-1),
height(-1),
codec_type(AVMEDIA_TYPE_VIDEO),
pixFormat(AVPixelFormat::AV_PIX_FMT_RGB24) {}
};
class Callback {
public:
virtual void frameDecoded(std::unique_ptr<DecodedFrame> img) = 0;
virtual void audioDecoded(
std::unique_ptr<DecodedAudio> /*decoded audio data*/) {}
virtual void videoDecodingStarted(const VideoMeta& /*videoMeta*/) {}
virtual void videoDecodingEnded(double /*lastFrameTimestamp*/) {}
virtual ~Callback() {}
};
class VideoDecoder {
public:
VideoDecoder();
void decodeFile(
const std::string& filename,
const Params& params,
const int start_frm,
Callback& callback);
void decodeMemory(
const std::string& filename,
const char* buffer,
const int size,
const Params& params,
const int start_frm,
Callback& callback);
private:
std::string ffmpegErrorStr(int result);
void ResizeAndKeepAspectRatio(
const int origWidth,
const int origHeight,
const int short_edge,
const int long_edge,
int& outWidth,
int& outHeight);
void getAudioSample(
AVPacket& packet,
AVCodecContext* audioCodecContext_,
AVFrame* audioStreamFrame_,
SwrContext* convertCtx_,
Callback& callback,
const Params& params);
void decodeLoop(
const std::string& videoName,
VideoIOContext& ioctx,
const Params& params,
const int start_frm,
Callback& callback);
};
TORCH_API void FreeDecodedData(
std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames,
std::vector<std::unique_ptr<DecodedAudio>>& sampledAudio);
TORCH_API bool DecodeMultipleClipsFromVideo(
const char* video_buffer,
const std::string& video_filename,
const int encoded_size,
const Params& params,
const int start_frm,
const int clip_per_video,
const std::vector<int>& clip_start_positions,
const bool use_local_file,
int& height,
int& width,
std::vector<unsigned char*>& buffer_rgb);
class CallbackImpl : public Callback {
public:
std::vector<std::unique_ptr<DecodedFrame>> frames;
std::vector<std::unique_ptr<DecodedAudio>> audio_samples;
explicit CallbackImpl() {
clear();
}
void clear() {
FreeDecodedData(frames, audio_samples);
}
void frameDecoded(std::unique_ptr<DecodedFrame> frame) override {
frames.push_back(std::move(frame));
}
void audioDecoded(std::unique_ptr<DecodedAudio> audio_sample) override {
audio_samples.push_back(std::move(audio_sample));
}
void videoDecodingStarted(const VideoMeta& /*videoMeta*/) override {
clear();
}
};
} // namespace caffe2
#endif // CAFFE2_VIDEO_VIDEO_DECODER_H_


@@ -1,93 +0,0 @@
#include <caffe2/video/video_input_op.h>
namespace caffe2 {
REGISTER_CPU_OPERATOR(VideoInput, VideoInputOp<CPUContext>);
OPERATOR_SCHEMA(VideoInput)
.NumInputs(0, 1)
.NumOutputs(2, 5)
.TensorInferenceFunction(
[](const OperatorDef& def,
const vector<TensorShape>& /* unused */ /*in*/) {
ArgumentHelper helper(def);
int batch_size = helper.GetSingleArgument<int>("batch_size", 0);
int clip_per_video =
helper.GetSingleArgument<int>("clip_per_video", 1);
int crop_size = helper.GetSingleArgument<int>("crop_size", -1);
int length_rgb = helper.GetSingleArgument<int>("length_rgb", 0);
int channels_rgb = helper.GetSingleArgument<int>("channels_rgb", 3);
int length_of = helper.GetSingleArgument<int>("length_of", 0);
int channels_of = helper.GetSingleArgument<int>("channels_of", 2);
// get the flags
bool get_rgb = helper.GetSingleArgument<bool>("get_rgb", true);
bool get_optical_flow =
helper.GetSingleArgument<bool>("get_optical_flow", false);
bool do_multi_label =
helper.GetSingleArgument<bool>("do_multi_label", false);
bool get_video_id =
helper.GetSingleArgument<bool>("get_video_id", false);
bool get_start_frame =
helper.GetSingleArgument<bool>("get_start_frame", false);
// get starting positions if available
vector<int> clip_start_positions =
helper.GetRepeatedArgument<int>("clip_start_positions", {});
// In case clip_start_positions are given, set the clip_per_video arg
if (clip_start_positions.size() > 0) {
clip_per_video = clip_start_positions.size();
}
int output_size = 1;
if (get_rgb) {
output_size++;
}
if (get_optical_flow) {
output_size++;
}
if (get_video_id) {
output_size++;
}
if (get_start_frame) {
output_size++;
}
int index = 0;
vector<TensorShape> out(output_size);
TORCH_CHECK_GT(crop_size, 0);
batch_size *= clip_per_video;
if (get_rgb) {
out[index++] = CreateTensorShape(
vector<int>{
batch_size, channels_rgb, length_rgb, crop_size, crop_size},
TensorProto::FLOAT);
}
if (get_optical_flow) {
out[index++] = CreateTensorShape(
vector<int>{
batch_size, channels_of, length_of, crop_size, crop_size},
TensorProto::FLOAT);
}
if (!do_multi_label) {
out[index++] = CreateTensorShape(
vector<int>{1, batch_size}, TensorProto::INT32);
} else {
int num_of_class = helper.GetSingleArgument<int>("num_of_class", 0);
out[index++] = CreateTensorShape(
vector<int>{batch_size, num_of_class}, TensorProto::INT32);
}
if (get_video_id) {
out[index++] = CreateTensorShape(
vector<int64_t>{1, batch_size}, TensorProto::INT64);
}
if (get_start_frame) {
out[index] = CreateTensorShape(
vector<int>{1, batch_size}, TensorProto::INT32);
}
return out;
});
NO_GRADIENT(VideoInput);
} // namespace caffe2

File diff suppressed because it is too large.


@@ -1,9 +0,0 @@
#include <caffe2/core/common_gpu.h>
#include <caffe2/core/context_gpu.h>
#include <caffe2/video/video_input_op.h>
namespace caffe2 {
REGISTER_CUDA_OPERATOR(VideoInput, VideoInputOp<CUDAContext>);
} // namespace caffe2


@@ -1,210 +0,0 @@
#include <caffe2/core/logging.h>
#include <caffe2/video/video_io.h>
#include <algorithm>
#include <random>
#include <string>
namespace caffe2 {
void ClipTransformRGB(
const unsigned char* buffer_rgb,
const int crop_size,
const int length_rgb,
const int channels_rgb,
const int sampling_rate_rgb,
const int height,
const int width,
const int h_off,
const int w_off,
const bool mirror_me,
const std::vector<float>& mean_rgb,
const std::vector<float>& inv_std_rgb,
float* transformed_clip) {
// The order of output dimensions is C, L, H, W
int orig_index, tran_index;
for (int c = 0; c < channels_rgb; ++c) {
for (int l = 0; l < length_rgb; ++l) {
int orig_index_l = l * sampling_rate_rgb * height * width * channels_rgb;
int tran_index_l = (c * length_rgb + l) * crop_size;
for (int h = 0; h < crop_size; ++h) {
int orig_index_h = orig_index_l + (h + h_off) * width * channels_rgb;
int tran_index_h = (tran_index_l + h) * crop_size;
for (int w = 0; w < crop_size; ++w) {
orig_index = orig_index_h + (w + w_off) * channels_rgb + c;
// mirror the frame
if (mirror_me) {
tran_index = tran_index_h + (crop_size - 1 - w);
} else {
tran_index = tran_index_h + w;
}
// normalize and transform the clip
transformed_clip[tran_index] =
(buffer_rgb[orig_index] - mean_rgb[c]) * inv_std_rgb[c];
}
}
}
}
}
void ClipTransformOpticalFlow(
const unsigned char* buffer_rgb,
const int crop_size,
const int length_of,
const int channels_of,
const int sampling_rate_of,
const int height,
const int width,
const cv::Rect& rect,
const int channels_rgb,
const bool mirror_me,
const int flow_alg_type,
const int flow_data_type,
const int frame_gap_of,
const bool do_flow_aggregation,
const std::vector<float>& mean_of,
const std::vector<float>& inv_std_of,
float* transformed_clip) {
const int frame_size = crop_size * crop_size;
const int channel_size_flow = length_of * frame_size;
  // for getting the mean and std of the input data
bool extract_statistics = false;
static std::vector<double> mean_static(channels_of, 0.f);
static std::vector<double> std_static(channels_of, 0.f);
static long long count = 0;
cv::Scalar mean_img, std_img;
for (int l = 0; l < length_of; l++) {
// get the grayscale frames
std::vector<cv::Mat> grays, rgbs;
int step_size = do_flow_aggregation ? 1 : frame_gap_of;
for (int j = 0; j <= frame_gap_of; j += step_size) {
// get the current frame
const unsigned char* curr_frame = buffer_rgb +
(l * sampling_rate_of + j) * height * width * channels_rgb;
cv::Mat img = cv::Mat::zeros(height, width, CV_8UC3);
memcpy(
img.data,
curr_frame,
height * width * channels_rgb * sizeof(unsigned char));
// crop and mirror the frame
cv::Mat img_cropped = img(rect);
if (mirror_me) {
cv::flip(img_cropped, img_cropped, 1);
}
cv::Mat gray;
cv::cvtColor(img_cropped, gray, cv::COLOR_RGB2GRAY);
grays.push_back(gray);
rgbs.push_back(img_cropped);
}
cv::Mat first_gray, first_rgb;
cv::Mat flow = cv::Mat::zeros(crop_size, crop_size, CV_32FC2);
MultiFrameOpticalFlowExtractor(grays, flow_alg_type, flow);
std::vector<cv::Mat> imgs;
cv::split(flow, imgs);
// save the 2-channel optical flow first
int c = 0;
for (; c < 2; c++) {
if (extract_statistics) {
cv::meanStdDev(imgs[c], mean_img, std_img);
mean_static[c] += mean_img[0];
std_static[c] += std_img[0];
}
imgs[c] -= mean_of[c];
imgs[c] *= inv_std_of[c];
memcpy(
transformed_clip + c * channel_size_flow + l * frame_size,
imgs[c].data,
frame_size * sizeof(float));
}
cv::Mat mag;
std::vector<cv::Mat> chans;
// augment the optical flow with more channels
switch (flow_data_type) {
case FlowDataType::Flow2C:
// nothing to do if we only need two channels
break;
case FlowDataType::Flow3C:
// use magnitude as the third channel
mag = cv::abs(imgs[0]) + cv::abs(imgs[1]);
if (extract_statistics) {
cv::meanStdDev(mag, mean_img, std_img);
mean_static[c] += mean_img[0];
std_static[c] += std_img[0];
}
mag -= mean_of[c];
mag *= inv_std_of[c];
memcpy(
transformed_clip + c * channel_size_flow + l * frame_size,
mag.data,
frame_size * sizeof(float));
break;
case FlowDataType::FlowWithGray:
// add grayscale image as the third channel
grays[0].convertTo(first_gray, CV_32FC1);
if (extract_statistics) {
cv::meanStdDev(first_gray, mean_img, std_img);
mean_static[c] += mean_img[0];
std_static[c] += std_img[0];
}
first_gray -= mean_of[c];
first_gray *= inv_std_of[c];
memcpy(
transformed_clip + c * channel_size_flow + l * frame_size,
first_gray.data,
frame_size * sizeof(float));
break;
case FlowDataType::FlowWithRGB:
// add all three rgb channels
rgbs[0].convertTo(first_rgb, CV_32FC3);
cv::split(first_rgb, chans);
for (; c < channels_of; c++) {
if (extract_statistics) {
cv::meanStdDev(chans[c - 2], mean_img, std_img);
mean_static[c] += mean_img[0];
std_static[c] += std_img[0];
}
chans[c - 2] -= mean_of[c];
chans[c - 2] *= inv_std_of[c];
memcpy(
transformed_clip + c * channel_size_flow + l * frame_size,
chans[c - 2].data,
frame_size * sizeof(float));
}
break;
default:
LOG(ERROR) << "Unsupported optical flow data type " << flow_data_type;
break;
}
if (extract_statistics) {
count++;
if (count % 1000 == 1) {
for (int i = 0; i < channels_of; i++) {
LOG(INFO) << i
<< "-th channel mean: " << mean_static[i] / float(count)
<< " std: " << std_static[i] / float(count);
}
}
}
}
}
} // namespace caffe2
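Removing ClipTransformRGB above drops the interleaved-HWC-uint8 to planar-CLHW-float clip conversion (crop, optional horizontal mirror, per-channel normalization). A minimal self-contained sketch of that index mapping, under the simplifying assumptions that sampling_rate is 1 and h_off = w_off = 0 (i.e. the crop covers the full frame):

```cpp
// Sketch of the removed layout transform: interleaved HWC uint8 frames ->
// planar CLHW float clip, with optional width mirroring and per-channel
// (x - mean) * inv_std normalization. Simplified: no temporal subsampling,
// no spatial crop offset.
#include <cstdint>
#include <vector>

void HWCFramesToCLHWClip(
    const std::vector<uint8_t>& frames,  // length * height * width * channels
    int length, int height, int width, int channels,
    const std::vector<float>& mean, const std::vector<float>& inv_std,
    bool mirror, std::vector<float>& clip) {
  clip.assign(static_cast<size_t>(channels) * length * height * width, 0.f);
  for (int c = 0; c < channels; ++c) {
    for (int l = 0; l < length; ++l) {
      for (int h = 0; h < height; ++h) {
        for (int w = 0; w < width; ++w) {
          const int src = ((l * height + h) * width + w) * channels + c;
          const int dst_w = mirror ? (width - 1 - w) : w;
          const int dst = ((c * length + l) * height + h) * width + dst_w;
          clip[dst] = (frames[src] - mean[c]) * inv_std[c];
        }
      }
    }
  }
}
```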

View File

@ -1,51 +0,0 @@
#ifndef CAFFE2_VIDEO_VIDEO_IO_H_
#define CAFFE2_VIDEO_VIDEO_IO_H_
#include <caffe2/core/common.h>
#include <caffe2/video/optical_flow.h>
#include <caffe2/video/video_decoder.h>
#include <opencv2/opencv.hpp>
#include <random>
#include <istream>
#include <ostream>
namespace caffe2 {
TORCH_API void ClipTransformRGB(
const unsigned char* buffer_rgb,
const int crop_size,
const int length_rgb,
const int channels_rgb,
const int sampling_rate_rgb,
const int height,
const int width,
const int h_off,
const int w_off,
const bool mirror_me,
const std::vector<float>& mean_rgb,
const std::vector<float>& inv_std_rgb,
float* transformed_clip);
TORCH_API void ClipTransformOpticalFlow(
const unsigned char* buffer_rgb,
const int crop_size,
const int length_of,
const int channels_of,
const int sampling_rate_of,
const int height,
const int width,
const cv::Rect& rect,
const int channels_rgb,
const bool mirror_me,
const int flow_alg_type,
const int flow_data_type,
const int frame_gap_of,
const bool do_flow_aggregation,
const std::vector<float>& mean_of,
const std::vector<float>& inv_std_of,
float* transformed_clip);
} // namespace caffe2
#endif // CAFFE2_VIDEO_VIDEO_IO_H_
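Because the header above is deleted, any remaining callers of ClipTransformRGB must be migrated or removed. For context, a hypothetical call matching the removed declaration (the include no longer exists after this change, and all sizes and normalization constants below are made up for illustration):

```cpp
// Hypothetical caller of the now-removed caffe2::ClipTransformRGB.
#include <caffe2/video/video_io.h>  // removed by this PR
#include <vector>

int main() {
  const int length = 8, height = 240, width = 320, channels = 3, crop = 112;
  std::vector<unsigned char> buffer_rgb(length * height * width * channels, 0);
  std::vector<float> clip(channels * length * crop * crop);
  const std::vector<float> mean = {110.2f, 100.6f, 95.9f};               // assumed values
  const std::vector<float> inv_std = {1 / 58.4f, 1 / 57.1f, 1 / 57.4f};  // assumed values
  caffe2::ClipTransformRGB(
      buffer_rgb.data(), crop, length, channels,
      /*sampling_rate_rgb=*/1, height, width,
      /*h_off=*/64, /*w_off=*/104, /*mirror_me=*/false,
      mean, inv_std, clip.data());
  return 0;
}
```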

View File

@ -932,45 +932,6 @@ if(USE_REDIS)
endif()
endif()
# ---[ OpenCV
if(USE_OPENCV)
# OpenCV 4
find_package(OpenCV 4 QUIET COMPONENTS core highgui imgproc imgcodecs optflow videoio video)
if(NOT OpenCV_FOUND)
# OpenCV 3
find_package(OpenCV 3 QUIET COMPONENTS core highgui imgproc imgcodecs videoio video)
if(NOT OpenCV_FOUND)
# OpenCV 2
find_package(OpenCV QUIET COMPONENTS core highgui imgproc)
endif()
endif()
if(OpenCV_FOUND)
include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS})
list(APPEND Caffe2_DEPENDENCY_LIBS ${OpenCV_LIBS})
if(MSVC AND USE_CUDA)
list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${OpenCV_LIBS})
endif()
message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})")
else()
message(WARNING "Not compiling with OpenCV. Suppress this warning with -DUSE_OPENCV=OFF")
caffe2_update_option(USE_OPENCV OFF)
endif()
endif()
# ---[ FFMPEG
if(USE_FFMPEG)
find_package(FFmpeg REQUIRED)
if(FFMPEG_FOUND)
message("Found FFMPEG/LibAV libraries")
include_directories(SYSTEM ${FFMPEG_INCLUDE_DIR})
list(APPEND Caffe2_DEPENDENCY_LIBS ${FFMPEG_LIBRARIES})
else()
message("Not compiling with FFmpeg. Suppress this warning with -DUSE_FFMPEG=OFF")
caffe2_update_option(USE_FFMPEG OFF)
endif()
endif()
if(USE_ITT)
find_package(ITT)
if(ITT_FOUND)

View File

@ -1,71 +0,0 @@
# - Try to find ffmpeg libraries
# (libavcodec, libavformat, libavutil, libswscale)
# Once done this will define
#
# FFMPEG_FOUND - system has ffmpeg or libav
# FFMPEG_INCLUDE_DIR - the ffmpeg include directory
# FFMPEG_LIBRARIES - Link these to use ffmpeg
#
if (FFMPEG_LIBRARIES AND FFMPEG_INCLUDE_DIR)
# in cache already
set(FFMPEG_FOUND TRUE)
else (FFMPEG_LIBRARIES AND FFMPEG_INCLUDE_DIR)
find_path(FFMPEG_AVCODEC_INCLUDE_DIR
NAMES libavcodec/avcodec.h
PATHS ${_FFMPEG_AVCODEC_INCLUDE_DIRS} /usr/include /usr/local/include /opt/local/include /sw/include
PATH_SUFFIXES ffmpeg libav
)
find_library(FFMPEG_LIBAVCODEC
NAMES avcodec
PATHS ${_FFMPEG_AVCODEC_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
)
find_library(FFMPEG_LIBAVFORMAT
NAMES avformat
PATHS ${_FFMPEG_AVFORMAT_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
)
find_library(FFMPEG_LIBAVUTIL
NAMES avutil
PATHS ${_FFMPEG_AVUTIL_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
)
find_library(FFMPEG_LIBSWSCALE
NAMES swscale
PATHS ${_FFMPEG_SWSCALE_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
)
find_library(FFMPEG_LIBSWRESAMPLE
NAMES swresample
PATHS ${_FFMPEG_SWSCALE_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
)
if (FFMPEG_LIBAVCODEC AND FFMPEG_LIBAVFORMAT)
set(FFMPEG_FOUND TRUE)
endif()
if (FFMPEG_FOUND)
set(FFMPEG_INCLUDE_DIR ${FFMPEG_AVCODEC_INCLUDE_DIR})
set(FFMPEG_LIBRARIES
${FFMPEG_LIBAVCODEC}
${FFMPEG_LIBAVFORMAT}
${FFMPEG_LIBAVUTIL}
${FFMPEG_LIBSWSCALE}
${FFMPEG_LIBSWRESAMPLE}
)
if (NOT FFMPEG_FIND_QUIETLY)
message(STATUS "Found FFMPEG or Libav: ${FFMPEG_LIBRARIES}, ${FFMPEG_INCLUDE_DIR}")
endif (NOT FFMPEG_FIND_QUIETLY)
else (FFMPEG_FOUND)
if (FFMPEG_FIND_REQUIRED)
message(FATAL_ERROR "Could not find libavcodec or libavformat or libavutil")
endif (FFMPEG_FIND_REQUIRED)
endif (FFMPEG_FOUND)
endif (FFMPEG_LIBRARIES AND FFMPEG_INCLUDE_DIR)

View File

@ -128,7 +128,6 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_FBGEMM : ${USE_FBGEMM}")
message(STATUS " USE_FAKELOWP : ${USE_FAKELOWP}")
message(STATUS " USE_KINETO : ${USE_KINETO}")
message(STATUS " USE_FFMPEG : ${USE_FFMPEG}")
message(STATUS " USE_GFLAGS : ${USE_GFLAGS}")
message(STATUS " USE_GLOG : ${USE_GLOG}")
message(STATUS " USE_LEVELDB : ${USE_LEVELDB}")
@ -164,10 +163,6 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_NUMPY : ${USE_NUMPY}")
message(STATUS " USE_OBSERVERS : ${USE_OBSERVERS}")
message(STATUS " USE_OPENCL : ${USE_OPENCL}")
message(STATUS " USE_OPENCV : ${USE_OPENCV}")
if(${USE_OPENCV})
message(STATUS " OpenCV version : ${OpenCV_VERSION}")
endif()
message(STATUS " USE_OPENMP : ${USE_OPENMP}")
message(STATUS " USE_TBB : ${USE_TBB}")
if(${USE_TBB})