Remove caffe2 image and video (#125045)

This PR splits a smaller piece out of https://github.com/pytorch/pytorch/pull/122527: the Caffe2 image and video folders are removed, along with the related CMake code.
Note that this work was inspired by, and co-developed with, @r-barnes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125045
Approved by: https://github.com/eqy, https://github.com/albanD
Author: cyy
Date: 2024-04-30 17:31:57 +00:00
Committed by: PyTorch MergeBot
Parent: a03b9a2189
Commit: 04c6424fbf
21 changed files with 0 additions and 4762 deletions


@@ -228,7 +228,6 @@ option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
option(USE_KINETO "Use Kineto profiling library" ON)
option(USE_CUPTI_SO "Use CUPTI as a shared library" ON)
option(USE_FAKELOWP "Use FakeLowp operators" OFF)
option(USE_FFMPEG "Use ffmpeg" OFF)
option(USE_GFLAGS "Use GFLAGS" OFF)
option(USE_GLOG "Use GLOG" OFF)
option(USE_LEVELDB "Use LEVELDB" OFF)
@@ -264,7 +263,6 @@ cmake_dependent_option(
option(USE_NUMPY "Use NumPy" ON)
option(USE_OBSERVERS "Use observers module." OFF)
option(USE_OPENCL "Use OpenCL" OFF)
option(USE_OPENCV "Use OpenCV" OFF)
option(USE_OPENMP "Use OpenMP for parallel code" ON)
option(USE_PRECOMPILED_HEADERS "Use pre-compiled headers to accelerate build." OFF)


@@ -125,8 +125,6 @@ if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
add_subdirectory(db)
add_subdirectory(distributed)
add_subdirectory(ideep)
add_subdirectory(image)
add_subdirectory(video)
add_subdirectory(mobile)
add_subdirectory(mpi)
add_subdirectory(observers)


@@ -1,57 +0,0 @@
if(USE_OPENCV AND OpenCV_FOUND)
message(STATUS "Including image processing operators")
# ---[ GPU files
# ------[ general GPU
file(GLOB tmp *_gpu.cc)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
# ------[ CUDA sources
file(GLOB tmp *.cu)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
# exclude test files
file(GLOB tmp *_test.cc)
exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp})
# ---[ HIP files
# ------[ general HIP
file(GLOB tmp hip/*.cc)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
# ------[ HIP sources
file(GLOB tmp hip/*.hip)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
# exclude test files
file(GLOB tmp hip/*_test.cc)
exclude(Caffe2_HIP_SRCS "${Caffe2_HIP_SRCS}" ${tmp})
# ---[ CPU files.
file(GLOB tmp *.cc)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp})
# exclude test files and gpu files
file(GLOB tmp *_test.cc)
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS})
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_HIP_SRCS})
# ---[ GPU test files
file(GLOB tmp *_gpu_test.cc)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} ${tmp})
# ---[ HIP test files
file(GLOB tmp hip/*_test.cc)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} ${tmp})
# ---[ CPU test files
file(GLOB tmp *_test.cc)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp})
exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_GPU_TEST_SRCS})
exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_HIP_TEST_SRCS})
# ---[ Send the lists to the parent scope.
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)
else()
message(STATUS "Excluding image processing operators due to no opencv")
endif()


@@ -1,167 +0,0 @@
#include "caffe2/image/image_input_op.h"
#ifdef USE_MKLDNN
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
#include <caffe2/ideep/utils/ideep_operator.h>
#endif
namespace caffe2 {
template <>
bool ImageInputOp<CPUContext>::ApplyTransformOnGPU(
const std::vector<std::int64_t>&,
const c10::Device&) {
return false;
}
REGISTER_CPU_OPERATOR(ImageInput, ImageInputOp<CPUContext>);
OPERATOR_SCHEMA(ImageInput)
.NumInputs(0, 1)
.NumOutputs(2, INT_MAX)
.TensorInferenceFunction([](const OperatorDef& def,
const vector<TensorShape>& /* unused */) {
vector<TensorShape> out(2);
ArgumentHelper helper(def);
int batch_size = helper.GetSingleArgument<int>("batch_size", 0);
int crop = helper.GetSingleArgument<int>("crop", -1);
int color = helper.GetSingleArgument<int>("color", 1);
TORCH_CHECK_GT(crop, 0);
out[0] = CreateTensorShape(
vector<int>{batch_size, crop, crop, color ? 3 : 1},
TensorProto::FLOAT);
out[1] =
CreateTensorShape(vector<int>{1, batch_size}, TensorProto::INT32);
return out;
})
.SetDoc(R"DOC(
Imports and processes images from a database. For each run of the operator,
batch_size images will be processed. GPUs can optionally be used for
part of the processing.
The following transformations are applied to the image
- A bounding box is applied to the initial image (optional)
- The image is rescaled either up or down (with the scale argument) or
just up (with the minsize argument)
- The image is randomly cropped (crop size is passed as an argument but
the location of the crop is random except if is_test is passed in which case
the image in cropped at the center)
- The image is normalized. Each of its color channels can have separate
normalization values
The dimension of the output image will always be cropxcrop
)DOC")
.Arg(
"batch_size",
"Number of images to output for each run of the operator"
". Must be 1 or greater")
.Arg("color", "Number of color channels (1 or 3). Defaults to 1")
.Arg("color_jitter", "Whether or not to do color jitter. Defaults to 0")
.Arg(
"img_saturation",
"Image saturation scale used in color jittering. "
"Defaults to 0.4")
.Arg(
"img_brightness",
"Image brightness scale used in color jittering. "
"Defaults to 0.4")
.Arg(
"img_contrast",
"Image contrast scale used in color jittering. "
"Defaults to 0.4")
.Arg(
"color_lighting",
"Whether or not to do color lighting."
" Defaults to 0")
.Arg(
"color_lighting_std",
"Std of normal distribution where color lighting"
" scaling factor is sampled. Defaults to 0.1")
.Arg(
"scale_jitter_type",
"Type 0: No scale jittering "
"Type 1: Inception-style scale jittering")
.Arg(
"label_type",
"Type 0: single integer label for multi-class "
"classification. Type 1: sparse active label indices for multi-label "
"classification. Type 2: dense label embedding vector for label "
"embedding regression")
.Arg(
"scale",
"Scale the size of the smallest dimension of the image to"
" this. Scale and minsize are mutually exclusive."
" Must be larger than crop")
.Arg(
"minsize",
"Scale the size of the smallest dimension of the image to"
" this only if the size is initially smaller. Scale and minsize are"
" mutually exclusive. Must be larger than crop.")
.Arg(
"warp",
"If 1, both dimensions of the image will be set to minsize or"
" scale; otherwise, the other dimension is proportionally scaled."
" Defaults to 0")
.Arg("crop", "Size to crop the image to. Must be provided")
.Arg("mirror", "Whether or not to mirror the image. Defaults to 0")
.Arg(
"mean",
"Mean by which to normalize color channels."
" Defaults to 0.")
.Arg(
"mean_per_channel",
"Vector of means per color channel "
" (1 or 3 elements). Defaults to mean argument. Channel order BGR")
.Arg(
"std",
"Standard deviation by which to normalize color channels."
" Defaults to 1.")
.Arg(
"std_per_channel",
"Vector of standard dev. per color channel "
" (1 or 3 elements). Defaults to std argument. Channel order is BGR")
.Arg("bounding_ymin", "Bounding box coordinate. Defaults to -1 (none)")
.Arg("bounding_xmin", "Bounding box coordinate. Defaults to -1 (none)")
.Arg("bounding_height", "Bounding box coordinate. Defaults to -1 (none)")
.Arg("bounding_width", "Bounding box coordinate. Defaults to -1 (none)")
.ArgIsTest("Set to 1 to do deterministic cropping. Defaults to 0")
.Arg("use_caffe_datum", "1 if the input is in Caffe format. Defaults to 0")
.Arg(
"use_gpu_transform",
"1 if GPU acceleration should be used."
" Defaults to 0. Can only be 1 in a CUDAContext")
.Arg(
"decode_threads",
"Number of CPU decode/transform threads."
" Defaults to 4")
.Arg("output_type", "If gpu_transform, can set to FLOAT or FLOAT16.")
.Arg("db", "Name of the database (if not passed as input)")
.Arg(
"db_type",
"Type of database (if not passed as input)."
" Defaults to leveldb")
.Arg(
"output_sizes",
"The sizes of any outputs besides the data and label "
"(should have a number of elements equal to the number of additional "
"outputs)")
.Arg(
"random_scale",
"[min, max] shortest-side desired for image resize. "
"Defaults to [-1, -1] or no random resize desired.")
.Input(0, "reader", "The input reader (a db::DBReader)")
.Output(0, "data", "Tensor containing the images")
.Output(1, "label", "Tensor containing the labels")
.Output(
2,
"additional outputs",
"Any outputs after the first 2 will be "
"Tensors read from the input TensorProtos");
NO_GRADIENT(ImageInput);
#ifdef USE_MKLDNN
REGISTER_IDEEP_OPERATOR(ImageInput, IDEEPFallbackOp<ImageInputOp<CPUContext>>);
#endif
} // namespace caffe2

File diff suppressed because it is too large.


@@ -1,38 +0,0 @@
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/image/image_input_op.h"
namespace caffe2 {
template <>
bool ImageInputOp<CUDAContext>::ApplyTransformOnGPU(
const std::vector<std::int64_t>& dims,
const c10::Device& type) {
// GPU transform kernel allows explicitly setting output type
if (output_type_ == TensorProto_DataType_FLOAT) {
auto* image_output =
OperatorBase::OutputTensor(0, dims, at::dtype<float>().device(type));
TransformOnGPU<uint8_t, float, CUDAContext>(
prefetched_image_on_device_,
image_output,
mean_gpu_,
std_gpu_,
&context_);
} else if (output_type_ == TensorProto_DataType_FLOAT16) {
auto* image_output =
OperatorBase::OutputTensor(0, dims, at::dtype<at::Half>().device(type));
TransformOnGPU<uint8_t, at::Half, CUDAContext>(
prefetched_image_on_device_,
image_output,
mean_gpu_,
std_gpu_,
&context_);
} else {
return false;
}
return true;
}
REGISTER_CUDA_OPERATOR(ImageInput, ImageInputOp<CUDAContext>);
} // namespace caffe2


@@ -1,85 +0,0 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/image/transform_gpu.h"
#include "caffe2/utils/conversions.h"
/**
*
* Copyright (c) 2016, NVIDIA CORPORATION, All rights reserved
* Distributed under 2-clause BSD license; see accompanying LICENSE file
*
**/
namespace caffe2 {
namespace {
// input in (int8, NHWC), output in (fp32, NCHW)
template <typename In, typename Out>
__global__ void transform_kernel(
const int C,
const int H,
const int W,
const float* mean,
const float* std,
const In* in,
Out* out) {
const auto n = blockIdx.x;
const auto nStride = C*H*W;
// pointers to data for this image
const In *const input_ptr = &in[n*nStride];
Out *const output_ptr = &out[n*nStride];
// either read or write uncoalesced - try reading
for (int c=0; c < C; ++c) {
for (int h=threadIdx.y; h < H; h += blockDim.y) {
for (int w=threadIdx.x; w < W; w += blockDim.x) {
const int in_idx = c + C*w + C*W*h; // HWC
const int out_idx = c*H*W + h*W + w; // CHW
output_ptr[out_idx] = convert::To<float,Out>(
(convert::To<In,float>(input_ptr[in_idx])-mean[c]) * std[c]);
}
}
}
}
}
template <typename T_IN, typename T_OUT, class Context>
bool TransformOnGPU(
Tensor& X,
Tensor* Y,
Tensor& mean,
Tensor& std,
Context* context) {
const int N = X.dim32(0), C = X.dim32(3), H = X.dim32(1), W = X.dim32(2);
auto* input_data = X.template data<T_IN>();
auto* output_data = Y->template mutable_data<T_OUT>();
transform_kernel<
T_IN, T_OUT><<<N, dim3(16, 16), 0, context->cuda_stream()>>>(
C, H, W, mean.template data<float>(), std.template data<float>(),
input_data, output_data);
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
};
template bool TransformOnGPU<uint8_t, float, CUDAContext>(
Tensor& X,
Tensor* Y,
Tensor& mean,
Tensor& std,
CUDAContext* context);
template bool TransformOnGPU<uint8_t, at::Half, CUDAContext>(
Tensor& X,
Tensor* Y,
Tensor& mean,
Tensor& std,
CUDAContext* context);
} // namespace caffe2


@@ -1,43 +0,0 @@
#ifndef CAFFE2_IMAGE_TRANSFORM_GPU_H_
#define CAFFE2_IMAGE_TRANSFORM_GPU_H_
/**
*
* Copyright (c) 2016, NVIDIA CORPORATION, All rights reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**/
#include "caffe2/core/context.h"
namespace caffe2 {
template <typename T_IN, typename T_OUT, class Context>
bool TransformOnGPU(
Tensor& X,
Tensor* Y,
Tensor& mean,
Tensor& std,
Context* context);
} // namespace caffe2
#endif


@@ -1,59 +0,0 @@
if(USE_OPENCV AND OpenCV_FOUND AND USE_FFMPEG AND FFMPEG_FOUND)
message(STATUS "Including video processing operators")
# ---[ GPU files
# ------[ general GPU
file(GLOB tmp *_gpu.cc)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
# ------[ CUDA sources
file(GLOB tmp *.cu)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
# exclude test files
file(GLOB tmp *_test.cc)
exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp})
# ---[ HIP files
# ------[ general HIP
file(GLOB tmp hip/*.cc)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
# ------[ HIP sources
file(GLOB tmp hip/*.hip)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
# exclude test files
file(GLOB tmp hip/*_test.cc)
exclude(Caffe2_HIP_SRCS "${Caffe2_HIP_SRCS}" ${tmp})
# ---[ CPU files.
file(GLOB tmp *.cc)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp})
# exclude test files and gpu files
file(GLOB tmp *_test.cc)
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS})
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_HIP_SRCS})
# ---[ GPU test files
file(GLOB tmp *_gpu_test.cc)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} ${tmp})
# ---[ HIP test files
file(GLOB tmp hip/*_test.cc)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} ${tmp})
# ---[ CPU test files
file(GLOB tmp *_test.cc)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp})
exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}"
${Caffe2_GPU_TEST_SRCS})
exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}"
${Caffe2_GPU_TEST_SRCS})
# ---[ Send the lists to the parent scope.
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)
else()
message(STATUS "Excluding video processing operators due to no opencv")
endif()


@@ -1,85 +0,0 @@
#include <caffe2/video/optical_flow.h>
namespace caffe2 {
void OpticalFlowExtractor(
const cv::Mat& prev_gray,
const cv::Mat& curr_gray,
const int flow_alg_type,
cv::Mat& flow) {
#if CV_MAJOR_VERSION >= 4
cv::Ptr<cv::DISOpticalFlow> tvl1 = cv::DISOpticalFlow::create();
#else
cv::Ptr<cv::DualTVL1OpticalFlow> tvl1 = cv::DualTVL1OpticalFlow::create();
#endif
switch (flow_alg_type) {
case FLowAlgType::FarnebackOpticalFlow:
cv::calcOpticalFlowFarneback(
prev_gray,
curr_gray,
flow,
std::sqrt(2) / 2.0,
5,
10,
2,
7,
1.5,
cv::OPTFLOW_FARNEBACK_GAUSSIAN);
break;
case FLowAlgType::DensePyrLKOpticalFlow:
LOG(ERROR) << "DensePyrLKOpticalFlow only has sparse version on CPU";
break;
case FLowAlgType::BroxOpticalFlow:
LOG(ERROR) << "BroxOpticalFlow on CPU is not available";
break;
case FLowAlgType::OpticalFlowDual_TVL1:
tvl1->calc(prev_gray, curr_gray, flow);
break;
default:
LOG(ERROR) << "Unsupported optical flow type " << flow_alg_type;
break;
}
}
void MergeOpticalFlow(cv::Mat& prev_flow, const cv::Mat& curr_flow) {
const int rows = prev_flow.rows;
const int cols = prev_flow.cols;
// merge two optical flows into one
for (int y = 0; y < rows; y++) {
for (int x = 0; x < cols; x++) {
cv::Point2f u = prev_flow.at<cv::Point2f>(y, x);
// get the new location
int x_new = std::min(cols - 1, std::max(0, cvRound(u.x + x)));
int y_new = std::min(rows - 1, std::max(0, cvRound(u.y + y)));
cv::Point2f u_new = curr_flow.at<cv::Point2f>(y_new, x_new);
// update the flow
prev_flow.at<cv::Point2f>(y, x) += u_new;
}
}
}
void MultiFrameOpticalFlowExtractor(
const std::vector<cv::Mat>& grays,
const int optical_flow_alg_type,
cv::Mat& flow) {
int num_frames = grays.size();
CAFFE_ENFORCE_GE(num_frames, 2, "need at least 2 frames!");
// compute optical flow for every two frames
std::vector<cv::Mat> flows;
for (int i = 0; i < num_frames - 1; i++) {
cv::Mat tmp;
OpticalFlowExtractor(grays[i], grays[i + 1], optical_flow_alg_type, tmp);
flows.push_back(tmp);
}
flows[0].copyTo(flow);
// aggregate optical flow across multiple frame
for (int i = 1; i < num_frames - 1; i++) {
MergeOpticalFlow(flow, flows[i]);
}
}
} // namespace caffe2


@@ -1,50 +0,0 @@
#ifndef CAFFE2_VIDEO_OPTICAL_FLOW_H_
#define CAFFE2_VIDEO_OPTICAL_FLOW_H_
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/opencv.hpp>
#include <opencv2/video.hpp>
#include <caffe2/core/logging.h>
namespace caffe2 {
// Four different types of optical flow algorithms supported;
// BroxOpticalFlow doesn't have a CPU version;
// DensePyrLKOpticalFlow only has sparse CPU version;
enum FLowAlgType {
FarnebackOpticalFlow = 0,
DensePyrLKOpticalFlow = 1,
BroxOpticalFlow = 2,
OpticalFlowDual_TVL1 = 3,
};
// Define different types of optical flow data type
// 0: original two channel optical flow
// 1: three channel optical flow with magnitude as the third channel
// 2: two channel optical flow + one channel gray
// 3: two channel optical flow + three channel rgb
enum FlowDataType {
Flow2C = 0,
Flow3C = 1,
FlowWithGray = 2,
FlowWithRGB = 3,
};
void OpticalFlowExtractor(
const cv::Mat& prev_gray,
const cv::Mat& curr_gray,
const int optical_flow_alg_type,
cv::Mat& flow);
void MergeOpticalFlow(cv::Mat& prev_flow, const cv::Mat& curr_flow);
void MultiFrameOpticalFlowExtractor(
const std::vector<cv::Mat>& grays,
const int optical_flow_alg_type,
cv::Mat& flow);
} // namespace caffe2
#endif // CAFFE2_VIDEO_OPTICAL_FLOW_H_


@@ -1,800 +0,0 @@
#include <assert.h>
#include <caffe2/core/logging.h>
#include <caffe2/video/video_decoder.h>
#include <array>
#include <mutex>
#include <random>
namespace caffe2 {
VideoDecoder::VideoDecoder() {
static bool gInitialized = false;
static std::mutex gMutex;
std::unique_lock<std::mutex> lock(gMutex);
if (!gInitialized) {
av_register_all();
avcodec_register_all();
avformat_network_init();
gInitialized = true;
}
}
void VideoDecoder::getAudioSample(
AVPacket& packet,
AVCodecContext* audioCodecContext_,
AVFrame* audioStreamFrame_,
SwrContext* convertCtx_,
Callback& callback,
const Params& params) {
int frame_finished = 0;
auto result = avcodec_decode_audio4(
audioCodecContext_, audioStreamFrame_, &frame_finished, &packet);
if (frame_finished) {
// from
// https://www.ffmpeg.org/doxygen/2.3/decoding_encoding_8c-example.html#a57
auto c = audioCodecContext_;
int data_size = av_samples_get_buffer_size(
nullptr, c->channels, audioStreamFrame_->nb_samples, c->sample_fmt, 1);
if (data_size < 0) {
// This should not occur, checking just for paranoia
LOG(ERROR) << "Failed to calculate data size";
}
// from https://www.ffmpeg.org/doxygen/2.1/group__lswr.html#details
uint8_t* output;
auto swr = convertCtx_;
auto inrate = audioCodecContext_->sample_rate;
auto in_samples = audioStreamFrame_->nb_samples;
int out_samples = av_rescale_rnd(
swr_get_delay(swr, inrate) + in_samples,
params.outrate_,
inrate,
AV_ROUND_UP);
if (out_samples > 0) {
auto input = (const uint8_t**)&audioStreamFrame_->data[0];
av_samples_alloc(
&output,
nullptr,
c->channels,
out_samples,
(AVSampleFormat)params.outfmt_,
0);
// resample the audio data
out_samples = swr_convert(swr, &output, out_samples, input, in_samples);
auto sample_size = out_samples * c->channels * sizeof(float);
auto buffer = std::make_unique<float[]>(sample_size);
memcpy(buffer.get(), output, sample_size);
av_freep(&output);
unique_ptr<DecodedAudio> audio_sample = make_unique<DecodedAudio>();
audio_sample->dataSize_ = data_size;
audio_sample->outSampleSize_ = out_samples * c->channels;
audio_sample->audio_data_ = std::move(buffer);
callback.audioDecoded(std::move(audio_sample));
}
} else {
result = packet.size;
}
packet.size -= result;
packet.data += result;
}
void VideoDecoder::ResizeAndKeepAspectRatio(
const int origWidth,
const int origHeight,
const int short_edge,
const int long_edge,
int& outWidth,
int& outHeight) {
if (origWidth < origHeight) {
// dominant height
if (short_edge > 0) {
// use short_edge for rescale
float ratio = short_edge / float(origWidth);
outWidth = short_edge;
outHeight = (int)round(ratio * origHeight);
} else {
// use long_edge for rescale
float ratio = long_edge / float(origHeight);
outHeight = long_edge;
outWidth = (int)round(ratio * origWidth);
}
} else {
// dominant width
if (short_edge > 0) {
// use short_edge for rescale
float ratio = short_edge / float(origHeight);
outHeight = short_edge;
outWidth = (int)round(ratio * origWidth);
} else {
// use long_edge for rescale
float ratio = long_edge / float(origWidth);
outWidth = long_edge;
outHeight = (int)round(ratio * origHeight);
}
}
}
void VideoDecoder::decodeLoop(
const string& videoName,
VideoIOContext& ioctx,
const Params& params,
const int start_frm,
Callback& callback) {
AVPixelFormat pixFormat = params.pixelFormat_;
AVFormatContext* inputContext = avformat_alloc_context();
AVStream* videoStream_ = nullptr;
AVCodecContext* videoCodecContext_ = nullptr;
AVCodecContext* audioCodecContext_ = nullptr;
AVFrame* videoStreamFrame_ = nullptr;
AVFrame* audioStreamFrame_ = nullptr;
SwrContext* convertCtx_ = nullptr;
AVPacket packet;
av_init_packet(&packet); // init packet
SwsContext* scaleContext_ = nullptr;
try {
inputContext->pb = ioctx.get_avio();
inputContext->flags |= AVFMT_FLAG_CUSTOM_IO;
int ret = 0;
// Determining the input format:
int probeSz = 1 * 1024 + AVPROBE_PADDING_SIZE;
DecodedFrame::AvDataPtr probe((uint8_t*)av_malloc(probeSz));
memset(probe.get(), 0, probeSz);
int len = ioctx.read(probe.get(), probeSz - AVPROBE_PADDING_SIZE);
if (len < probeSz - AVPROBE_PADDING_SIZE) {
LOG(ERROR) << "Insufficient data to determine video format";
return;
}
// seek back to start of stream
ioctx.seek(0, SEEK_SET);
unique_ptr<AVProbeData> probeData(new AVProbeData());
probeData->buf = probe.get();
probeData->buf_size = len;
probeData->filename = "";
// Determine the input-format:
inputContext->iformat = av_probe_input_format(probeData.get(), 1);
// this is to avoid the double-free error
if (inputContext->iformat == nullptr) {
LOG(ERROR) << "inputContext iformat is nullptr!";
return;
}
ret = avformat_open_input(&inputContext, "", nullptr, nullptr);
if (ret < 0) {
LOG(ERROR) << "Unable to open stream : " << ffmpegErrorStr(ret);
return;
}
ret = avformat_find_stream_info(inputContext, nullptr);
if (ret < 0) {
LOG(ERROR) << "Unable to find stream info in " << videoName << " "
<< ffmpegErrorStr(ret);
return;
}
// Decode the first video stream
int videoStreamIndex_ = params.streamIndex_;
int audioStreamIndex_ = params.streamIndex_;
if (params.streamIndex_ == -1) {
for (int i = 0; i < inputContext->nb_streams; i++) {
auto stream = inputContext->streams[i];
if (stream->codec->codec_type == AVMEDIA_TYPE_VIDEO &&
videoStreamIndex_ == -1) {
videoStreamIndex_ = i;
videoStream_ = stream;
} else if (
stream->codec->codec_type == AVMEDIA_TYPE_AUDIO &&
audioStreamIndex_ == -1) {
audioStreamIndex_ = i;
}
if (videoStreamIndex_ != -1 && audioStreamIndex_ != -1) {
break;
}
}
}
if (videoStream_ == nullptr) {
LOG(ERROR) << "Unable to find video stream in " << videoName << " "
<< ffmpegErrorStr(ret);
return;
}
// Initialize codec
AVDictionary* opts = nullptr;
videoCodecContext_ = videoStream_->codec;
try {
ret = avcodec_open2(
videoCodecContext_,
avcodec_find_decoder(videoCodecContext_->codec_id),
&opts);
} catch (const std::exception&) {
LOG(ERROR) << "Exception during open video codec";
return;
}
if (ret < 0) {
LOG(ERROR) << "Cannot open video codec : "
<< videoCodecContext_->codec->name;
return;
}
if (params.getAudio_ && audioStreamIndex_ >= 0) {
// see e.g. ridge/decoder/StreamDecoder.cpp
audioCodecContext_ = inputContext->streams[audioStreamIndex_]->codec;
ret = avcodec_open2(
audioCodecContext_,
avcodec_find_decoder(audioCodecContext_->codec_id),
nullptr);
if (ret < 0) {
LOG(ERROR) << "Cannot open audio codec : "
<< audioCodecContext_->codec->name;
return;
}
convertCtx_ = swr_alloc_set_opts(
nullptr,
params.outlayout_,
(AVSampleFormat)params.outfmt_,
params.outrate_,
audioCodecContext_->channel_layout,
audioCodecContext_->sample_fmt,
audioCodecContext_->sample_rate,
0,
nullptr);
if (convertCtx_ == nullptr) {
LOG(ERROR) << "Cannot setup sample format converter.";
return;
}
if (swr_init(convertCtx_) < 0) {
LOG(ERROR) << "Cannot init sample format converter.";
return;
}
}
// Calculate if we need to rescale the frames
const int origWidth = videoCodecContext_->width;
const int origHeight = videoCodecContext_->height;
int outWidth = origWidth;
int outHeight = origHeight;
if (params.video_res_type_ == VideoResType::ORIGINAL_RES) {
// if the original resolution is too low,
// make it at least the same size as crop_size_
if (params.crop_size_ > origWidth || params.crop_size_ > origHeight) {
ResizeAndKeepAspectRatio(
origWidth, origHeight, params.crop_size_, -1, outWidth, outHeight);
}
} else if (params.video_res_type_ == VideoResType::USE_SHORT_EDGE) {
// resize the image to the predefined
// short_edge_ resolution while keep the aspect ratio
ResizeAndKeepAspectRatio(
origWidth, origHeight, params.short_edge_, -1, outWidth, outHeight);
} else if (params.video_res_type_ == VideoResType::USE_WIDTH_HEIGHT) {
// resize the image to the predefined
// resolution and ignore the aspect ratio
outWidth = params.outputWidth_;
outHeight = params.outputHeight_;
} else {
LOG(ERROR) << "Unknown VideoResType: " << params.video_res_type_;
return;
}
// Make sure that we have a valid format
if (videoCodecContext_->pix_fmt == AV_PIX_FMT_NONE) {
LOG(ERROR) << "pixel format is not valid.";
return;
}
// Create a scale context
scaleContext_ = sws_getContext(
videoCodecContext_->width,
videoCodecContext_->height,
videoCodecContext_->pix_fmt,
outWidth,
outHeight,
pixFormat,
SWS_FAST_BILINEAR,
nullptr,
nullptr,
nullptr);
// Getting video meta data
VideoMeta videoMeta;
videoMeta.codec_type = videoCodecContext_->codec_type;
videoMeta.width = outWidth;
videoMeta.height = outHeight;
videoMeta.pixFormat = pixFormat;
// avoid division by zero, code adapted from
// https://www.ffmpeg.org/doxygen/0.6/rational_8h-source.html
if (videoStream_->avg_frame_rate.num == 0 ||
videoStream_->avg_frame_rate.den == 0) {
LOG(ERROR) << "Frame rate is wrong. No data found.";
return;
}
videoMeta.fps = av_q2d(videoStream_->avg_frame_rate);
callback.videoDecodingStarted(videoMeta);
if (params.intervals_.size() == 0) {
LOG(ERROR) << "Empty sampling intervals.";
return;
}
std::vector<SampleInterval>::const_iterator itvlIter =
params.intervals_.begin();
if (itvlIter->timestamp != 0) {
LOG(ERROR) << "Sampling interval starting timestamp is not zero.";
return;
}
double currFps = itvlIter->fps;
if (currFps < 0 && currFps != SpecialFps::SAMPLE_ALL_FRAMES &&
currFps != SpecialFps::SAMPLE_TIMESTAMP_ONLY) {
// fps must be 0, -1, -2 or > 0
LOG(ERROR) << "Invalid sampling fps.";
return;
}
double prevTimestamp = itvlIter->timestamp;
itvlIter++;
if (itvlIter != params.intervals_.end() &&
prevTimestamp >= itvlIter->timestamp) {
LOG(ERROR) << "Sampling interval timestamps must be strictly ascending.";
return;
}
double lastFrameTimestamp = -1.0;
double timestamp = -1.0;
// Initialize frame and packet.
// These will be reused across calls.
videoStreamFrame_ = av_frame_alloc();
audioStreamFrame_ = av_frame_alloc();
// frame index in video stream
int frameIndex = -1;
// frame index of outputed frames
int outputFrameIndex = -1;
/* identify the starting point from where we must start decoding */
std::mt19937 meta_randgen(time(nullptr));
long int start_ts = -1;
bool mustDecodeAll = false;
if (videoStream_->duration > 0 && videoStream_->nb_frames > 0) {
/* we have a valid duration and nb_frames. We can safely
* detect an intermediate timestamp to start decoding from. */
// leave a margin of 10 frames to take in to account the error
// from av_seek_frame
long int margin =
int(ceil((10 * videoStream_->duration) / (videoStream_->nb_frames)));
// if we need to do temporal jittering
if (params.decode_type_ == DecodeType::DO_TMP_JITTER) {
/* estimate the average duration for the required # of frames */
double maxFramesDuration =
(videoStream_->duration * params.num_of_required_frame_) /
(videoStream_->nb_frames);
int ts1 = 0;
int ts2 = videoStream_->duration - int(ceil(maxFramesDuration));
ts2 = ts2 > 0 ? ts2 : 0;
// pick a random timestamp between ts1 and ts2. ts2 is selected such
// that you have enough frames to satisfy the required # of frames.
start_ts = std::uniform_int_distribution<>(ts1, ts2)(meta_randgen);
// seek a frame at start_ts
ret = av_seek_frame(
inputContext,
videoStreamIndex_,
0 > (start_ts - margin) ? 0 : (start_ts - margin),
AVSEEK_FLAG_BACKWARD);
// if we need to decode from the start_frm
} else if (params.decode_type_ == DecodeType::USE_START_FRM) {
if (videoStream_ == nullptr) {
LOG(ERROR) << "Nullptr found at videoStream_";
return;
}
start_ts = int(floor(
(videoStream_->duration * start_frm) / (videoStream_->nb_frames)));
// seek a frame at start_ts
ret = av_seek_frame(
inputContext,
videoStreamIndex_,
0 > (start_ts - margin) ? 0 : (start_ts - margin),
AVSEEK_FLAG_BACKWARD);
} else {
mustDecodeAll = true;
}
if (ret < 0) {
LOG(INFO) << "Unable to decode from a random start point";
/* fall back to default decoding of all frames from start */
av_seek_frame(inputContext, videoStreamIndex_, 0, AVSEEK_FLAG_BACKWARD);
mustDecodeAll = true;
}
} else {
mustDecodeAll = true;
}
int gotPicture = 0;
int eof = 0;
int selectiveDecodedFrames = 0;
int maxFrames = (params.decode_type_ == DecodeType::DO_UNIFORM_SMP)
? MAX_DECODING_FRAMES
: params.num_of_required_frame_;
// There is a delay between reading packets from the
// transport and getting decoded frames back.
// Therefore, after EOF, continue going while
// the decoder is still giving us frames.
while ((!eof || gotPicture) &&
/* either you must decode all frames or decode up to maxFrames
* based on status of the mustDecodeAll flag */
(mustDecodeAll || (selectiveDecodedFrames < maxFrames)) &&
/* If on the last interval and not autodecoding keyframes and a
* SpecialFps indicates no more frames are needed, stop decoding */
!((itvlIter == params.intervals_.end() &&
(currFps == SpecialFps::SAMPLE_TIMESTAMP_ONLY ||
currFps == SpecialFps::SAMPLE_NO_FRAME)) &&
!params.keyFrames_)) {
try {
if (!eof) {
ret = av_read_frame(inputContext, &packet);
if (ret == AVERROR_EOF) {
eof = 1;
av_free_packet(&packet);
packet.data = nullptr;
packet.size = 0;
// stay in the while loop to flush frames
} else if (ret == AVERROR(EAGAIN)) {
av_free_packet(&packet);
continue;
} else if (ret < 0) {
LOG(ERROR) << "Error reading packet : " << ffmpegErrorStr(ret);
return;
}
auto si = packet.stream_index;
if (params.getAudio_ && audioStreamIndex_ >= 0 &&
si == audioStreamIndex_) {
// Audio packets can have multiple audio frames in a single packet
while (packet.size > 0) {
assert(audioCodecContext_ != nullptr);
assert(convertCtx_ != nullptr);
getAudioSample(
packet,
audioCodecContext_,
audioStreamFrame_,
convertCtx_,
callback,
params);
}
}
if (si != videoStreamIndex_) {
av_free_packet(&packet);
continue;
}
}
ret = avcodec_decode_video2(
videoCodecContext_, videoStreamFrame_, &gotPicture, &packet);
if (ret < 0) {
LOG(ERROR) << "Error decoding video frame : " << ffmpegErrorStr(ret);
return;
}
try {
// Nothing to do without a picture
if (!gotPicture) {
av_free_packet(&packet);
continue;
}
frameIndex++;
long int frame_ts =
av_frame_get_best_effort_timestamp(videoStreamFrame_);
timestamp = frame_ts * av_q2d(videoStream_->time_base);
if ((frame_ts >= start_ts && !mustDecodeAll) || mustDecodeAll) {
/* process current frame if:
* 1) We are not doing selective decoding and mustDecodeAll
* OR
* 2) We are doing selective decoding and current frame
* timestamp is >= start_ts from where we start selective
* decoding*/
// if reaching the next interval, update the current fps
// and reset lastFrameTimestamp so the current frame could be
// sampled (unless fps == SpecialFps::SAMPLE_NO_FRAME)
if (itvlIter != params.intervals_.end() &&
timestamp >= itvlIter->timestamp) {
lastFrameTimestamp = -1.0;
currFps = itvlIter->fps;
prevTimestamp = itvlIter->timestamp;
itvlIter++;
if (itvlIter != params.intervals_.end() &&
prevTimestamp >= itvlIter->timestamp) {
LOG(ERROR)
<< "Sampling interval timestamps must be strictly ascending.";
return;
}
}
// keyFrame will bypass all checks on fps sampling settings
bool keyFrame = params.keyFrames_ && videoStreamFrame_->key_frame;
if (!keyFrame) {
// if fps == SpecialFps::SAMPLE_NO_FRAME (0), don't sample at all
if (currFps == SpecialFps::SAMPLE_NO_FRAME) {
av_free_packet(&packet);
continue;
}
// fps is considered reached in the following cases:
// 1. lastFrameTimestamp < 0 - start of a new interval
// (or first frame)
// 2. currFps == SpecialFps::SAMPLE_ALL_FRAMES (-1) - sample every
// frame
// 3. timestamp - lastFrameTimestamp has reached target fps and
// currFps > 0 (not special fps setting)
// different modes for fps:
// SpecialFps::SAMPLE_NO_FRAMES (0):
// disable fps sampling, no frame sampled at all
// SpecialFps::SAMPLE_ALL_FRAMES (-1):
// unlimited fps sampling, will sample at native video fps
// SpecialFps::SAMPLE_TIMESTAMP_ONLY (-2):
// disable fps sampling, but will get the frame at specific
// timestamp
// others (> 0): decoding at the specified fps
bool fpsReached = lastFrameTimestamp < 0 ||
currFps == SpecialFps::SAMPLE_ALL_FRAMES ||
(currFps > 0 &&
timestamp >= lastFrameTimestamp + (1 / currFps));
if (!fpsReached) {
av_free_packet(&packet);
continue;
}
}
lastFrameTimestamp = timestamp;
outputFrameIndex++;
if (params.maximumOutputFrames_ != -1 &&
outputFrameIndex >= params.maximumOutputFrames_) {
// enough frames
av_free_packet(&packet);
break;
}
AVFrame* rgbFrame = av_frame_alloc();
if (!rgbFrame) {
LOG(ERROR) << "Error allocating AVframe";
return;
}
try {
// Determine required buffer size and allocate buffer
int numBytes = avpicture_get_size(pixFormat, outWidth, outHeight);
DecodedFrame::AvDataPtr buffer(
(uint8_t*)av_malloc(numBytes * sizeof(uint8_t)));
int size = avpicture_fill(
(AVPicture*)rgbFrame,
buffer.get(),
pixFormat,
outWidth,
outHeight);
sws_scale(
scaleContext_,
videoStreamFrame_->data,
videoStreamFrame_->linesize,
0,
videoCodecContext_->height,
rgbFrame->data,
rgbFrame->linesize);
unique_ptr<DecodedFrame> frame = make_unique<DecodedFrame>();
frame->width_ = outWidth;
frame->height_ = outHeight;
frame->data_ = std::move(buffer);
frame->size_ = size;
frame->index_ = frameIndex;
frame->outputFrameIndex_ = outputFrameIndex;
frame->timestamp_ = timestamp;
frame->keyFrame_ = videoStreamFrame_->key_frame;
callback.frameDecoded(std::move(frame));
selectiveDecodedFrames++;
av_frame_free(&rgbFrame);
} catch (const std::exception&) {
av_frame_free(&rgbFrame);
}
}
av_frame_unref(videoStreamFrame_);
av_frame_unref(audioStreamFrame_);
} catch (const std::exception&) {
av_frame_unref(videoStreamFrame_);
av_frame_unref(audioStreamFrame_);
}
av_free_packet(&packet);
} catch (const std::exception&) {
av_free_packet(&packet);
}
} // of while loop
callback.videoDecodingEnded(timestamp);
// free all stuffs
sws_freeContext(scaleContext_);
swr_free(&convertCtx_);
av_packet_unref(&packet);
av_frame_free(&videoStreamFrame_);
av_frame_free(&audioStreamFrame_);
avcodec_close(videoCodecContext_);
if (audioCodecContext_ != nullptr) {
avcodec_close(audioCodecContext_);
}
avformat_close_input(&inputContext);
avformat_free_context(inputContext);
} catch (const std::exception&) {
// In case of decoding error
// free all stuffs
sws_freeContext(scaleContext_);
swr_free(&convertCtx_);
av_packet_unref(&packet);
av_frame_free(&videoStreamFrame_);
av_frame_free(&audioStreamFrame_);
avcodec_close(videoCodecContext_);
avcodec_close(audioCodecContext_);
avformat_close_input(&inputContext);
avformat_free_context(inputContext);
}
}
void VideoDecoder::decodeMemory(
const string& videoName,
const char* buffer,
const int size,
const Params& params,
const int start_frm,
Callback& callback) {
VideoIOContext ioctx(buffer, size);
decodeLoop(videoName, ioctx, params, start_frm, callback);
}
void VideoDecoder::decodeFile(
const string& file,
const Params& params,
const int start_frm,
Callback& callback) {
VideoIOContext ioctx(file);
decodeLoop(file, ioctx, params, start_frm, callback);
}
string VideoDecoder::ffmpegErrorStr(int result) {
std::array<char, 128> buf;
av_strerror(result, buf.data(), buf.size());
return string(buf.data());
}
void FreeDecodedData(
std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames,
std::vector<std::unique_ptr<DecodedAudio>>& sampledAudio) {
// free the sampledFrames and sampledAudio
for (int i = 0; i < sampledFrames.size(); i++) {
DecodedFrame* p = sampledFrames[i].release();
delete p;
}
for (int i = 0; i < sampledAudio.size(); i++) {
DecodedAudio* p = sampledAudio[i].release();
delete p;
}
sampledFrames.clear();
sampledAudio.clear();
}
bool DecodeMultipleClipsFromVideo(
const char* video_buffer,
const std::string& video_filename,
const int encoded_size,
const Params& params,
const int start_frm,
const int clip_per_video,
const std::vector<int>& clip_start_positions,
const bool use_local_file,
int& height,
int& width,
std::vector<unsigned char*>& buffer_rgb) {
std::vector<std::unique_ptr<DecodedFrame>> sampledFrames;
std::vector<std::unique_ptr<DecodedAudio>> sampledAudio;
VideoDecoder decoder;
CallbackImpl callback;
// decoding from buffer or file
if (!use_local_file) {
decoder.decodeMemory(
string("Memory Buffer"),
video_buffer,
encoded_size,
params,
start_frm,
callback);
} else {
decoder.decodeFile(video_filename, params, start_frm, callback);
}
for (auto& frame : callback.frames) {
sampledFrames.push_back(std::move(frame));
}
for (auto& audio_sample : callback.audio_samples) {
sampledAudio.push_back(std::move(audio_sample));
}
for (int i = 0; i < buffer_rgb.size(); i++) {
unsigned char* buff = buffer_rgb[i];
delete[] buff;
}
buffer_rgb.clear();
if (sampledFrames.size() < params.num_of_required_frame_) {
LOG(ERROR)
<< "The video seems faulty and we could not decode enough frames: "
<< sampledFrames.size() << " VS " << params.num_of_required_frame_;
FreeDecodedData(sampledFrames, sampledAudio);
return true;
}
if (sampledFrames.size() == 0) {
LOG(ERROR) << "The samples frames have size 0, no frame to process";
FreeDecodedData(sampledFrames, sampledAudio);
return true;
}
height = sampledFrames[0]->height_;
width = sampledFrames[0]->width_;
float sample_stepsz = (clip_per_video <= 1)
? 0
: (float(sampledFrames.size() - params.num_of_required_frame_) /
(clip_per_video - 1));
int image_size = 3 * height * width;
int clip_size = params.num_of_required_frame_ * image_size;
// get the RGB frames for each clip
if (clip_start_positions.size() > 0) {
for (int i = 0; i < clip_start_positions.size(); i++) {
unsigned char* buffer_rgb_ptr = new unsigned char[clip_size];
int clip_start = clip_start_positions[i];
for (int j = 0; j < params.num_of_required_frame_; j++) {
memcpy(
buffer_rgb_ptr + j * image_size,
(unsigned char*)sampledFrames[j + clip_start]->data_.get(),
image_size * sizeof(unsigned char));
}
buffer_rgb.push_back(buffer_rgb_ptr);
}
} else {
for (int i = 0; i < clip_per_video; i++) {
unsigned char* buffer_rgb_ptr = new unsigned char[clip_size];
int clip_start = floor(i * sample_stepsz);
for (int j = 0; j < params.num_of_required_frame_; j++) {
memcpy(
buffer_rgb_ptr + j * image_size,
(unsigned char*)sampledFrames[j + clip_start]->data_.get(),
image_size * sizeof(unsigned char));
}
buffer_rgb.push_back(buffer_rgb_ptr);
}
}
FreeDecodedData(sampledFrames, sampledAudio);
return true;
}
} // namespace caffe2


@@ -1,525 +0,0 @@
#ifndef CAFFE2_VIDEO_VIDEO_DECODER_H_
#define CAFFE2_VIDEO_VIDEO_DECODER_H_
#include <caffe2/core/logging.h>
#include <stdio.h>
#include <memory>
#include <string>
#include <vector>
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavformat/avio.h>
#include <libavutil/log.h>
#include <libavutil/motion_vector.h>
#include <libswresample/swresample.h>
#include <libswscale/swscale.h>
}
namespace caffe2 {
#define VIO_BUFFER_SZ 32768
#define MAX_DECODING_FRAMES 10000
// enum to specify 3 special fps sampling behaviors:
// 0: disable fps sampling, no frame sampled at all
// -1: unlimited fps sampling, will sample at native video fps
// -2: disable fps sampling, but will get the frame at specific timestamp
enum SpecialFps {
SAMPLE_NO_FRAME = 0,
SAMPLE_ALL_FRAMES = -1,
SAMPLE_TIMESTAMP_ONLY = -2,
};
// three different types of resolution when decoding the video
// 0: resize to width x height and ignore the aspect ratio;
// 1: resize to short_edge and keep the aspect ratio;
// 2: using the original resolution of the video; if resolution
// is smaller than crop_size x crop_size, resize to crop_size
// and keep the aspect ratio;
// 3: for xray video service
enum VideoResType {
USE_WIDTH_HEIGHT = 0,
USE_SHORT_EDGE = 1,
ORIGINAL_RES = 2,
};
// three different types of decoding behavior are supported
// 0: do temporal jittering to sample a random clip from the video
// 1: uniformly sample multiple clips from the video;
// 2: sample a clip from a given starting frame
// 3: for xray video service
enum DecodeType {
DO_TMP_JITTER = 0,
DO_UNIFORM_SMP = 1,
USE_START_FRM = 2,
};
// sampling interval for fps starting at specified timestamp
// use enum SpecialFps to set special fps decoding behavior
// note sampled fps will not always accurately follow the target fps,
// because sampled frame has to snap to actual frame timestamp,
// e.g. video fps = 25, sample fps = 4 will sample every 0.28s, not 0.25
// video fps = 25, sample fps = 5 will sample every 0.24s, not 0.2,
// because of floating-point division accuracy (1 / 5.0 is not exactly 0.2)
struct SampleInterval {
double timestamp;
double fps;
SampleInterval() : timestamp(-1), fps(SpecialFps::SAMPLE_ALL_FRAMES) {}
SampleInterval(double ts, double f) : timestamp(ts), fps(f) {}
bool operator<(const SampleInterval& itvl) const {
return (timestamp < itvl.timestamp);
}
};
class Params {
public:
// return all key-frames regardless of specified fps
bool keyFrames_ = false;
// return audio data while decoding the video
bool getAudio_ = false;
// for sampling audio data
int outrate_ = 22000;
int outfmt_ = AV_SAMPLE_FMT_FLT;
int64_t outlayout_ = AV_CH_LAYOUT_MONO;
// Output image pixel format
AVPixelFormat pixelFormat_ = AVPixelFormat::AV_PIX_FMT_RGB24;
// Index of stream to decode.
// -1 will automatically decode the first video stream.
int streamIndex_ = -1;
// How many frames to output at most from the video
// -1 no limit
int maximumOutputFrames_ = -1;
// params for video resolution
int video_res_type_ = VideoResType::USE_WIDTH_HEIGHT;
int crop_size_ = -1;
int short_edge_ = -1;
// Output video size, -1 to preserve origianl dimension
int outputWidth_ = -1;
int outputHeight_ = -1;
// max output dimension, -1 to preserve original size
// the larger dimension of the video will be scaled to this size,
// and the second dimension will be scaled to preserve aspect ratio
int maxOutputDimension_ = -1;
// params for decoding behavior
int decode_type_ = DecodeType::DO_TMP_JITTER;
int num_of_required_frame_ = -1;
// intervals_ control variable sampling fps between different timestamps
// intervals_ must be ordered strictly ascending by timestamps
// the first interval must have a timestamp of zero
// fps must be either the 3 special fps defined in SpecialFps, or > 0
std::vector<SampleInterval> intervals_ = {{0, SpecialFps::SAMPLE_ALL_FRAMES}};
Params() {}
/**
* FPS of output frames
* setting here will reset intervals_ and force decoding at target FPS
* This can be used if user just want to decode at a steady fps
*/
Params& fps(float v) {
intervals_.clear();
intervals_.emplace_back(0, v);
return *this;
}
/**
* Sample output frames at a specified list of timestamps
* Timestamps must be in increasing order, and timestamps past the end of the
* video will be ignored
* Setting here will reset intervals_
*/
Params& setSampleTimestamps(const std::vector<double>& timestamps) {
intervals_.clear();
// insert an interval per desired frame.
for (auto& timestamp : timestamps) {
intervals_.emplace_back(timestamp, SpecialFps::SAMPLE_TIMESTAMP_ONLY);
}
return *this;
}
/**
* Pixel format of output buffer, default PIX_FMT_RGB24
*/
Params& pixelFormat(AVPixelFormat pixelFormat) {
pixelFormat_ = pixelFormat;
return *this;
}
/**
* Return all key-frames
*/
Params& keyFrames(bool keyFrames) {
keyFrames_ = keyFrames;
return *this;
}
/**
* Index of video stream to process, defaults to the first video stream
*/
Params& streamIndex(int index) {
streamIndex_ = index;
return *this;
}
/**
* Only output this many frames, default to no limit
*/
Params& maxOutputFrames(int count) {
maximumOutputFrames_ = count;
return *this;
}
/**
* Output frame width, default to video width
*/
Params& outputWidth(int width) {
outputWidth_ = width;
return *this;
}
/**
* Output frame height, default to video height
*/
Params& outputHeight(int height) {
outputHeight_ = height;
return *this;
}
/**
* Max dimension of either width or height, if any is bigger
* it will be scaled down to this and econd dimension
* will be scaled down to maintain aspect ratio.
*/
Params& maxOutputDimension(int size) {
maxOutputDimension_ = size;
return *this;
}
};
// data structure for storing decoded video frames
class DecodedFrame {
public:
struct avDeleter {
void operator()(unsigned char* p) const {
av_free(p);
}
};
using AvDataPtr = std::unique_ptr<uint8_t, avDeleter>;
// decoded data buffer
AvDataPtr data_;
// size in bytes
int size_ = 0;
// frame dimensions
int width_ = 0;
int height_ = 0;
// timestamp in seconds since beginning of video
double timestamp_ = 0;
// true if this is a key frame.
bool keyFrame_ = false;
// index of frame in video
int index_ = -1;
// Sequential number of outputted frame
int outputFrameIndex_ = -1;
};
// data structure for storing decoded audio data
struct DecodedAudio {
int dataSize_;
int outSampleSize_;
std::unique_ptr<float[]> audio_data_;
explicit DecodedAudio(
int dataSize = 0,
int outSampleSize = 0,
std::unique_ptr<float[]> audio_data = nullptr)
: dataSize_(dataSize),
outSampleSize_(outSampleSize),
audio_data_(std::move(audio_data)) {}
};
class VideoIOContext {
public:
explicit VideoIOContext(const std::string& fname)
: workBuffersize_(VIO_BUFFER_SZ),
workBuffer_((uint8_t*)av_malloc(workBuffersize_)),
inputFile_(nullptr),
inputBuffer_(nullptr),
inputBufferSize_(0) {
inputFile_ = fopen(fname.c_str(), "rb");
if (inputFile_ == nullptr) {
LOG(ERROR) << "Error opening video file " << fname;
return;
}
ctx_ = avio_alloc_context(
static_cast<unsigned char*>(workBuffer_.get()),
workBuffersize_,
0,
this,
&VideoIOContext::readFile,
nullptr, // no write function
&VideoIOContext::seekFile);
}
explicit VideoIOContext(const char* buffer, int size)
: workBuffersize_(VIO_BUFFER_SZ),
workBuffer_((uint8_t*)av_malloc(workBuffersize_)),
inputFile_(nullptr),
inputBuffer_(buffer),
inputBufferSize_(size) {
ctx_ = avio_alloc_context(
static_cast<unsigned char*>(workBuffer_.get()),
workBuffersize_,
0,
this,
&VideoIOContext::readMemory,
nullptr, // no write function
&VideoIOContext::seekMemory);
}
~VideoIOContext() {
av_free(ctx_);
if (inputFile_) {
fclose(inputFile_);
}
}
int read(unsigned char* buf, int buf_size) {
if (inputBuffer_) {
return readMemory(this, buf, buf_size);
} else if (inputFile_) {
return readFile(this, buf, buf_size);
} else {
return -1;
}
}
int64_t seek(int64_t offset, int whence) {
if (inputBuffer_) {
return seekMemory(this, offset, whence);
} else if (inputFile_) {
return seekFile(this, offset, whence);
} else {
return -1;
}
}
static int readFile(void* opaque, unsigned char* buf, int buf_size) {
VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
if (feof(h->inputFile_)) {
return AVERROR_EOF;
}
size_t ret = fread(buf, 1, buf_size, h->inputFile_);
if (ret < buf_size) {
if (ferror(h->inputFile_)) {
return -1;
}
}
return ret;
}
static int64_t seekFile(void* opaque, int64_t offset, int whence) {
VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
switch (whence) {
case SEEK_CUR: // from current position
case SEEK_END: // from eof
case SEEK_SET: // from beginning of file
return fseek(h->inputFile_, static_cast<long>(offset), whence);
break;
case AVSEEK_SIZE:
int64_t cur = ftell(h->inputFile_);
fseek(h->inputFile_, 0L, SEEK_END);
int64_t size = ftell(h->inputFile_);
fseek(h->inputFile_, cur, SEEK_SET);
return size;
}
return -1;
}
static int readMemory(void* opaque, unsigned char* buf, int buf_size) {
VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
if (buf_size < 0) {
return -1;
}
int reminder = h->inputBufferSize_ - h->offset_;
int r = buf_size < reminder ? buf_size : reminder;
if (r < 0) {
return AVERROR_EOF;
}
memcpy(buf, h->inputBuffer_ + h->offset_, r);
h->offset_ += r;
return r;
}
static int64_t seekMemory(void* opaque, int64_t offset, int whence) {
VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
switch (whence) {
case SEEK_CUR: // from current position
h->offset_ += offset;
break;
case SEEK_END: // from eof
h->offset_ = h->inputBufferSize_ + offset;
break;
case SEEK_SET: // from beginning of file
h->offset_ = offset;
break;
case AVSEEK_SIZE:
return h->inputBufferSize_;
}
return h->offset_;
}
AVIOContext* get_avio() {
return ctx_;
}
private:
int workBuffersize_;
DecodedFrame::AvDataPtr workBuffer_;
// for file mode
FILE* inputFile_;
// for memory mode
const char* inputBuffer_;
int inputBufferSize_;
int offset_ = 0;
AVIOContext* ctx_;
};
struct VideoMeta {
double fps;
int width;
int height;
enum AVMediaType codec_type;
AVPixelFormat pixFormat;
VideoMeta()
: fps(-1),
width(-1),
height(-1),
codec_type(AVMEDIA_TYPE_VIDEO),
pixFormat(AVPixelFormat::AV_PIX_FMT_RGB24) {}
};
class Callback {
public:
virtual void frameDecoded(std::unique_ptr<DecodedFrame> img) = 0;
virtual void audioDecoded(
std::unique_ptr<DecodedAudio> /*decoded audio data*/) {}
virtual void videoDecodingStarted(const VideoMeta& /*videoMeta*/) {}
virtual void videoDecodingEnded(double /*lastFrameTimestamp*/) {}
virtual ~Callback() {}
};
class VideoDecoder {
public:
VideoDecoder();
void decodeFile(
const std::string& filename,
const Params& params,
const int start_frm,
Callback& callback);
void decodeMemory(
const std::string& filename,
const char* buffer,
const int size,
const Params& params,
const int start_frm,
Callback& callback);
private:
std::string ffmpegErrorStr(int result);
void ResizeAndKeepAspectRatio(
const int origWidth,
const int origHeight,
const int short_edge,
const int long_edge,
int& outWidth,
int& outHeight);
void getAudioSample(
AVPacket& packet,
AVCodecContext* audioCodecContext_,
AVFrame* audioStreamFrame_,
SwrContext* convertCtx_,
Callback& callback,
const Params& params);
void decodeLoop(
const std::string& videoName,
VideoIOContext& ioctx,
const Params& params,
const int start_frm,
Callback& callback);
};
TORCH_API void FreeDecodedData(
std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames,
std::vector<std::unique_ptr<DecodedAudio>>& sampledAudio);
TORCH_API bool DecodeMultipleClipsFromVideo(
const char* video_buffer,
const std::string& video_filename,
const int encoded_size,
const Params& params,
const int start_frm,
const int clip_per_video,
const std::vector<int>& clip_start_positions,
const bool use_local_file,
int& height,
int& width,
std::vector<unsigned char*>& buffer_rgb);
class CallbackImpl : public Callback {
public:
std::vector<std::unique_ptr<DecodedFrame>> frames;
std::vector<std::unique_ptr<DecodedAudio>> audio_samples;
explicit CallbackImpl() {
clear();
}
void clear() {
FreeDecodedData(frames, audio_samples);
}
void frameDecoded(std::unique_ptr<DecodedFrame> frame) override {
frames.push_back(std::move(frame));
}
void audioDecoded(std::unique_ptr<DecodedAudio> audio_sample) override {
audio_samples.push_back(std::move(audio_sample));
}
void videoDecodingStarted(const VideoMeta& /*videoMeta*/) override {
clear();
}
};
} // namespace caffe2
#endif // CAFFE2_VIDEO_VIDEO_DECODER_H_


@@ -1,93 +0,0 @@
#include <caffe2/video/video_input_op.h>
namespace caffe2 {
REGISTER_CPU_OPERATOR(VideoInput, VideoInputOp<CPUContext>);
OPERATOR_SCHEMA(VideoInput)
.NumInputs(0, 1)
.NumOutputs(2, 5)
.TensorInferenceFunction(
[](const OperatorDef& def,
const vector<TensorShape>& /* unused */ /*in*/) {
ArgumentHelper helper(def);
int batch_size = helper.GetSingleArgument<int>("batch_size", 0);
int clip_per_video =
helper.GetSingleArgument<int>("clip_per_video", 1);
int crop_size = helper.GetSingleArgument<int>("crop_size", -1);
int length_rgb = helper.GetSingleArgument<int>("length_rgb", 0);
int channels_rgb = helper.GetSingleArgument<int>("channels_rgb", 3);
int length_of = helper.GetSingleArgument<int>("length_of", 0);
int channels_of = helper.GetSingleArgument<int>("channels_of", 2);
// get the flags
bool get_rgb = helper.GetSingleArgument<bool>("get_rgb", true);
bool get_optical_flow =
helper.GetSingleArgument<bool>("get_optical_flow", false);
bool do_multi_label =
helper.GetSingleArgument<bool>("do_multi_label", false);
bool get_video_id =
helper.GetSingleArgument<bool>("get_video_id", false);
bool get_start_frame =
helper.GetSingleArgument<bool>("get_start_frame", false);
// get starting positions if available
vector<int> clip_start_positions =
helper.GetRepeatedArgument<int>("clip_start_positions", {});
// In case clip_start_positions are given, set the clip_per_video arg
if (clip_start_positions.size() > 0) {
clip_per_video = clip_start_positions.size();
}
int output_size = 1;
if (get_rgb) {
output_size++;
}
if (get_optical_flow) {
output_size++;
}
if (get_video_id) {
output_size++;
}
if (get_start_frame) {
output_size++;
}
int index = 0;
vector<TensorShape> out(output_size);
TORCH_CHECK_GT(crop_size, 0);
batch_size *= clip_per_video;
if (get_rgb) {
out[index++] = CreateTensorShape(
vector<int>{
batch_size, channels_rgb, length_rgb, crop_size, crop_size},
TensorProto::FLOAT);
}
if (get_optical_flow) {
out[index++] = CreateTensorShape(
vector<int>{
batch_size, channels_of, length_of, crop_size, crop_size},
TensorProto::FLOAT);
}
if (!do_multi_label) {
out[index++] = CreateTensorShape(
vector<int>{1, batch_size}, TensorProto::INT32);
} else {
int num_of_class = helper.GetSingleArgument<int>("num_of_class", 0);
out[index++] = CreateTensorShape(
vector<int>{batch_size, num_of_class}, TensorProto::INT32);
}
if (get_video_id) {
out[index++] = CreateTensorShape(
vector<int64_t>{1, batch_size}, TensorProto::INT64);
}
if (get_start_frame) {
out[index] = CreateTensorShape(
vector<int>{1, batch_size}, TensorProto::INT32);
}
return out;
});
NO_GRADIENT(VideoInput);
} // namespace caffe2

File diff suppressed because it is too large.


@@ -1,9 +0,0 @@
#include <caffe2/core/common_gpu.h>
#include <caffe2/core/context_gpu.h>
#include <caffe2/video/video_input_op.h>
namespace caffe2 {
REGISTER_CUDA_OPERATOR(VideoInput, VideoInputOp<CUDAContext>);
} // namespace caffe2


@@ -1,210 +0,0 @@
#include <caffe2/core/logging.h>
#include <caffe2/video/video_io.h>
#include <algorithm>
#include <random>
#include <string>
namespace caffe2 {
void ClipTransformRGB(
const unsigned char* buffer_rgb,
const int crop_size,
const int length_rgb,
const int channels_rgb,
const int sampling_rate_rgb,
const int height,
const int width,
const int h_off,
const int w_off,
const bool mirror_me,
const std::vector<float>& mean_rgb,
const std::vector<float>& inv_std_rgb,
float* transformed_clip) {
// The order of output dimensions is C, L, H, W
int orig_index, tran_index;
for (int c = 0; c < channels_rgb; ++c) {
for (int l = 0; l < length_rgb; ++l) {
int orig_index_l = l * sampling_rate_rgb * height * width * channels_rgb;
int tran_index_l = (c * length_rgb + l) * crop_size;
for (int h = 0; h < crop_size; ++h) {
int orig_index_h = orig_index_l + (h + h_off) * width * channels_rgb;
int tran_index_h = (tran_index_l + h) * crop_size;
for (int w = 0; w < crop_size; ++w) {
orig_index = orig_index_h + (w + w_off) * channels_rgb + c;
// mirror the frame
if (mirror_me) {
tran_index = tran_index_h + (crop_size - 1 - w);
} else {
tran_index = tran_index_h + w;
}
// normalize and transform the clip
transformed_clip[tran_index] =
(buffer_rgb[orig_index] - mean_rgb[c]) * inv_std_rgb[c];
}
}
}
}
}
void ClipTransformOpticalFlow(
const unsigned char* buffer_rgb,
const int crop_size,
const int length_of,
const int channels_of,
const int sampling_rate_of,
const int height,
const int width,
const cv::Rect& rect,
const int channels_rgb,
const bool mirror_me,
const int flow_alg_type,
const int flow_data_type,
const int frame_gap_of,
const bool do_flow_aggregation,
const std::vector<float>& mean_of,
const std::vector<float>& inv_std_of,
float* transformed_clip) {
const int frame_size = crop_size * crop_size;
const int channel_size_flow = length_of * frame_size;
  // for getting the mean and std of the input data
bool extract_statistics = false;
static std::vector<double> mean_static(channels_of, 0.f);
static std::vector<double> std_static(channels_of, 0.f);
static long long count = 0;
cv::Scalar mean_img, std_img;
for (int l = 0; l < length_of; l++) {
// get the grayscale frames
std::vector<cv::Mat> grays, rgbs;
int step_size = do_flow_aggregation ? 1 : frame_gap_of;
for (int j = 0; j <= frame_gap_of; j += step_size) {
// get the current frame
const unsigned char* curr_frame = buffer_rgb +
(l * sampling_rate_of + j) * height * width * channels_rgb;
cv::Mat img = cv::Mat::zeros(height, width, CV_8UC3);
memcpy(
img.data,
curr_frame,
height * width * channels_rgb * sizeof(unsigned char));
// crop and mirror the frame
cv::Mat img_cropped = img(rect);
if (mirror_me) {
cv::flip(img_cropped, img_cropped, 1);
}
cv::Mat gray;
cv::cvtColor(img_cropped, gray, cv::COLOR_RGB2GRAY);
grays.push_back(gray);
rgbs.push_back(img_cropped);
}
cv::Mat first_gray, first_rgb;
cv::Mat flow = cv::Mat::zeros(crop_size, crop_size, CV_32FC2);
MultiFrameOpticalFlowExtractor(grays, flow_alg_type, flow);
std::vector<cv::Mat> imgs;
cv::split(flow, imgs);
// save the 2-channel optical flow first
int c = 0;
for (; c < 2; c++) {
if (extract_statistics) {
cv::meanStdDev(imgs[c], mean_img, std_img);
mean_static[c] += mean_img[0];
std_static[c] += std_img[0];
}
imgs[c] -= mean_of[c];
imgs[c] *= inv_std_of[c];
memcpy(
transformed_clip + c * channel_size_flow + l * frame_size,
imgs[c].data,
frame_size * sizeof(float));
}
cv::Mat mag;
std::vector<cv::Mat> chans;
// augment the optical flow with more channels
switch (flow_data_type) {
case FlowDataType::Flow2C:
// nothing to do if we only need two channels
break;
case FlowDataType::Flow3C:
// use magnitude as the third channel
mag = cv::abs(imgs[0]) + cv::abs(imgs[1]);
if (extract_statistics) {
cv::meanStdDev(mag, mean_img, std_img);
mean_static[c] += mean_img[0];
std_static[c] += std_img[0];
}
mag -= mean_of[c];
mag *= inv_std_of[c];
memcpy(
transformed_clip + c * channel_size_flow + l * frame_size,
mag.data,
frame_size * sizeof(float));
break;
case FlowDataType::FlowWithGray:
// add grayscale image as the third channel
grays[0].convertTo(first_gray, CV_32FC1);
if (extract_statistics) {
cv::meanStdDev(first_gray, mean_img, std_img);
mean_static[c] += mean_img[0];
std_static[c] += std_img[0];
}
first_gray -= mean_of[c];
first_gray *= inv_std_of[c];
memcpy(
transformed_clip + c * channel_size_flow + l * frame_size,
first_gray.data,
frame_size * sizeof(float));
break;
case FlowDataType::FlowWithRGB:
// add all three rgb channels
rgbs[0].convertTo(first_rgb, CV_32FC3);
cv::split(first_rgb, chans);
for (; c < channels_of; c++) {
if (extract_statistics) {
cv::meanStdDev(chans[c - 2], mean_img, std_img);
mean_static[c] += mean_img[0];
std_static[c] += std_img[0];
}
chans[c - 2] -= mean_of[c];
chans[c - 2] *= inv_std_of[c];
memcpy(
transformed_clip + c * channel_size_flow + l * frame_size,
chans[c - 2].data,
frame_size * sizeof(float));
}
break;
default:
LOG(ERROR) << "Unsupported optical flow data type " << flow_data_type;
break;
}
if (extract_statistics) {
count++;
if (count % 1000 == 1) {
for (int i = 0; i < channels_of; i++) {
LOG(INFO) << i
<< "-th channel mean: " << mean_static[i] / float(count)
<< " std: " << std_static[i] / float(count);
}
}
}
}
}
} // namespace caffe2
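Removing ClipTransformRGB above drops the interleaved-HWC-uint8 to planar-CLHW-float clip conversion (crop, optional horizontal mirror, per-channel normalization). A minimal self-contained sketch of that index mapping, under the simplifying assumptions that sampling_rate is 1 and h_off = w_off = 0 (i.e. the crop covers the full frame):

```cpp
// Sketch of the removed layout transform: interleaved HWC uint8 frames ->
// planar CLHW float clip, with optional width mirroring and per-channel
// (x - mean) * inv_std normalization. Simplified: no temporal subsampling,
// no spatial crop offset.
#include <cstdint>
#include <vector>

void HWCFramesToCLHWClip(
    const std::vector<uint8_t>& frames,  // length * height * width * channels
    int length, int height, int width, int channels,
    const std::vector<float>& mean, const std::vector<float>& inv_std,
    bool mirror, std::vector<float>& clip) {
  clip.assign(static_cast<size_t>(channels) * length * height * width, 0.f);
  for (int c = 0; c < channels; ++c) {
    for (int l = 0; l < length; ++l) {
      for (int h = 0; h < height; ++h) {
        for (int w = 0; w < width; ++w) {
          const int src = ((l * height + h) * width + w) * channels + c;
          const int dst_w = mirror ? (width - 1 - w) : w;
          const int dst = ((c * length + l) * height + h) * width + dst_w;
          clip[dst] = (frames[src] - mean[c]) * inv_std[c];
        }
      }
    }
  }
}
```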

View File

@ -1,51 +0,0 @@
#ifndef CAFFE2_VIDEO_VIDEO_IO_H_
#define CAFFE2_VIDEO_VIDEO_IO_H_
#include <caffe2/core/common.h>
#include <caffe2/video/optical_flow.h>
#include <caffe2/video/video_decoder.h>
#include <opencv2/opencv.hpp>
#include <random>
#include <istream>
#include <ostream>
namespace caffe2 {
TORCH_API void ClipTransformRGB(
const unsigned char* buffer_rgb,
const int crop_size,
const int length_rgb,
const int channels_rgb,
const int sampling_rate_rgb,
const int height,
const int width,
const int h_off,
const int w_off,
const bool mirror_me,
const std::vector<float>& mean_rgb,
const std::vector<float>& inv_std_rgb,
float* transformed_clip);
TORCH_API void ClipTransformOpticalFlow(
const unsigned char* buffer_rgb,
const int crop_size,
const int length_of,
const int channels_of,
const int sampling_rate_of,
const int height,
const int width,
const cv::Rect& rect,
const int channels_rgb,
const bool mirror_me,
const int flow_alg_type,
const int flow_data_type,
const int frame_gap_of,
const bool do_flow_aggregation,
const std::vector<float>& mean_of,
const std::vector<float>& inv_std_of,
float* transformed_clip);
} // namespace caffe2
#endif // CAFFE2_VIDEO_VIDEO_IO_H_
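Because the header above is deleted, any remaining callers of ClipTransformRGB must be migrated or removed. For context, a hypothetical call matching the removed declaration (the include no longer exists after this change, and all sizes and normalization constants below are made up for illustration):

```cpp
// Hypothetical caller of the now-removed caffe2::ClipTransformRGB.
#include <caffe2/video/video_io.h>  // removed by this PR
#include <vector>

int main() {
  const int length = 8, height = 240, width = 320, channels = 3, crop = 112;
  std::vector<unsigned char> buffer_rgb(length * height * width * channels, 0);
  std::vector<float> clip(channels * length * crop * crop);
  const std::vector<float> mean = {110.2f, 100.6f, 95.9f};               // assumed values
  const std::vector<float> inv_std = {1 / 58.4f, 1 / 57.1f, 1 / 57.4f};  // assumed values
  caffe2::ClipTransformRGB(
      buffer_rgb.data(), crop, length, channels,
      /*sampling_rate_rgb=*/1, height, width,
      /*h_off=*/64, /*w_off=*/104, /*mirror_me=*/false,
      mean, inv_std, clip.data());
  return 0;
}
```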

View File

@ -932,45 +932,6 @@ if(USE_REDIS)
endif()
endif()
# ---[ OpenCV
if(USE_OPENCV)
# OpenCV 4
find_package(OpenCV 4 QUIET COMPONENTS core highgui imgproc imgcodecs optflow videoio video)
if(NOT OpenCV_FOUND)
# OpenCV 3
find_package(OpenCV 3 QUIET COMPONENTS core highgui imgproc imgcodecs videoio video)
if(NOT OpenCV_FOUND)
# OpenCV 2
find_package(OpenCV QUIET COMPONENTS core highgui imgproc)
endif()
endif()
if(OpenCV_FOUND)
include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS})
list(APPEND Caffe2_DEPENDENCY_LIBS ${OpenCV_LIBS})
if(MSVC AND USE_CUDA)
list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${OpenCV_LIBS})
endif()
message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})")
else()
message(WARNING "Not compiling with OpenCV. Suppress this warning with -DUSE_OPENCV=OFF")
caffe2_update_option(USE_OPENCV OFF)
endif()
endif()
# ---[ FFMPEG
if(USE_FFMPEG)
find_package(FFmpeg REQUIRED)
if(FFMPEG_FOUND)
message("Found FFMPEG/LibAV libraries")
include_directories(SYSTEM ${FFMPEG_INCLUDE_DIR})
list(APPEND Caffe2_DEPENDENCY_LIBS ${FFMPEG_LIBRARIES})
else()
message("Not compiling with FFmpeg. Suppress this warning with -DUSE_FFMPEG=OFF")
caffe2_update_option(USE_FFMPEG OFF)
endif()
endif()
if(USE_ITT)
find_package(ITT)
if(ITT_FOUND)

View File

@ -1,71 +0,0 @@
# - Try to find ffmpeg libraries
# (libavcodec, libavformat, libavutil, libswscale)
# Once done this will define
#
# FFMPEG_FOUND - system has ffmpeg or libav
# FFMPEG_INCLUDE_DIR - the ffmpeg include directory
# FFMPEG_LIBRARIES - Link these to use ffmpeg
#
if (FFMPEG_LIBRARIES AND FFMPEG_INCLUDE_DIR)
# in cache already
set(FFMPEG_FOUND TRUE)
else (FFMPEG_LIBRARIES AND FFMPEG_INCLUDE_DIR)
find_path(FFMPEG_AVCODEC_INCLUDE_DIR
NAMES libavcodec/avcodec.h
PATHS ${_FFMPEG_AVCODEC_INCLUDE_DIRS} /usr/include /usr/local/include /opt/local/include /sw/include
PATH_SUFFIXES ffmpeg libav
)
find_library(FFMPEG_LIBAVCODEC
NAMES avcodec
PATHS ${_FFMPEG_AVCODEC_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
)
find_library(FFMPEG_LIBAVFORMAT
NAMES avformat
PATHS ${_FFMPEG_AVFORMAT_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
)
find_library(FFMPEG_LIBAVUTIL
NAMES avutil
PATHS ${_FFMPEG_AVUTIL_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
)
find_library(FFMPEG_LIBSWSCALE
NAMES swscale
PATHS ${_FFMPEG_SWSCALE_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
)
find_library(FFMPEG_LIBSWRESAMPLE
NAMES swresample
PATHS ${_FFMPEG_SWSCALE_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
)
if (FFMPEG_LIBAVCODEC AND FFMPEG_LIBAVFORMAT)
set(FFMPEG_FOUND TRUE)
endif()
if (FFMPEG_FOUND)
set(FFMPEG_INCLUDE_DIR ${FFMPEG_AVCODEC_INCLUDE_DIR})
set(FFMPEG_LIBRARIES
${FFMPEG_LIBAVCODEC}
${FFMPEG_LIBAVFORMAT}
${FFMPEG_LIBAVUTIL}
${FFMPEG_LIBSWSCALE}
${FFMPEG_LIBSWRESAMPLE}
)
if (NOT FFMPEG_FIND_QUIETLY)
message(STATUS "Found FFMPEG or Libav: ${FFMPEG_LIBRARIES}, ${FFMPEG_INCLUDE_DIR}")
endif (NOT FFMPEG_FIND_QUIETLY)
else (FFMPEG_FOUND)
if (FFMPEG_FIND_REQUIRED)
message(FATAL_ERROR "Could not find libavcodec or libavformat or libavutil")
endif (FFMPEG_FIND_REQUIRED)
endif (FFMPEG_FOUND)
endif (FFMPEG_LIBRARIES AND FFMPEG_INCLUDE_DIR)

View File

@ -128,7 +128,6 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_FBGEMM : ${USE_FBGEMM}")
message(STATUS " USE_FAKELOWP : ${USE_FAKELOWP}")
message(STATUS " USE_KINETO : ${USE_KINETO}")
message(STATUS " USE_FFMPEG : ${USE_FFMPEG}")
message(STATUS " USE_GFLAGS : ${USE_GFLAGS}")
message(STATUS " USE_GLOG : ${USE_GLOG}")
message(STATUS " USE_LEVELDB : ${USE_LEVELDB}")
@ -164,10 +163,6 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_NUMPY : ${USE_NUMPY}")
message(STATUS " USE_OBSERVERS : ${USE_OBSERVERS}")
message(STATUS " USE_OPENCL : ${USE_OPENCL}")
message(STATUS " USE_OPENCV : ${USE_OPENCV}")
if(${USE_OPENCV})
message(STATUS " OpenCV version : ${OpenCV_VERSION}")
endif()
message(STATUS " USE_OPENMP : ${USE_OPENMP}")
message(STATUS " USE_TBB : ${USE_TBB}")
if(${USE_TBB})