Remove caffe2 image and video (#125045)
This PR decomposes https://github.com/pytorch/pytorch/pull/122527 into a smaller change: the Caffe2 image and video folders are removed, along with the related CMake code. Of note, this work was inspired by and co-developed with @r-barnes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/125045 Approved by: https://github.com/eqy, https://github.com/albanD
CMakeLists.txt
@@ -228,7 +228,6 @@ option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
option(USE_KINETO "Use Kineto profiling library" ON)
option(USE_CUPTI_SO "Use CUPTI as a shared library" ON)
option(USE_FAKELOWP "Use FakeLowp operators" OFF)
option(USE_FFMPEG "Use ffmpeg" OFF)
option(USE_GFLAGS "Use GFLAGS" OFF)
option(USE_GLOG "Use GLOG" OFF)
option(USE_LEVELDB "Use LEVELDB" OFF)
@@ -264,7 +263,6 @@ cmake_dependent_option(
option(USE_NUMPY "Use NumPy" ON)
option(USE_OBSERVERS "Use observers module." OFF)
option(USE_OPENCL "Use OpenCL" OFF)
option(USE_OPENCV "Use OpenCV" OFF)
option(USE_OPENMP "Use OpenMP for parallel code" ON)
option(USE_PRECOMPILED_HEADERS "Use pre-compiled headers to accelerate build." OFF)
caffe2/CMakeLists.txt
@@ -125,8 +125,6 @@ if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
  add_subdirectory(db)
  add_subdirectory(distributed)
  add_subdirectory(ideep)
  add_subdirectory(image)
  add_subdirectory(video)
  add_subdirectory(mobile)
  add_subdirectory(mpi)
  add_subdirectory(observers)
caffe2/image/CMakeLists.txt
@@ -1,57 +0,0 @@
if(USE_OPENCV AND OpenCV_FOUND)
  message(STATUS "Including image processing operators")
  # ---[ GPU files
  # ------[ general GPU
  file(GLOB tmp *_gpu.cc)
  set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
  # ------[ CUDA sources
  file(GLOB tmp *.cu)
  set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
  # exclude test files
  file(GLOB tmp *_test.cc)
  exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp})

  # ---[ HIP files
  # ------[ general HIP
  file(GLOB tmp hip/*.cc)
  set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
  # ------[ HIP sources
  file(GLOB tmp hip/*.hip)
  set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
  # exclude test files
  file(GLOB tmp hip/*_test.cc)
  exclude(Caffe2_HIP_SRCS "${Caffe2_HIP_SRCS}" ${tmp})

  # ---[ CPU files.
  file(GLOB tmp *.cc)
  set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp})
  # exclude test files and gpu files
  file(GLOB tmp *_test.cc)
  exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
  exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS})
  exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_HIP_SRCS})

  # ---[ GPU test files
  file(GLOB tmp *_gpu_test.cc)
  set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} ${tmp})

  # ---[ HIP test files
  file(GLOB tmp hip/*_test.cc)
  set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} ${tmp})

  # ---[ CPU test files
  file(GLOB tmp *_test.cc)
  set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp})
  exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_GPU_TEST_SRCS})
  exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_HIP_TEST_SRCS})

  # ---[ Send the lists to the parent scope.
  set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
  set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
  set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
  set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
  set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
  set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)
else()
  message(STATUS "Excluding image processing operators due to no opencv")
endif()
caffe2/image/image_input_op.cc
@@ -1,167 +0,0 @@
#include "caffe2/image/image_input_op.h"

#ifdef USE_MKLDNN
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
#include <caffe2/ideep/utils/ideep_operator.h>
#endif

namespace caffe2 {

template <>
bool ImageInputOp<CPUContext>::ApplyTransformOnGPU(
    const std::vector<std::int64_t>&,
    const c10::Device&) {
  return false;
}

REGISTER_CPU_OPERATOR(ImageInput, ImageInputOp<CPUContext>);

OPERATOR_SCHEMA(ImageInput)
    .NumInputs(0, 1)
    .NumOutputs(2, INT_MAX)
    .TensorInferenceFunction([](const OperatorDef& def,
                                const vector<TensorShape>& /* unused */) {
      vector<TensorShape> out(2);
      ArgumentHelper helper(def);
      int batch_size = helper.GetSingleArgument<int>("batch_size", 0);
      int crop = helper.GetSingleArgument<int>("crop", -1);
      int color = helper.GetSingleArgument<int>("color", 1);
      TORCH_CHECK_GT(crop, 0);
      out[0] = CreateTensorShape(
          vector<int>{batch_size, crop, crop, color ? 3 : 1},
          TensorProto::FLOAT);
      out[1] =
          CreateTensorShape(vector<int>{1, batch_size}, TensorProto::INT32);
      return out;
    })
    .SetDoc(R"DOC(
Imports and processes images from a database. For each run of the operator,
batch_size images will be processed. GPUs can optionally be used for
part of the processing.

The following transformations are applied to the image
  - A bounding box is applied to the initial image (optional)
  - The image is rescaled either up or down (with the scale argument) or
    just up (with the minsize argument)
  - The image is randomly cropped (the crop size is passed as an argument, but
    the location of the crop is random, except if is_test is passed, in which
    case the image is cropped at the center)
  - The image is normalized. Each of its color channels can have separate
    normalization values

The dimension of the output image will always be crop x crop
)DOC")
    .Arg(
        "batch_size",
        "Number of images to output for each run of the operator"
        ". Must be 1 or greater")
    .Arg("color", "Number of color channels (1 or 3). Defaults to 1")
    .Arg("color_jitter", "Whether or not to do color jitter. Defaults to 0")
    .Arg(
        "img_saturation",
        "Image saturation scale used in color jittering. "
        "Defaults to 0.4")
    .Arg(
        "img_brightness",
        "Image brightness scale used in color jittering. "
        "Defaults to 0.4")
    .Arg(
        "img_contrast",
        "Image contrast scale used in color jittering. "
        "Defaults to 0.4")
    .Arg(
        "color_lighting",
        "Whether or not to do color lighting."
        " Defaults to 0")
    .Arg(
        "color_lighting_std",
        "Std of normal distribution where color lighting"
        " scaling factor is sampled. Defaults to 0.1")
    .Arg(
        "scale_jitter_type",
        "Type 0: No scale jittering "
        "Type 1: Inception-style scale jittering")
    .Arg(
        "label_type",
        "Type 0: single integer label for multi-class "
        "classification. Type 1: sparse active label indices for multi-label "
        "classification. Type 2: dense label embedding vector for label "
        "embedding regression")
    .Arg(
        "scale",
        "Scale the size of the smallest dimension of the image to"
        " this. Scale and minsize are mutually exclusive."
        " Must be larger than crop")
    .Arg(
        "minsize",
        "Scale the size of the smallest dimension of the image to"
        " this only if the size is initially smaller. Scale and minsize are"
        " mutually exclusive. Must be larger than crop.")
    .Arg(
        "warp",
        "If 1, both dimensions of the image will be set to minsize or"
        " scale; otherwise, the other dimension is proportionally scaled."
        " Defaults to 0")
    .Arg("crop", "Size to crop the image to. Must be provided")
    .Arg("mirror", "Whether or not to mirror the image. Defaults to 0")
    .Arg(
        "mean",
        "Mean by which to normalize color channels."
        " Defaults to 0.")
    .Arg(
        "mean_per_channel",
        "Vector of means per color channel "
        " (1 or 3 elements). Defaults to mean argument. Channel order BGR")
    .Arg(
        "std",
        "Standard deviation by which to normalize color channels."
        " Defaults to 1.")
    .Arg(
        "std_per_channel",
        "Vector of standard dev. per color channel "
        " (1 or 3 elements). Defaults to std argument. Channel order is BGR")
    .Arg("bounding_ymin", "Bounding box coordinate. Defaults to -1 (none)")
    .Arg("bounding_xmin", "Bounding box coordinate. Defaults to -1 (none)")
    .Arg("bounding_height", "Bounding box coordinate. Defaults to -1 (none)")
    .Arg("bounding_width", "Bounding box coordinate. Defaults to -1 (none)")
    .ArgIsTest("Set to 1 to do deterministic cropping. Defaults to 0")
    .Arg("use_caffe_datum", "1 if the input is in Caffe format. Defaults to 0")
    .Arg(
        "use_gpu_transform",
        "1 if GPU acceleration should be used."
        " Defaults to 0. Can only be 1 in a CUDAContext")
    .Arg(
        "decode_threads",
        "Number of CPU decode/transform threads."
        " Defaults to 4")
    .Arg("output_type", "If gpu_transform, can set to FLOAT or FLOAT16.")
    .Arg("db", "Name of the database (if not passed as input)")
    .Arg(
        "db_type",
        "Type of database (if not passed as input)."
        " Defaults to leveldb")
    .Arg(
        "output_sizes",
        "The sizes of any outputs besides the data and label "
        "(should have a number of elements equal to the number of additional "
        "outputs)")
    .Arg(
        "random_scale",
        "[min, max] shortest-side desired for image resize. "
        "Defaults to [-1, -1] or no random resize desired.")
    .Input(0, "reader", "The input reader (a db::DBReader)")
    .Output(0, "data", "Tensor containing the images")
    .Output(1, "label", "Tensor containing the labels")
    .Output(
        2,
        "additional outputs",
        "Any outputs after the first 2 will be "
        "Tensors read from the input TensorProtos");

NO_GRADIENT(ImageInput);

#ifdef USE_MKLDNN
REGISTER_IDEEP_OPERATOR(ImageInput, IDEEPFallbackOp<ImageInputOp<CPUContext>>);
#endif

} // namespace caffe2
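As an aside, the shape contract encoded by the schema's TensorInferenceFunction above is compact enough to restate standalone. A minimal sketch (the helper name is hypothetical, not part of the operator):

#include <vector>

// Hypothetical restatement of the ImageInput shape inference above:
// output 0 is batch_size x crop x crop x channels (NHWC float data),
// output 1 is a {1, batch_size} INT32 label tensor.
std::vector<std::vector<int>> InferImageInputShapes(int batch_size, int crop, bool color) {
  const int channels = color ? 3 : 1;
  return {{batch_size, crop, crop, channels}, {1, batch_size}};
}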
(File diff suppressed because it is too large.)
caffe2/image/image_input_op_gpu.cc
@@ -1,38 +0,0 @@
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/image/image_input_op.h"

namespace caffe2 {

template <>
bool ImageInputOp<CUDAContext>::ApplyTransformOnGPU(
    const std::vector<std::int64_t>& dims,
    const c10::Device& type) {
  // GPU transform kernel allows explicitly setting output type
  if (output_type_ == TensorProto_DataType_FLOAT) {
    auto* image_output =
        OperatorBase::OutputTensor(0, dims, at::dtype<float>().device(type));
    TransformOnGPU<uint8_t, float, CUDAContext>(
        prefetched_image_on_device_,
        image_output,
        mean_gpu_,
        std_gpu_,
        &context_);
  } else if (output_type_ == TensorProto_DataType_FLOAT16) {
    auto* image_output =
        OperatorBase::OutputTensor(0, dims, at::dtype<at::Half>().device(type));
    TransformOnGPU<uint8_t, at::Half, CUDAContext>(
        prefetched_image_on_device_,
        image_output,
        mean_gpu_,
        std_gpu_,
        &context_);
  } else {
    return false;
  }
  return true;
}

REGISTER_CUDA_OPERATOR(ImageInput, ImageInputOp<CUDAContext>);

} // namespace caffe2
caffe2/image/transform_gpu.cu
@@ -1,85 +0,0 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/image/transform_gpu.h"
#include "caffe2/utils/conversions.h"

/**
 *
 * Copyright (c) 2016, NVIDIA CORPORATION, All rights reserved
 * Distributed under 2-clause BSD license; see accompanying LICENSE file
 *
 **/

namespace caffe2 {

namespace {

// input in (uint8, NHWC), output in (fp32 or fp16, NCHW)
template <typename In, typename Out>
__global__ void transform_kernel(
    const int C,
    const int H,
    const int W,
    const float* mean,
    const float* std,
    const In* in,
    Out* out) {
  const auto n = blockIdx.x;

  const auto nStride = C * H * W;

  // pointers to data for this image
  const In* const input_ptr = &in[n * nStride];
  Out* const output_ptr = &out[n * nStride];

  // either read or write uncoalesced - try reading
  for (int c = 0; c < C; ++c) {
    for (int h = threadIdx.y; h < H; h += blockDim.y) {
      for (int w = threadIdx.x; w < W; w += blockDim.x) {
        const int in_idx = c + C * w + C * W * h; // HWC
        const int out_idx = c * H * W + h * W + w; // CHW

        output_ptr[out_idx] = convert::To<float, Out>(
            (convert::To<In, float>(input_ptr[in_idx]) - mean[c]) * std[c]);
      }
    }
  }
}

} // namespace

template <typename T_IN, typename T_OUT, class Context>
bool TransformOnGPU(
    Tensor& X,
    Tensor* Y,
    Tensor& mean,
    Tensor& std,
    Context* context) {
  const int N = X.dim32(0), C = X.dim32(3), H = X.dim32(1), W = X.dim32(2);
  auto* input_data = X.template data<T_IN>();
  auto* output_data = Y->template mutable_data<T_OUT>();

  transform_kernel<
      T_IN, T_OUT><<<N, dim3(16, 16), 0, context->cuda_stream()>>>(
      C, H, W, mean.template data<float>(), std.template data<float>(),
      input_data, output_data);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}

template bool TransformOnGPU<uint8_t, float, CUDAContext>(
    Tensor& X,
    Tensor* Y,
    Tensor& mean,
    Tensor& std,
    CUDAContext* context);

template bool TransformOnGPU<uint8_t, at::Half, CUDAContext>(
    Tensor& X,
    Tensor* Y,
    Tensor& mean,
    Tensor& std,
    CUDAContext* context);

} // namespace caffe2
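The kernel above fuses the NHWC-to-NCHW layout conversion with normalization in one pass. A minimal CPU sketch of the per-pixel math (hypothetical helper; note the kernel multiplies by std[c], which suggests callers pass reciprocal standard deviations):

#include <cstdint>

// One pixel of the transform: NHWC input at (h, w, c) maps to NCHW output
// at (c, h, w), normalized as (x - mean[c]) * std[c].
float TransformPixel(const uint8_t* in, int C, int W,
                     int h, int w, int c,
                     const float* mean, const float* std) {
  const int in_idx = c + C * w + C * W * h; // HWC index, as in the kernel
  return (static_cast<float>(in[in_idx]) - mean[c]) * std[c];
}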
caffe2/image/transform_gpu.h
@@ -1,43 +0,0 @@
#ifndef CAFFE2_IMAGE_TRANSFORM_GPU_H_
#define CAFFE2_IMAGE_TRANSFORM_GPU_H_

/**
 *
 * Copyright (c) 2016, NVIDIA CORPORATION, All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **/

#include "caffe2/core/context.h"

namespace caffe2 {

template <typename T_IN, typename T_OUT, class Context>
bool TransformOnGPU(
    Tensor& X,
    Tensor* Y,
    Tensor& mean,
    Tensor& std,
    Context* context);

} // namespace caffe2

#endif // CAFFE2_IMAGE_TRANSFORM_GPU_H_
caffe2/video/CMakeLists.txt
@@ -1,59 +0,0 @@
if(USE_OPENCV AND OpenCV_FOUND AND USE_FFMPEG AND FFMPEG_FOUND)
  message(STATUS "Including video processing operators")
  # ---[ GPU files
  # ------[ general GPU
  file(GLOB tmp *_gpu.cc)
  set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
  # ------[ CUDA sources
  file(GLOB tmp *.cu)
  set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
  # exclude test files
  file(GLOB tmp *_test.cc)
  exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp})

  # ---[ HIP files
  # ------[ general HIP
  file(GLOB tmp hip/*.cc)
  set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
  # ------[ HIP sources
  file(GLOB tmp hip/*.hip)
  set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
  # exclude test files
  file(GLOB tmp hip/*_test.cc)
  exclude(Caffe2_HIP_SRCS "${Caffe2_HIP_SRCS}" ${tmp})

  # ---[ CPU files.
  file(GLOB tmp *.cc)
  set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp})
  # exclude test files and gpu files
  file(GLOB tmp *_test.cc)
  exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
  exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS})
  exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_HIP_SRCS})

  # ---[ GPU test files
  file(GLOB tmp *_gpu_test.cc)
  set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} ${tmp})

  # ---[ HIP test files
  file(GLOB tmp hip/*_test.cc)
  set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} ${tmp})

  # ---[ CPU test files
  file(GLOB tmp *_test.cc)
  set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp})
  exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}"
          ${Caffe2_GPU_TEST_SRCS})
  exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}"
          ${Caffe2_HIP_TEST_SRCS})

  # ---[ Send the lists to the parent scope.
  set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
  set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
  set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
  set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
  set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
  set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)
else()
  message(STATUS "Excluding video processing operators due to no opencv or ffmpeg")
endif()
caffe2/video/optical_flow.cc
@@ -1,85 +0,0 @@
#include <caffe2/video/optical_flow.h>

namespace caffe2 {

void OpticalFlowExtractor(
    const cv::Mat& prev_gray,
    const cv::Mat& curr_gray,
    const int flow_alg_type,
    cv::Mat& flow) {
#if CV_MAJOR_VERSION >= 4
  cv::Ptr<cv::DISOpticalFlow> tvl1 = cv::DISOpticalFlow::create();
#else
  cv::Ptr<cv::DualTVL1OpticalFlow> tvl1 = cv::DualTVL1OpticalFlow::create();
#endif
  switch (flow_alg_type) {
    case FLowAlgType::FarnebackOpticalFlow:
      cv::calcOpticalFlowFarneback(
          prev_gray,
          curr_gray,
          flow,
          std::sqrt(2) / 2.0,
          5,
          10,
          2,
          7,
          1.5,
          cv::OPTFLOW_FARNEBACK_GAUSSIAN);
      break;
    case FLowAlgType::DensePyrLKOpticalFlow:
      LOG(ERROR) << "DensePyrLKOpticalFlow only has sparse version on CPU";
      break;
    case FLowAlgType::BroxOpticalFlow:
      LOG(ERROR) << "BroxOpticalFlow on CPU is not available";
      break;
    case FLowAlgType::OpticalFlowDual_TVL1:
      tvl1->calc(prev_gray, curr_gray, flow);
      break;
    default:
      LOG(ERROR) << "Unsupported optical flow type " << flow_alg_type;
      break;
  }
}

void MergeOpticalFlow(cv::Mat& prev_flow, const cv::Mat& curr_flow) {
  const int rows = prev_flow.rows;
  const int cols = prev_flow.cols;

  // merge the two optical flows into one
  for (int y = 0; y < rows; y++) {
    for (int x = 0; x < cols; x++) {
      cv::Point2f u = prev_flow.at<cv::Point2f>(y, x);
      // get the new location
      int x_new = std::min(cols - 1, std::max(0, cvRound(u.x + x)));
      int y_new = std::min(rows - 1, std::max(0, cvRound(u.y + y)));
      cv::Point2f u_new = curr_flow.at<cv::Point2f>(y_new, x_new);

      // update the flow
      prev_flow.at<cv::Point2f>(y, x) += u_new;
    }
  }
}

void MultiFrameOpticalFlowExtractor(
    const std::vector<cv::Mat>& grays,
    const int optical_flow_alg_type,
    cv::Mat& flow) {
  int num_frames = grays.size();
  CAFFE_ENFORCE_GE(num_frames, 2, "need at least 2 frames!");

  // compute the optical flow for every pair of consecutive frames
  std::vector<cv::Mat> flows;
  for (int i = 0; i < num_frames - 1; i++) {
    cv::Mat tmp;
    OpticalFlowExtractor(grays[i], grays[i + 1], optical_flow_alg_type, tmp);
    flows.push_back(tmp);
  }

  flows[0].copyTo(flow);
  // aggregate the optical flow across multiple frames
  for (int i = 1; i < num_frames - 1; i++) {
    MergeOpticalFlow(flow, flows[i]);
  }
}

} // namespace caffe2
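MergeOpticalFlow chains two frame-pair flows: the accumulated displacement at (x, y) is extended by the next flow field sampled at the displaced, clamped location. A scalar sketch of that one step (the type and helper name are illustrative, not part of the API):

#include <algorithm>
#include <cmath>

struct Vec2 { float x, y; };

// Compose the accumulated flow `prev` at pixel (x, y) with the next frame
// pair's flow field, clamping the lookup to the image bounds as above.
Vec2 ComposeFlow(Vec2 prev, int x, int y, int cols, int rows,
                 const Vec2* curr_flow /* row-major, rows x cols */) {
  const int x_new = std::min(cols - 1, std::max(0, (int)std::lround(prev.x + x)));
  const int y_new = std::min(rows - 1, std::max(0, (int)std::lround(prev.y + y)));
  const Vec2 u = curr_flow[y_new * cols + x_new];
  return {prev.x + u.x, prev.y + u.y};
}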
caffe2/video/optical_flow.h
@@ -1,50 +0,0 @@
#ifndef CAFFE2_VIDEO_OPTICAL_FLOW_H_
#define CAFFE2_VIDEO_OPTICAL_FLOW_H_

#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/opencv.hpp>
#include <opencv2/video.hpp>

#include <caffe2/core/logging.h>

namespace caffe2 {

// Four different types of optical flow algorithms are supported;
// BroxOpticalFlow doesn't have a CPU version;
// DensePyrLKOpticalFlow only has a sparse CPU version;
enum FLowAlgType {
  FarnebackOpticalFlow = 0,
  DensePyrLKOpticalFlow = 1,
  BroxOpticalFlow = 2,
  OpticalFlowDual_TVL1 = 3,
};

// The different types of optical flow data:
// 0: original two-channel optical flow
// 1: three-channel optical flow with magnitude as the third channel
// 2: two-channel optical flow + one-channel gray
// 3: two-channel optical flow + three-channel rgb
enum FlowDataType {
  Flow2C = 0,
  Flow3C = 1,
  FlowWithGray = 2,
  FlowWithRGB = 3,
};

void OpticalFlowExtractor(
    const cv::Mat& prev_gray,
    const cv::Mat& curr_gray,
    const int optical_flow_alg_type,
    cv::Mat& flow);

void MergeOpticalFlow(cv::Mat& prev_flow, const cv::Mat& curr_flow);

void MultiFrameOpticalFlowExtractor(
    const std::vector<cv::Mat>& grays,
    const int optical_flow_alg_type,
    cv::Mat& flow);

} // namespace caffe2

#endif // CAFFE2_VIDEO_OPTICAL_FLOW_H_
caffe2/video/video_decoder.cc
@@ -1,800 +0,0 @@
#include <assert.h>
#include <caffe2/core/logging.h>
#include <caffe2/video/video_decoder.h>
#include <array>
#include <mutex>
#include <random>

namespace caffe2 {

VideoDecoder::VideoDecoder() {
  static bool gInitialized = false;
  static std::mutex gMutex;
  std::unique_lock<std::mutex> lock(gMutex);
  if (!gInitialized) {
    av_register_all();
    avcodec_register_all();
    avformat_network_init();
    gInitialized = true;
  }
}

void VideoDecoder::getAudioSample(
    AVPacket& packet,
    AVCodecContext* audioCodecContext_,
    AVFrame* audioStreamFrame_,
    SwrContext* convertCtx_,
    Callback& callback,
    const Params& params) {
  int frame_finished = 0;
  auto result = avcodec_decode_audio4(
      audioCodecContext_, audioStreamFrame_, &frame_finished, &packet);

  if (frame_finished) {
    // from
    // https://www.ffmpeg.org/doxygen/2.3/decoding_encoding_8c-example.html#a57
    auto c = audioCodecContext_;
    int data_size = av_samples_get_buffer_size(
        nullptr, c->channels, audioStreamFrame_->nb_samples, c->sample_fmt, 1);
    if (data_size < 0) {
      // This should not occur, checking just for paranoia
      LOG(ERROR) << "Failed to calculate data size";
    }

    // from https://www.ffmpeg.org/doxygen/2.1/group__lswr.html#details
    uint8_t* output;
    auto swr = convertCtx_;
    auto inrate = audioCodecContext_->sample_rate;
    auto in_samples = audioStreamFrame_->nb_samples;

    int out_samples = av_rescale_rnd(
        swr_get_delay(swr, inrate) + in_samples,
        params.outrate_,
        inrate,
        AV_ROUND_UP);

    if (out_samples > 0) {
      auto input = (const uint8_t**)&audioStreamFrame_->data[0];
      av_samples_alloc(
          &output,
          nullptr,
          c->channels,
          out_samples,
          (AVSampleFormat)params.outfmt_,
          0);

      // resample the audio data
      out_samples = swr_convert(swr, &output, out_samples, input, in_samples);
      auto sample_size = out_samples * c->channels * sizeof(float);
      auto buffer = std::make_unique<float[]>(sample_size);
      memcpy(buffer.get(), output, sample_size);
      av_freep(&output);

      unique_ptr<DecodedAudio> audio_sample = make_unique<DecodedAudio>();
      audio_sample->dataSize_ = data_size;
      audio_sample->outSampleSize_ = out_samples * c->channels;
      audio_sample->audio_data_ = std::move(buffer);
      callback.audioDecoded(std::move(audio_sample));
    }
  } else {
    result = packet.size;
  }
  packet.size -= result;
  packet.data += result;
}
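The out_samples estimate in getAudioSample follows av_rescale_rnd's round-up rescale. Stated without ffmpeg (hypothetical helper): for a 48 kHz input resampled to the default 22 kHz output rate, 1024 input samples yield at most ceil(1024 * 22000 / 48000) = 470 output samples, plus whatever the resampler has buffered.

#include <cstdint>

// ceil((delay + in_samples) * out_rate / in_rate), i.e. av_rescale_rnd
// with AV_ROUND_UP.
int64_t EstimateOutSamples(int64_t delay, int64_t in_samples,
                           int64_t out_rate, int64_t in_rate) {
  const int64_t n = (delay + in_samples) * out_rate;
  return (n + in_rate - 1) / in_rate; // round up
}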
void VideoDecoder::ResizeAndKeepAspectRatio(
    const int origWidth,
    const int origHeight,
    const int short_edge,
    const int long_edge,
    int& outWidth,
    int& outHeight) {
  if (origWidth < origHeight) {
    // dominant height
    if (short_edge > 0) {
      // use short_edge for rescale
      float ratio = short_edge / float(origWidth);
      outWidth = short_edge;
      outHeight = (int)round(ratio * origHeight);
    } else {
      // use long_edge for rescale
      float ratio = long_edge / float(origHeight);
      outHeight = long_edge;
      outWidth = (int)round(ratio * origWidth);
    }
  } else {
    // dominant width
    if (short_edge > 0) {
      // use short_edge for rescale
      float ratio = short_edge / float(origHeight);
      outHeight = short_edge;
      outWidth = (int)round(ratio * origWidth);
    } else {
      // use long_edge for rescale
      float ratio = long_edge / float(origWidth);
      outWidth = long_edge;
      outHeight = (int)round(ratio * origHeight);
    }
  }
}
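A worked example of the branch above: for a 640x480 input with short_edge = 256, the height is the shorter side ("dominant width"), so ratio = 256 / 480, outHeight = 256, and outWidth = round(640 * 256 / 480) = 341; the aspect ratio is preserved up to rounding.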
void VideoDecoder::decodeLoop(
    const string& videoName,
    VideoIOContext& ioctx,
    const Params& params,
    const int start_frm,
    Callback& callback) {
  AVPixelFormat pixFormat = params.pixelFormat_;
  AVFormatContext* inputContext = avformat_alloc_context();
  AVStream* videoStream_ = nullptr;
  AVCodecContext* videoCodecContext_ = nullptr;
  AVCodecContext* audioCodecContext_ = nullptr;
  AVFrame* videoStreamFrame_ = nullptr;
  AVFrame* audioStreamFrame_ = nullptr;
  SwrContext* convertCtx_ = nullptr;
  AVPacket packet;
  av_init_packet(&packet); // init packet
  SwsContext* scaleContext_ = nullptr;

  try {
    inputContext->pb = ioctx.get_avio();
    inputContext->flags |= AVFMT_FLAG_CUSTOM_IO;
    int ret = 0;

    // Determining the input format:
    int probeSz = 1 * 1024 + AVPROBE_PADDING_SIZE;
    DecodedFrame::AvDataPtr probe((uint8_t*)av_malloc(probeSz));
    memset(probe.get(), 0, probeSz);
    int len = ioctx.read(probe.get(), probeSz - AVPROBE_PADDING_SIZE);
    if (len < probeSz - AVPROBE_PADDING_SIZE) {
      LOG(ERROR) << "Insufficient data to determine video format";
      return;
    }
    // seek back to start of stream
    ioctx.seek(0, SEEK_SET);

    unique_ptr<AVProbeData> probeData(new AVProbeData());
    probeData->buf = probe.get();
    probeData->buf_size = len;
    probeData->filename = "";
    // Determine the input format:
    inputContext->iformat = av_probe_input_format(probeData.get(), 1);
    // this is to avoid the double-free error
    if (inputContext->iformat == nullptr) {
      LOG(ERROR) << "inputContext iformat is nullptr!";
      return;
    }

    ret = avformat_open_input(&inputContext, "", nullptr, nullptr);
    if (ret < 0) {
      LOG(ERROR) << "Unable to open stream : " << ffmpegErrorStr(ret);
      return;
    }

    ret = avformat_find_stream_info(inputContext, nullptr);
    if (ret < 0) {
      LOG(ERROR) << "Unable to find stream info in " << videoName << " "
                 << ffmpegErrorStr(ret);
      return;
    }

    // Decode the first video stream
    int videoStreamIndex_ = params.streamIndex_;
    int audioStreamIndex_ = params.streamIndex_;
    if (params.streamIndex_ == -1) {
      for (int i = 0; i < inputContext->nb_streams; i++) {
        auto stream = inputContext->streams[i];
        if (stream->codec->codec_type == AVMEDIA_TYPE_VIDEO &&
            videoStreamIndex_ == -1) {
          videoStreamIndex_ = i;
          videoStream_ = stream;
        } else if (
            stream->codec->codec_type == AVMEDIA_TYPE_AUDIO &&
            audioStreamIndex_ == -1) {
          audioStreamIndex_ = i;
        }
        if (videoStreamIndex_ != -1 && audioStreamIndex_ != -1) {
          break;
        }
      }
    }
    if (videoStream_ == nullptr) {
      LOG(ERROR) << "Unable to find video stream in " << videoName << " "
                 << ffmpegErrorStr(ret);
      return;
    }

    // Initialize codec
    AVDictionary* opts = nullptr;
    videoCodecContext_ = videoStream_->codec;
    try {
      ret = avcodec_open2(
          videoCodecContext_,
          avcodec_find_decoder(videoCodecContext_->codec_id),
          &opts);
    } catch (const std::exception&) {
      LOG(ERROR) << "Exception during open video codec";
      return;
    }

    if (ret < 0) {
      LOG(ERROR) << "Cannot open video codec : "
                 << videoCodecContext_->codec->name;
      return;
    }

    if (params.getAudio_ && audioStreamIndex_ >= 0) {
      // see e.g. ridge/decoder/StreamDecoder.cpp
      audioCodecContext_ = inputContext->streams[audioStreamIndex_]->codec;
      ret = avcodec_open2(
          audioCodecContext_,
          avcodec_find_decoder(audioCodecContext_->codec_id),
          nullptr);

      if (ret < 0) {
        LOG(ERROR) << "Cannot open audio codec : "
                   << audioCodecContext_->codec->name;
        return;
      }

      convertCtx_ = swr_alloc_set_opts(
          nullptr,
          params.outlayout_,
          (AVSampleFormat)params.outfmt_,
          params.outrate_,
          audioCodecContext_->channel_layout,
          audioCodecContext_->sample_fmt,
          audioCodecContext_->sample_rate,
          0,
          nullptr);

      if (convertCtx_ == nullptr) {
        LOG(ERROR) << "Cannot setup sample format converter.";
        return;
      }
      if (swr_init(convertCtx_) < 0) {
        LOG(ERROR) << "Cannot init sample format converter.";
        return;
      }
    }
    // Calculate if we need to rescale the frames
    const int origWidth = videoCodecContext_->width;
    const int origHeight = videoCodecContext_->height;
    int outWidth = origWidth;
    int outHeight = origHeight;

    if (params.video_res_type_ == VideoResType::ORIGINAL_RES) {
      // if the original resolution is too low,
      // make it at least the same size as crop_size_
      if (params.crop_size_ > origWidth || params.crop_size_ > origHeight) {
        ResizeAndKeepAspectRatio(
            origWidth, origHeight, params.crop_size_, -1, outWidth, outHeight);
      }
    } else if (params.video_res_type_ == VideoResType::USE_SHORT_EDGE) {
      // resize the image to the predefined
      // short_edge_ resolution while keeping the aspect ratio
      ResizeAndKeepAspectRatio(
          origWidth, origHeight, params.short_edge_, -1, outWidth, outHeight);
    } else if (params.video_res_type_ == VideoResType::USE_WIDTH_HEIGHT) {
      // resize the image to the predefined
      // resolution and ignore the aspect ratio
      outWidth = params.outputWidth_;
      outHeight = params.outputHeight_;
    } else {
      LOG(ERROR) << "Unknown VideoResType: " << params.video_res_type_;
      return;
    }

    // Make sure that we have a valid format
    if (videoCodecContext_->pix_fmt == AV_PIX_FMT_NONE) {
      LOG(ERROR) << "pixel format is not valid.";
      return;
    }

    // Create a scale context
    scaleContext_ = sws_getContext(
        videoCodecContext_->width,
        videoCodecContext_->height,
        videoCodecContext_->pix_fmt,
        outWidth,
        outHeight,
        pixFormat,
        SWS_FAST_BILINEAR,
        nullptr,
        nullptr,
        nullptr);

    // Getting the video metadata
    VideoMeta videoMeta;
    videoMeta.codec_type = videoCodecContext_->codec_type;
    videoMeta.width = outWidth;
    videoMeta.height = outHeight;
    videoMeta.pixFormat = pixFormat;

    // avoid division by zero, code adapted from
    // https://www.ffmpeg.org/doxygen/0.6/rational_8h-source.html
    if (videoStream_->avg_frame_rate.num == 0 ||
        videoStream_->avg_frame_rate.den == 0) {
      LOG(ERROR) << "Frame rate is wrong. No data found.";
      return;
    }

    videoMeta.fps = av_q2d(videoStream_->avg_frame_rate);
    callback.videoDecodingStarted(videoMeta);

    if (params.intervals_.size() == 0) {
      LOG(ERROR) << "Empty sampling intervals.";
      return;
    }
    std::vector<SampleInterval>::const_iterator itvlIter =
        params.intervals_.begin();
    if (itvlIter->timestamp != 0) {
      LOG(ERROR) << "Sampling interval starting timestamp is not zero.";
      return;
    }

    double currFps = itvlIter->fps;
    if (currFps < 0 && currFps != SpecialFps::SAMPLE_ALL_FRAMES &&
        currFps != SpecialFps::SAMPLE_TIMESTAMP_ONLY) {
      // fps must be 0, -1, -2 or > 0
      LOG(ERROR) << "Invalid sampling fps.";
      return;
    }

    double prevTimestamp = itvlIter->timestamp;
    itvlIter++;
    if (itvlIter != params.intervals_.end() &&
        prevTimestamp >= itvlIter->timestamp) {
      LOG(ERROR) << "Sampling interval timestamps must be strictly ascending.";
      return;
    }

    double lastFrameTimestamp = -1.0;
    double timestamp = -1.0;

    // Initialize frame and packet.
    // These will be reused across calls.
    videoStreamFrame_ = av_frame_alloc();
    audioStreamFrame_ = av_frame_alloc();

    // frame index in the video stream
    int frameIndex = -1;
    // frame index of the outputted frames
    int outputFrameIndex = -1;

    /* identify the starting point from where we must start decoding */
    std::mt19937 meta_randgen(time(nullptr));
    long int start_ts = -1;
    bool mustDecodeAll = false;

    if (videoStream_->duration > 0 && videoStream_->nb_frames > 0) {
      /* we have a valid duration and nb_frames. We can safely
       * detect an intermediate timestamp to start decoding from. */

      // leave a margin of 10 frames to take into account the error
      // from av_seek_frame
      long int margin =
          int(ceil((10 * videoStream_->duration) / (videoStream_->nb_frames)));
      // if we need to do temporal jittering
      if (params.decode_type_ == DecodeType::DO_TMP_JITTER) {
        /* estimate the average duration for the required # of frames */
        double maxFramesDuration =
            (videoStream_->duration * params.num_of_required_frame_) /
            (videoStream_->nb_frames);
        int ts1 = 0;
        int ts2 = videoStream_->duration - int(ceil(maxFramesDuration));
        ts2 = ts2 > 0 ? ts2 : 0;
        // pick a random timestamp between ts1 and ts2. ts2 is selected such
        // that you have enough frames to satisfy the required # of frames.
        start_ts = std::uniform_int_distribution<>(ts1, ts2)(meta_randgen);
        // seek to a frame at start_ts
        ret = av_seek_frame(
            inputContext,
            videoStreamIndex_,
            0 > (start_ts - margin) ? 0 : (start_ts - margin),
            AVSEEK_FLAG_BACKWARD);

        // if we need to decode from the start_frm
      } else if (params.decode_type_ == DecodeType::USE_START_FRM) {
        if (videoStream_ == nullptr) {
          LOG(ERROR) << "Nullptr found at videoStream_";
          return;
        }
        start_ts = int(floor(
            (videoStream_->duration * start_frm) / (videoStream_->nb_frames)));
        // seek to a frame at start_ts
        ret = av_seek_frame(
            inputContext,
            videoStreamIndex_,
            0 > (start_ts - margin) ? 0 : (start_ts - margin),
            AVSEEK_FLAG_BACKWARD);
      } else {
        mustDecodeAll = true;
      }

      if (ret < 0) {
        LOG(INFO) << "Unable to decode from a random start point";
        /* fall back to default decoding of all frames from the start */
        av_seek_frame(inputContext, videoStreamIndex_, 0, AVSEEK_FLAG_BACKWARD);
        mustDecodeAll = true;
      }
    } else {
      mustDecodeAll = true;
    }
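To make the temporal-jittering math concrete (illustrative numbers): for a stream with nb_frames = 250 and duration = 10000 in stream time-base units, from which 25 frames are required, maxFramesDuration = 10000 * 25 / 250 = 1000, so start_ts is drawn uniformly from [0, 9000], and the seek target backs off by margin = ceil(10 * 10000 / 250) = 400 to absorb av_seek_frame's keyframe-snapping inaccuracy.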
    int gotPicture = 0;
    int eof = 0;
    int selectiveDecodedFrames = 0;

    int maxFrames = (params.decode_type_ == DecodeType::DO_UNIFORM_SMP)
        ? MAX_DECODING_FRAMES
        : params.num_of_required_frame_;
    // There is a delay between reading packets from the
    // transport and getting decoded frames back.
    // Therefore, after EOF, continue going while
    // the decoder is still giving us frames.
    while ((!eof || gotPicture) &&
           /* either you must decode all frames or decode up to maxFrames
            * based on status of the mustDecodeAll flag */
           (mustDecodeAll || (selectiveDecodedFrames < maxFrames)) &&
           /* If on the last interval and not autodecoding keyframes and a
            * SpecialFps indicates no more frames are needed, stop decoding */
           !((itvlIter == params.intervals_.end() &&
              (currFps == SpecialFps::SAMPLE_TIMESTAMP_ONLY ||
               currFps == SpecialFps::SAMPLE_NO_FRAME)) &&
             !params.keyFrames_)) {
      try {
        if (!eof) {
          ret = av_read_frame(inputContext, &packet);
          if (ret == AVERROR_EOF) {
            eof = 1;
            av_free_packet(&packet);
            packet.data = nullptr;
            packet.size = 0;
            // stay in the while loop to flush frames
          } else if (ret == AVERROR(EAGAIN)) {
            av_free_packet(&packet);
            continue;
          } else if (ret < 0) {
            LOG(ERROR) << "Error reading packet : " << ffmpegErrorStr(ret);
            return;
          }

          auto si = packet.stream_index;
          if (params.getAudio_ && audioStreamIndex_ >= 0 &&
              si == audioStreamIndex_) {
            // Audio packets can have multiple audio frames in a single packet
            while (packet.size > 0) {
              assert(audioCodecContext_ != nullptr);
              assert(convertCtx_ != nullptr);
              getAudioSample(
                  packet,
                  audioCodecContext_,
                  audioStreamFrame_,
                  convertCtx_,
                  callback,
                  params);
            }
          }

          if (si != videoStreamIndex_) {
            av_free_packet(&packet);
            continue;
          }
        }
        ret = avcodec_decode_video2(
            videoCodecContext_, videoStreamFrame_, &gotPicture, &packet);
        if (ret < 0) {
          LOG(ERROR) << "Error decoding video frame : " << ffmpegErrorStr(ret);
          return;
        }
        try {
          // Nothing to do without a picture
          if (!gotPicture) {
            av_free_packet(&packet);
            continue;
          }
          frameIndex++;

          long int frame_ts =
              av_frame_get_best_effort_timestamp(videoStreamFrame_);
          timestamp = frame_ts * av_q2d(videoStream_->time_base);
          if ((frame_ts >= start_ts && !mustDecodeAll) || mustDecodeAll) {
            /* process the current frame if:
             * 1) We are not doing selective decoding and mustDecodeAll
             *    OR
             * 2) We are doing selective decoding and the current frame
             *    timestamp is >= start_ts from where we start selective
             *    decoding */
            // if reaching the next interval, update the current fps
            // and reset lastFrameTimestamp so the current frame could be
            // sampled (unless fps == SpecialFps::SAMPLE_NO_FRAME)
            if (itvlIter != params.intervals_.end() &&
                timestamp >= itvlIter->timestamp) {
              lastFrameTimestamp = -1.0;
              currFps = itvlIter->fps;
              prevTimestamp = itvlIter->timestamp;
              itvlIter++;
              if (itvlIter != params.intervals_.end() &&
                  prevTimestamp >= itvlIter->timestamp) {
                LOG(ERROR)
                    << "Sampling interval timestamps must be strictly ascending.";
                return;
              }
            }

            // keyFrame will bypass all checks on fps sampling settings
            bool keyFrame = params.keyFrames_ && videoStreamFrame_->key_frame;
            if (!keyFrame) {
              // if fps == SpecialFps::SAMPLE_NO_FRAME (0), don't sample at all
              if (currFps == SpecialFps::SAMPLE_NO_FRAME) {
                av_free_packet(&packet);
                continue;
              }

              // fps is considered reached in the following cases:
              // 1. lastFrameTimestamp < 0 - start of a new interval
              //    (or first frame)
              // 2. currFps == SpecialFps::SAMPLE_ALL_FRAMES (-1) - sample every
              //    frame
              // 3. timestamp - lastFrameTimestamp has reached target fps and
              //    currFps > 0 (not a special fps setting)
              // different modes for fps:
              // SpecialFps::SAMPLE_NO_FRAMES (0):
              //   disable fps sampling, no frame sampled at all
              // SpecialFps::SAMPLE_ALL_FRAMES (-1):
              //   unlimited fps sampling, will sample at native video fps
              // SpecialFps::SAMPLE_TIMESTAMP_ONLY (-2):
              //   disable fps sampling, but will get the frame at specific
              //   timestamp
              // others (> 0): decoding at the specified fps
              bool fpsReached = lastFrameTimestamp < 0 ||
                  currFps == SpecialFps::SAMPLE_ALL_FRAMES ||
                  (currFps > 0 &&
                   timestamp >= lastFrameTimestamp + (1 / currFps));

              if (!fpsReached) {
                av_free_packet(&packet);
                continue;
              }
            }
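The fpsReached predicate above condenses to a few lines; a standalone sketch (hypothetical helper) also illustrates the snapping behavior described in video_decoder.h: at fps = 4 on a 25 fps video, frames arrive every 0.04 s, so the first frame with timestamp >= last + 0.25 lands at 0.28 s.

// last_ts < 0 marks the start of a new interval; fps == -1 samples every
// frame (SAMPLE_ALL_FRAMES); fps > 0 samples once per 1/fps seconds.
bool FpsReached(double timestamp, double last_ts, double fps) {
  return last_ts < 0 || fps == -1.0 ||
      (fps > 0 && timestamp >= last_ts + 1.0 / fps);
}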
            lastFrameTimestamp = timestamp;

            outputFrameIndex++;
            if (params.maximumOutputFrames_ != -1 &&
                outputFrameIndex >= params.maximumOutputFrames_) {
              // enough frames
              av_free_packet(&packet);
              break;
            }

            AVFrame* rgbFrame = av_frame_alloc();
            if (!rgbFrame) {
              LOG(ERROR) << "Error allocating AVframe";
              return;
            }

            try {
              // Determine required buffer size and allocate buffer
              int numBytes = avpicture_get_size(pixFormat, outWidth, outHeight);
              DecodedFrame::AvDataPtr buffer(
                  (uint8_t*)av_malloc(numBytes * sizeof(uint8_t)));

              int size = avpicture_fill(
                  (AVPicture*)rgbFrame,
                  buffer.get(),
                  pixFormat,
                  outWidth,
                  outHeight);

              sws_scale(
                  scaleContext_,
                  videoStreamFrame_->data,
                  videoStreamFrame_->linesize,
                  0,
                  videoCodecContext_->height,
                  rgbFrame->data,
                  rgbFrame->linesize);

              unique_ptr<DecodedFrame> frame = make_unique<DecodedFrame>();
              frame->width_ = outWidth;
              frame->height_ = outHeight;
              frame->data_ = std::move(buffer);
              frame->size_ = size;
              frame->index_ = frameIndex;
              frame->outputFrameIndex_ = outputFrameIndex;
              frame->timestamp_ = timestamp;
              frame->keyFrame_ = videoStreamFrame_->key_frame;

              callback.frameDecoded(std::move(frame));

              selectiveDecodedFrames++;
              av_frame_free(&rgbFrame);
            } catch (const std::exception&) {
              av_frame_free(&rgbFrame);
            }
          }
          av_frame_unref(videoStreamFrame_);
          av_frame_unref(audioStreamFrame_);
        } catch (const std::exception&) {
          av_frame_unref(videoStreamFrame_);
          av_frame_unref(audioStreamFrame_);
        }

        av_free_packet(&packet);
      } catch (const std::exception&) {
        av_free_packet(&packet);
      }
    } // end of while loop
    callback.videoDecodingEnded(timestamp);

    // free all resources
    sws_freeContext(scaleContext_);
    swr_free(&convertCtx_);
    av_packet_unref(&packet);
    av_frame_free(&videoStreamFrame_);
    av_frame_free(&audioStreamFrame_);
    avcodec_close(videoCodecContext_);
    if (audioCodecContext_ != nullptr) {
      avcodec_close(audioCodecContext_);
    }
    avformat_close_input(&inputContext);
    avformat_free_context(inputContext);
  } catch (const std::exception&) {
    // In case of a decoding error,
    // free all resources
    sws_freeContext(scaleContext_);
    swr_free(&convertCtx_);
    av_packet_unref(&packet);
    av_frame_free(&videoStreamFrame_);
    av_frame_free(&audioStreamFrame_);
    avcodec_close(videoCodecContext_);
    avcodec_close(audioCodecContext_);
    avformat_close_input(&inputContext);
    avformat_free_context(inputContext);
  }
}
void VideoDecoder::decodeMemory(
    const string& videoName,
    const char* buffer,
    const int size,
    const Params& params,
    const int start_frm,
    Callback& callback) {
  VideoIOContext ioctx(buffer, size);
  decodeLoop(videoName, ioctx, params, start_frm, callback);
}

void VideoDecoder::decodeFile(
    const string& file,
    const Params& params,
    const int start_frm,
    Callback& callback) {
  VideoIOContext ioctx(file);
  decodeLoop(file, ioctx, params, start_frm, callback);
}

string VideoDecoder::ffmpegErrorStr(int result) {
  std::array<char, 128> buf;
  av_strerror(result, buf.data(), buf.size());
  return string(buf.data());
}

void FreeDecodedData(
    std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames,
    std::vector<std::unique_ptr<DecodedAudio>>& sampledAudio) {
  // free the sampledFrames and sampledAudio
  for (int i = 0; i < sampledFrames.size(); i++) {
    DecodedFrame* p = sampledFrames[i].release();
    delete p;
  }
  for (int i = 0; i < sampledAudio.size(); i++) {
    DecodedAudio* p = sampledAudio[i].release();
    delete p;
  }
  sampledFrames.clear();
  sampledAudio.clear();
}
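Since both vectors hold unique_ptrs, the manual release()/delete pairs in FreeDecodedData are equivalent to simply clearing the vectors. A sketch of the simpler form (hypothetical name; types come from video_decoder.h):

void FreeDecodedDataSimple(
    std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames,
    std::vector<std::unique_ptr<DecodedAudio>>& sampledAudio) {
  sampledFrames.clear(); // unique_ptr destructors free each element
  sampledAudio.clear();
}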
bool DecodeMultipleClipsFromVideo(
    const char* video_buffer,
    const std::string& video_filename,
    const int encoded_size,
    const Params& params,
    const int start_frm,
    const int clip_per_video,
    const std::vector<int>& clip_start_positions,
    const bool use_local_file,
    int& height,
    int& width,
    std::vector<unsigned char*>& buffer_rgb) {
  std::vector<std::unique_ptr<DecodedFrame>> sampledFrames;
  std::vector<std::unique_ptr<DecodedAudio>> sampledAudio;
  VideoDecoder decoder;

  CallbackImpl callback;
  // decoding from a buffer or a file
  if (!use_local_file) {
    decoder.decodeMemory(
        string("Memory Buffer"),
        video_buffer,
        encoded_size,
        params,
        start_frm,
        callback);
  } else {
    decoder.decodeFile(video_filename, params, start_frm, callback);
  }

  for (auto& frame : callback.frames) {
    sampledFrames.push_back(std::move(frame));
  }
  for (auto& audio_sample : callback.audio_samples) {
    sampledAudio.push_back(std::move(audio_sample));
  }

  for (int i = 0; i < buffer_rgb.size(); i++) {
    unsigned char* buff = buffer_rgb[i];
    delete[] buff;
  }
  buffer_rgb.clear();

  if (sampledFrames.size() < params.num_of_required_frame_) {
    LOG(ERROR)
        << "The video seems faulty and we could not decode enough frames: "
        << sampledFrames.size() << " VS " << params.num_of_required_frame_;
    FreeDecodedData(sampledFrames, sampledAudio);
    return true;
  }
  if (sampledFrames.size() == 0) {
    LOG(ERROR) << "The sampled frames have size 0, no frame to process";
    FreeDecodedData(sampledFrames, sampledAudio);
    return true;
  }
  height = sampledFrames[0]->height_;
  width = sampledFrames[0]->width_;
  float sample_stepsz = (clip_per_video <= 1)
      ? 0
      : (float(sampledFrames.size() - params.num_of_required_frame_) /
         (clip_per_video - 1));

  int image_size = 3 * height * width;
  int clip_size = params.num_of_required_frame_ * image_size;
  // get the RGB frames for each clip
  if (clip_start_positions.size() > 0) {
    for (int i = 0; i < clip_start_positions.size(); i++) {
      unsigned char* buffer_rgb_ptr = new unsigned char[clip_size];
      int clip_start = clip_start_positions[i];
      for (int j = 0; j < params.num_of_required_frame_; j++) {
        memcpy(
            buffer_rgb_ptr + j * image_size,
            (unsigned char*)sampledFrames[j + clip_start]->data_.get(),
            image_size * sizeof(unsigned char));
      }
      buffer_rgb.push_back(buffer_rgb_ptr);
    }
  } else {
    for (int i = 0; i < clip_per_video; i++) {
      unsigned char* buffer_rgb_ptr = new unsigned char[clip_size];
      int clip_start = floor(i * sample_stepsz);
      for (int j = 0; j < params.num_of_required_frame_; j++) {
        memcpy(
            buffer_rgb_ptr + j * image_size,
            (unsigned char*)sampledFrames[j + clip_start]->data_.get(),
            image_size * sizeof(unsigned char));
      }
      buffer_rgb.push_back(buffer_rgb_ptr);
    }
  }
  FreeDecodedData(sampledFrames, sampledAudio);

  return true;
}

} // namespace caffe2
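A worked example of the uniform clip spacing above (illustrative numbers): with 100 decoded frames, 16 required frames per clip, and clip_per_video = 10, sample_stepsz = (100 - 16) / 9 ≈ 9.33, so clip i starts at frame floor(i * 9.33): 0, 9, 18, ..., 84, and the last clip ends exactly at frame 99.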
caffe2/video/video_decoder.h
@@ -1,525 +0,0 @@
#ifndef CAFFE2_VIDEO_VIDEO_DECODER_H_
#define CAFFE2_VIDEO_VIDEO_DECODER_H_

#include <caffe2/core/logging.h>
#include <stdio.h>
#include <memory>
#include <string>
#include <vector>

extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavformat/avio.h>
#include <libavutil/log.h>
#include <libavutil/motion_vector.h>
#include <libswresample/swresample.h>
#include <libswscale/swscale.h>
}

namespace caffe2 {

#define VIO_BUFFER_SZ 32768
#define MAX_DECODING_FRAMES 10000

// enum to specify 3 special fps sampling behaviors:
// 0: disable fps sampling, no frame sampled at all
// -1: unlimited fps sampling, will sample at native video fps
// -2: disable fps sampling, but will get the frame at specific timestamp
enum SpecialFps {
  SAMPLE_NO_FRAME = 0,
  SAMPLE_ALL_FRAMES = -1,
  SAMPLE_TIMESTAMP_ONLY = -2,
};

// three different types of resolution when decoding the video
// 0: resize to width x height and ignore the aspect ratio;
// 1: resize to short_edge and keep the aspect ratio;
// 2: using the original resolution of the video; if resolution
//    is smaller than crop_size x crop_size, resize to crop_size
//    and keep the aspect ratio;
// 3: for xray video service
enum VideoResType {
  USE_WIDTH_HEIGHT = 0,
  USE_SHORT_EDGE = 1,
  ORIGINAL_RES = 2,
};

// three different types of decoding behavior are supported
// 0: do temporal jittering to sample a random clip from the video
// 1: uniformly sample multiple clips from the video;
// 2: sample a clip from a given starting frame
// 3: for xray video service
enum DecodeType {
  DO_TMP_JITTER = 0,
  DO_UNIFORM_SMP = 1,
  USE_START_FRM = 2,
};

// sampling interval for fps starting at a specified timestamp;
// use enum SpecialFps to set special fps decoding behavior.
// Note the sampled fps will not always accurately follow the target fps,
// because a sampled frame has to snap to an actual frame timestamp,
// e.g. video fps = 25, sample fps = 4 will sample every 0.28s, not 0.25s;
// video fps = 25, sample fps = 5 will sample every 0.24s, not 0.2s,
// because of floating-point division accuracy (1 / 5.0 is not exactly 0.2)
struct SampleInterval {
  double timestamp;
  double fps;
  SampleInterval() : timestamp(-1), fps(SpecialFps::SAMPLE_ALL_FRAMES) {}
  SampleInterval(double ts, double f) : timestamp(ts), fps(f) {}
  bool operator<(const SampleInterval& itvl) const {
    return (timestamp < itvl.timestamp);
  }
};
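To make the interval rules concrete, a hypothetical schedule that decodes at 1 fps for the first 60 seconds and then stops sampling (the first timestamp must be zero, timestamps strictly ascending, and fps either > 0 or a SpecialFps value):

std::vector<SampleInterval> schedule = {
    {0.0, 1.0},                          // 1 fps from t = 0
    {60.0, SpecialFps::SAMPLE_NO_FRAME}, // stop sampling after 60 s
};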
class Params {
|
||||
public:
|
||||
// return all key-frames regardless of specified fps
|
||||
bool keyFrames_ = false;
|
||||
|
||||
// return audio data while decoding the video
|
||||
bool getAudio_ = false;
|
||||
|
||||
// for sampling audio data
|
||||
int outrate_ = 22000;
|
||||
int outfmt_ = AV_SAMPLE_FMT_FLT;
|
||||
int64_t outlayout_ = AV_CH_LAYOUT_MONO;
|
||||
|
||||
// Output image pixel format
|
||||
AVPixelFormat pixelFormat_ = AVPixelFormat::AV_PIX_FMT_RGB24;
|
||||
|
||||
// Index of stream to decode.
|
||||
// -1 will automatically decode the first video stream.
|
||||
int streamIndex_ = -1;
|
||||
|
||||
// How many frames to output at most from the video
|
||||
// -1 no limit
|
||||
int maximumOutputFrames_ = -1;
|
||||
|
||||
// params for video resolution
|
||||
int video_res_type_ = VideoResType::USE_WIDTH_HEIGHT;
|
||||
int crop_size_ = -1;
|
||||
int short_edge_ = -1;
|
||||
|
||||
// Output video size, -1 to preserve origianl dimension
|
||||
int outputWidth_ = -1;
|
||||
int outputHeight_ = -1;
|
||||
|
||||
// max output dimension, -1 to preserve original size
|
||||
// the larger dimension of the video will be scaled to this size,
|
||||
// and the second dimension will be scaled to preserve aspect ratio
|
||||
int maxOutputDimension_ = -1;
|
||||
|
||||
// params for decoding behavior
|
||||
int decode_type_ = DecodeType::DO_TMP_JITTER;
|
||||
int num_of_required_frame_ = -1;
|
||||
|
||||
// intervals_ control variable sampling fps between different timestamps
|
||||
// intervals_ must be ordered strictly ascending by timestamps
|
||||
// the first interval must have a timestamp of zero
|
||||
// fps must be either the 3 special fps defined in SpecialFps, or > 0
|
||||
std::vector<SampleInterval> intervals_ = {{0, SpecialFps::SAMPLE_ALL_FRAMES}};
|
||||
|
||||
Params() {}
|
||||
|
||||
/**
|
||||
* FPS of output frames
|
||||
* setting here will reset intervals_ and force decoding at target FPS
|
||||
* This can be used if user just want to decode at a steady fps
|
||||
*/
|
||||
Params& fps(float v) {
|
||||
intervals_.clear();
|
||||
intervals_.emplace_back(0, v);
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sample output frames at a specified list of timestamps
|
||||
* Timestamps must be in increasing order, and timestamps past the end of the
|
||||
* video will be ignored
|
||||
* Setting here will reset intervals_
|
||||
*/
|
||||
Params& setSampleTimestamps(const std::vector<double>& timestamps) {
|
||||
intervals_.clear();
|
||||
// insert an interval per desired frame.
|
||||
for (auto& timestamp : timestamps) {
|
||||
intervals_.emplace_back(timestamp, SpecialFps::SAMPLE_TIMESTAMP_ONLY);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Pixel format of output buffer, default PIX_FMT_RGB24
|
||||
*/
|
||||
Params& pixelFormat(AVPixelFormat pixelFormat) {
|
||||
pixelFormat_ = pixelFormat;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return all key-frames
|
||||
*/
|
||||
Params& keyFrames(bool keyFrames) {
|
||||
keyFrames_ = keyFrames;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Index of video stream to process, defaults to the first video stream
|
||||
*/
|
||||
Params& streamIndex(int index) {
|
||||
streamIndex_ = index;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Only output this many frames, default to no limit
|
||||
*/
|
||||
Params& maxOutputFrames(int count) {
|
||||
maximumOutputFrames_ = count;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Output frame width, default to video width
|
||||
*/
|
||||
Params& outputWidth(int width) {
|
||||
outputWidth_ = width;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Output frame height, default to video height
|
||||
*/
|
||||
Params& outputHeight(int height) {
|
||||
outputHeight_ = height;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Max dimension of either width or height, if any is bigger
|
||||
* it will be scaled down to this and econd dimension
|
||||
* will be scaled down to maintain aspect ratio.
|
||||
*/
|
||||
Params& maxOutputDimension(int size) {
|
||||
maxOutputDimension_ = size;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
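For reference, a hypothetical use of the fluent setters above (all parameter values are illustrative):

caffe2::Params params;
params.fps(5)
    .pixelFormat(AV_PIX_FMT_RGB24)
    .maxOutputFrames(100)
    .maxOutputDimension(256);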
// data structure for storing decoded video frames
class DecodedFrame {
 public:
  struct avDeleter {
    void operator()(unsigned char* p) const {
      av_free(p);
    }
  };
  using AvDataPtr = std::unique_ptr<uint8_t, avDeleter>;

  // decoded data buffer
  AvDataPtr data_;

  // size in bytes
  int size_ = 0;

  // frame dimensions
  int width_ = 0;
  int height_ = 0;

  // timestamp in seconds since beginning of video
  double timestamp_ = 0;

  // true if this is a key frame.
  bool keyFrame_ = false;

  // index of frame in video
  int index_ = -1;

  // sequential number of the output frame
  int outputFrameIndex_ = -1;
};
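The custom deleter pairs the pixel buffer with FFmpeg's allocator; a hedged sketch of how such a frame would be populated (dimensions are illustrative, not from the original sources):

caffe2::DecodedFrame frame;
frame.width_ = 256;
frame.height_ = 256;
frame.size_ = frame.width_ * frame.height_ * 3;  // RGB24
frame.data_.reset(static_cast<uint8_t*>(av_malloc(frame.size_)));
// av_free is invoked automatically when frame.data_ goes out of scope.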
// data structure for storing decoded audio data
struct DecodedAudio {
  int dataSize_;
  int outSampleSize_;
  std::unique_ptr<float[]> audio_data_;

  explicit DecodedAudio(
      int dataSize = 0,
      int outSampleSize = 0,
      std::unique_ptr<float[]> audio_data = nullptr)
      : dataSize_(dataSize),
        outSampleSize_(outSampleSize),
        audio_data_(std::move(audio_data)) {}
};
class VideoIOContext {
 public:
  explicit VideoIOContext(const std::string& fname)
      : workBuffersize_(VIO_BUFFER_SZ),
        workBuffer_((uint8_t*)av_malloc(workBuffersize_)),
        inputFile_(nullptr),
        inputBuffer_(nullptr),
        inputBufferSize_(0) {
    inputFile_ = fopen(fname.c_str(), "rb");
    if (inputFile_ == nullptr) {
      LOG(ERROR) << "Error opening video file " << fname;
      return;
    }
    ctx_ = avio_alloc_context(
        static_cast<unsigned char*>(workBuffer_.get()),
        workBuffersize_,
        0,
        this,
        &VideoIOContext::readFile,
        nullptr, // no write function
        &VideoIOContext::seekFile);
  }

  explicit VideoIOContext(const char* buffer, int size)
      : workBuffersize_(VIO_BUFFER_SZ),
        workBuffer_((uint8_t*)av_malloc(workBuffersize_)),
        inputFile_(nullptr),
        inputBuffer_(buffer),
        inputBufferSize_(size) {
    ctx_ = avio_alloc_context(
        static_cast<unsigned char*>(workBuffer_.get()),
        workBuffersize_,
        0,
        this,
        &VideoIOContext::readMemory,
        nullptr, // no write function
        &VideoIOContext::seekMemory);
  }

  ~VideoIOContext() {
    av_free(ctx_);
    if (inputFile_) {
      fclose(inputFile_);
    }
  }

  int read(unsigned char* buf, int buf_size) {
    if (inputBuffer_) {
      return readMemory(this, buf, buf_size);
    } else if (inputFile_) {
      return readFile(this, buf, buf_size);
    } else {
      return -1;
    }
  }

  int64_t seek(int64_t offset, int whence) {
    if (inputBuffer_) {
      return seekMemory(this, offset, whence);
    } else if (inputFile_) {
      return seekFile(this, offset, whence);
    } else {
      return -1;
    }
  }

  static int readFile(void* opaque, unsigned char* buf, int buf_size) {
    VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
    if (feof(h->inputFile_)) {
      return AVERROR_EOF;
    }
    size_t ret = fread(buf, 1, buf_size, h->inputFile_);
    if (ret < buf_size) {
      if (ferror(h->inputFile_)) {
        return -1;
      }
    }
    return ret;
  }

  static int64_t seekFile(void* opaque, int64_t offset, int whence) {
    VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
    switch (whence) {
      case SEEK_CUR: // from current position
      case SEEK_END: // from eof
      case SEEK_SET: // from beginning of file
        return fseek(h->inputFile_, static_cast<long>(offset), whence);
        break;
      case AVSEEK_SIZE:
        int64_t cur = ftell(h->inputFile_);
        fseek(h->inputFile_, 0L, SEEK_END);
        int64_t size = ftell(h->inputFile_);
        fseek(h->inputFile_, cur, SEEK_SET);
        return size;
    }

    return -1;
  }

  static int readMemory(void* opaque, unsigned char* buf, int buf_size) {
    VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
    if (buf_size < 0) {
      return -1;
    }

    int remainder = h->inputBufferSize_ - h->offset_;
    int r = buf_size < remainder ? buf_size : remainder;
    if (r < 0) {
      return AVERROR_EOF;
    }

    memcpy(buf, h->inputBuffer_ + h->offset_, r);
    h->offset_ += r;
    return r;
  }

  static int64_t seekMemory(void* opaque, int64_t offset, int whence) {
    VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
    switch (whence) {
      case SEEK_CUR: // from current position
        h->offset_ += offset;
        break;
      case SEEK_END: // from eof
        h->offset_ = h->inputBufferSize_ + offset;
        break;
      case SEEK_SET: // from beginning of file
        h->offset_ = offset;
        break;
      case AVSEEK_SIZE:
        return h->inputBufferSize_;
    }
    return h->offset_;
  }

  AVIOContext* get_avio() {
    return ctx_;
  }

 private:
  int workBuffersize_;
  DecodedFrame::AvDataPtr workBuffer_;
  // for file mode
  FILE* inputFile_;

  // for memory mode
  const char* inputBuffer_;
  int inputBufferSize_;
  int offset_ = 0;

  AVIOContext* ctx_;
};
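For context, a hedged sketch of how such a custom I/O context is typically attached to libavformat; the removed decodeLoop presumably did something similar, but the wiring below is illustrative, not the original code:

extern "C" {
#include <libavformat/avformat.h>
}

bool openWithCustomIO(caffe2::VideoIOContext& ioctx) {
  AVFormatContext* fmt = avformat_alloc_context();
  fmt->pb = ioctx.get_avio();          // route all reads/seeks through ioctx
  fmt->flags |= AVFMT_FLAG_CUSTOM_IO;  // we own the AVIOContext
  // The URL argument is only a hint when custom I/O is used.
  if (avformat_open_input(&fmt, "stream", nullptr, nullptr) < 0) {
    return false;  // error handling and cleanup elided in this sketch
  }
  avformat_find_stream_info(fmt, nullptr);
  avformat_close_input(&fmt);
  return true;
}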
struct VideoMeta {
  double fps;
  int width;
  int height;
  enum AVMediaType codec_type;
  AVPixelFormat pixFormat;
  VideoMeta()
      : fps(-1),
        width(-1),
        height(-1),
        codec_type(AVMEDIA_TYPE_VIDEO),
        pixFormat(AVPixelFormat::AV_PIX_FMT_RGB24) {}
};
class Callback {
 public:
  virtual void frameDecoded(std::unique_ptr<DecodedFrame> img) = 0;
  virtual void audioDecoded(
      std::unique_ptr<DecodedAudio> /*decoded audio data*/) {}
  virtual void videoDecodingStarted(const VideoMeta& /*videoMeta*/) {}
  virtual void videoDecodingEnded(double /*lastFrameTimestamp*/) {}
  virtual ~Callback() {}
};
class VideoDecoder {
 public:
  VideoDecoder();

  void decodeFile(
      const std::string& filename,
      const Params& params,
      const int start_frm,
      Callback& callback);

  void decodeMemory(
      const std::string& filename,
      const char* buffer,
      const int size,
      const Params& params,
      const int start_frm,
      Callback& callback);

 private:
  std::string ffmpegErrorStr(int result);

  void ResizeAndKeepAspectRatio(
      const int origWidth,
      const int origHeight,
      const int short_edge,
      const int long_edge,
      int& outWidth,
      int& outHeight);

  void getAudioSample(
      AVPacket& packet,
      AVCodecContext* audioCodecContext_,
      AVFrame* audioStreamFrame_,
      SwrContext* convertCtx_,
      Callback& callback,
      const Params& params);

  void decodeLoop(
      const std::string& videoName,
      VideoIOContext& ioctx,
      const Params& params,
      const int start_frm,
      Callback& callback);
};
TORCH_API void FreeDecodedData(
    std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames,
    std::vector<std::unique_ptr<DecodedAudio>>& sampledAudio);

TORCH_API bool DecodeMultipleClipsFromVideo(
    const char* video_buffer,
    const std::string& video_filename,
    const int encoded_size,
    const Params& params,
    const int start_frm,
    const int clip_per_video,
    const std::vector<int>& clip_start_positions,
    const bool use_local_file,
    int& height,
    int& width,
    std::vector<unsigned char*>& buffer_rgb);
class CallbackImpl : public Callback {
 public:
  std::vector<std::unique_ptr<DecodedFrame>> frames;
  std::vector<std::unique_ptr<DecodedAudio>> audio_samples;

  explicit CallbackImpl() {
    clear();
  }

  void clear() {
    FreeDecodedData(frames, audio_samples);
  }

  void frameDecoded(std::unique_ptr<DecodedFrame> frame) override {
    frames.push_back(std::move(frame));
  }

  void audioDecoded(std::unique_ptr<DecodedAudio> audio_sample) override {
    audio_samples.push_back(std::move(audio_sample));
  }

  void videoDecodingStarted(const VideoMeta& /*videoMeta*/) override {
    clear();
  }
};

} // namespace caffe2

#endif // CAFFE2_VIDEO_VIDEO_DECODER_H_
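Putting the pieces together, a hedged usage sketch of the removed API (the file path and sampling values are illustrative):

caffe2::Params params;
params.fps(5).maxOutputDimension(256);
caffe2::CallbackImpl cb;
caffe2::VideoDecoder decoder;
decoder.decodeFile("/path/to/clip.mp4", params, /*start_frm=*/0, cb);
// cb.frames now owns the sampled DecodedFrame objects;
// cb.audio_samples is populated only if params.getAudio_ is set.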
@ -1,93 +0,0 @@
#include <caffe2/video/video_input_op.h>

namespace caffe2 {

REGISTER_CPU_OPERATOR(VideoInput, VideoInputOp<CPUContext>);

OPERATOR_SCHEMA(VideoInput)
    .NumInputs(0, 1)
    .NumOutputs(2, 5)
    .TensorInferenceFunction(
        [](const OperatorDef& def,
           const vector<TensorShape>& /* unused */ /*in*/) {
          ArgumentHelper helper(def);
          int batch_size = helper.GetSingleArgument<int>("batch_size", 0);
          int clip_per_video =
              helper.GetSingleArgument<int>("clip_per_video", 1);
          int crop_size = helper.GetSingleArgument<int>("crop_size", -1);
          int length_rgb = helper.GetSingleArgument<int>("length_rgb", 0);
          int channels_rgb = helper.GetSingleArgument<int>("channels_rgb", 3);
          int length_of = helper.GetSingleArgument<int>("length_of", 0);
          int channels_of = helper.GetSingleArgument<int>("channels_of", 2);

          // get the flags
          bool get_rgb = helper.GetSingleArgument<bool>("get_rgb", true);
          bool get_optical_flow =
              helper.GetSingleArgument<bool>("get_optical_flow", false);
          bool do_multi_label =
              helper.GetSingleArgument<bool>("do_multi_label", false);
          bool get_video_id =
              helper.GetSingleArgument<bool>("get_video_id", false);
          bool get_start_frame =
              helper.GetSingleArgument<bool>("get_start_frame", false);
          // get starting positions if available
          vector<int> clip_start_positions =
              helper.GetRepeatedArgument<int>("clip_start_positions", {});
          // In case clip_start_positions are given, set the clip_per_video arg
          if (clip_start_positions.size() > 0) {
            clip_per_video = clip_start_positions.size();
          }

          int output_size = 1;
          if (get_rgb) {
            output_size++;
          }
          if (get_optical_flow) {
            output_size++;
          }
          if (get_video_id) {
            output_size++;
          }
          if (get_start_frame) {
            output_size++;
          }

          int index = 0;
          vector<TensorShape> out(output_size);
          TORCH_CHECK_GT(crop_size, 0);
          batch_size *= clip_per_video;
          if (get_rgb) {
            out[index++] = CreateTensorShape(
                vector<int>{
                    batch_size, channels_rgb, length_rgb, crop_size, crop_size},
                TensorProto::FLOAT);
          }
          if (get_optical_flow) {
            out[index++] = CreateTensorShape(
                vector<int>{
                    batch_size, channels_of, length_of, crop_size, crop_size},
                TensorProto::FLOAT);
          }
          if (!do_multi_label) {
            out[index++] = CreateTensorShape(
                vector<int>{1, batch_size}, TensorProto::INT32);
          } else {
            int num_of_class = helper.GetSingleArgument<int>("num_of_class", 0);
            out[index++] = CreateTensorShape(
                vector<int>{batch_size, num_of_class}, TensorProto::INT32);
          }
          if (get_video_id) {
            out[index++] = CreateTensorShape(
                vector<int64_t>{1, batch_size}, TensorProto::INT64);
          }
          if (get_start_frame) {
            out[index] = CreateTensorShape(
                vector<int>{1, batch_size}, TensorProto::INT32);
          }

          return out;
        });

NO_GRADIENT(VideoInput);

} // namespace caffe2
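To make the inference concrete, a worked trace of the lambda above under illustrative arguments (the values are not from the original sources):

// batch_size = 4, clip_per_video = 2  =>  effective batch = 8
// get_rgb = true, get_video_id = true, do_multi_label = false
// crop_size = 112, length_rgb = 8, channels_rgb = 3
// Outputs, in order:
//   RGB clips : FLOAT [8, 3, 8, 112, 112]
//   labels    : INT32 [1, 8]
//   video ids : INT64 [1, 8]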
File diff suppressed because it is too large
@ -1,9 +0,0 @@
#include <caffe2/core/common_gpu.h>
#include <caffe2/core/context_gpu.h>
#include <caffe2/video/video_input_op.h>

namespace caffe2 {

REGISTER_CUDA_OPERATOR(VideoInput, VideoInputOp<CUDAContext>);

} // namespace caffe2
@ -1,210 +0,0 @@
#include <caffe2/core/logging.h>
#include <caffe2/video/video_io.h>
#include <algorithm>
#include <random>
#include <string>

namespace caffe2 {

void ClipTransformRGB(
    const unsigned char* buffer_rgb,
    const int crop_size,
    const int length_rgb,
    const int channels_rgb,
    const int sampling_rate_rgb,
    const int height,
    const int width,
    const int h_off,
    const int w_off,
    const bool mirror_me,
    const std::vector<float>& mean_rgb,
    const std::vector<float>& inv_std_rgb,
    float* transformed_clip) {
  // The order of output dimensions is C, L, H, W
  int orig_index, tran_index;
  for (int c = 0; c < channels_rgb; ++c) {
    for (int l = 0; l < length_rgb; ++l) {
      int orig_index_l = l * sampling_rate_rgb * height * width * channels_rgb;
      int tran_index_l = (c * length_rgb + l) * crop_size;

      for (int h = 0; h < crop_size; ++h) {
        int orig_index_h = orig_index_l + (h + h_off) * width * channels_rgb;
        int tran_index_h = (tran_index_l + h) * crop_size;

        for (int w = 0; w < crop_size; ++w) {
          orig_index = orig_index_h + (w + w_off) * channels_rgb + c;

          // mirror the frame
          if (mirror_me) {
            tran_index = tran_index_h + (crop_size - 1 - w);
          } else {
            tran_index = tran_index_h + w;
          }

          // normalize and transform the clip
          transformed_clip[tran_index] =
              (buffer_rgb[orig_index] - mean_rgb[c]) * inv_std_rgb[c];
        }
      }
    }
  }
}
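As a quick sanity check on the index arithmetic above, an illustrative trace (values chosen for the example, not from the original sources): with crop_size = 112, length_rgb = 8, channels_rgb = 3 and no mirroring, the output element for (c, l, h, w) = (1, 2, 3, 4) lands at:

// tran_index_l = (c * length_rgb + l) * crop_size = (1 * 8 + 2) * 112 = 1120
// tran_index_h = (tran_index_l + h) * crop_size   = (1120 + 3) * 112  = 125776
// tran_index   = tran_index_h + w                 = 125776 + 4        = 125780
// i.e. a dense C-major [C][L][H][W] layout of size 3 * 8 * 112 * 112.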
void ClipTransformOpticalFlow(
    const unsigned char* buffer_rgb,
    const int crop_size,
    const int length_of,
    const int channels_of,
    const int sampling_rate_of,
    const int height,
    const int width,
    const cv::Rect& rect,
    const int channels_rgb,
    const bool mirror_me,
    const int flow_alg_type,
    const int flow_data_type,
    const int frame_gap_of,
    const bool do_flow_aggregation,
    const std::vector<float>& mean_of,
    const std::vector<float>& inv_std_of,
    float* transformed_clip) {
  const int frame_size = crop_size * crop_size;
  const int channel_size_flow = length_of * frame_size;

  // for getting the mean and std of the input data
  bool extract_statistics = false;
  static std::vector<double> mean_static(channels_of, 0.f);
  static std::vector<double> std_static(channels_of, 0.f);
  static long long count = 0;
  cv::Scalar mean_img, std_img;

  for (int l = 0; l < length_of; l++) {
    // get the grayscale frames
    std::vector<cv::Mat> grays, rgbs;
    int step_size = do_flow_aggregation ? 1 : frame_gap_of;
    for (int j = 0; j <= frame_gap_of; j += step_size) {
      // get the current frame
      const unsigned char* curr_frame = buffer_rgb +
          (l * sampling_rate_of + j) * height * width * channels_rgb;
      cv::Mat img = cv::Mat::zeros(height, width, CV_8UC3);
      memcpy(
          img.data,
          curr_frame,
          height * width * channels_rgb * sizeof(unsigned char));

      // crop and mirror the frame
      cv::Mat img_cropped = img(rect);
      if (mirror_me) {
        cv::flip(img_cropped, img_cropped, 1);
      }

      cv::Mat gray;
      cv::cvtColor(img_cropped, gray, cv::COLOR_RGB2GRAY);
      grays.push_back(gray);
      rgbs.push_back(img_cropped);
    }

    cv::Mat first_gray, first_rgb;
    cv::Mat flow = cv::Mat::zeros(crop_size, crop_size, CV_32FC2);
    MultiFrameOpticalFlowExtractor(grays, flow_alg_type, flow);

    std::vector<cv::Mat> imgs;
    cv::split(flow, imgs);
    // save the 2-channel optical flow first
    int c = 0;
    for (; c < 2; c++) {
      if (extract_statistics) {
        cv::meanStdDev(imgs[c], mean_img, std_img);
        mean_static[c] += mean_img[0];
        std_static[c] += std_img[0];
      }

      imgs[c] -= mean_of[c];
      imgs[c] *= inv_std_of[c];
      memcpy(
          transformed_clip + c * channel_size_flow + l * frame_size,
          imgs[c].data,
          frame_size * sizeof(float));
    }

    cv::Mat mag;
    std::vector<cv::Mat> chans;
    // augment the optical flow with more channels
    switch (flow_data_type) {
      case FlowDataType::Flow2C:
        // nothing to do if we only need two channels
        break;

      case FlowDataType::Flow3C:
        // use magnitude as the third channel
        mag = cv::abs(imgs[0]) + cv::abs(imgs[1]);
        if (extract_statistics) {
          cv::meanStdDev(mag, mean_img, std_img);
          mean_static[c] += mean_img[0];
          std_static[c] += std_img[0];
        }

        mag -= mean_of[c];
        mag *= inv_std_of[c];
        memcpy(
            transformed_clip + c * channel_size_flow + l * frame_size,
            mag.data,
            frame_size * sizeof(float));
        break;

      case FlowDataType::FlowWithGray:
        // add grayscale image as the third channel
        grays[0].convertTo(first_gray, CV_32FC1);
        if (extract_statistics) {
          cv::meanStdDev(first_gray, mean_img, std_img);
          mean_static[c] += mean_img[0];
          std_static[c] += std_img[0];
        }

        first_gray -= mean_of[c];
        first_gray *= inv_std_of[c];
        memcpy(
            transformed_clip + c * channel_size_flow + l * frame_size,
            first_gray.data,
            frame_size * sizeof(float));
        break;

      case FlowDataType::FlowWithRGB:
        // add all three rgb channels
        rgbs[0].convertTo(first_rgb, CV_32FC3);
        cv::split(first_rgb, chans);
        for (; c < channels_of; c++) {
          if (extract_statistics) {
            cv::meanStdDev(chans[c - 2], mean_img, std_img);
            mean_static[c] += mean_img[0];
            std_static[c] += std_img[0];
          }

          chans[c - 2] -= mean_of[c];
          chans[c - 2] *= inv_std_of[c];
          memcpy(
              transformed_clip + c * channel_size_flow + l * frame_size,
              chans[c - 2].data,
              frame_size * sizeof(float));
        }
        break;

      default:
        LOG(ERROR) << "Unsupported optical flow data type " << flow_data_type;
        break;
    }

    if (extract_statistics) {
      count++;
      if (count % 1000 == 1) {
        for (int i = 0; i < channels_of; i++) {
          LOG(INFO) << i
                    << "-th channel mean: " << mean_static[i] / float(count)
                    << " std: " << std_static[i] / float(count);
        }
      }
    }
  }
}

} // namespace caffe2
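Reading off the switch above, the per-frame channel layout written into transformed_clip for each FlowDataType is:

// Flow2C       : [flow_x, flow_y]
// Flow3C       : [flow_x, flow_y, |flow_x| + |flow_y|]
// FlowWithGray : [flow_x, flow_y, grayscale of the first frame]
// FlowWithRGB  : [flow_x, flow_y, R, G, B of the first frame]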
@ -1,51 +0,0 @@
#ifndef CAFFE2_VIDEO_VIDEO_IO_H_
#define CAFFE2_VIDEO_VIDEO_IO_H_

#include <caffe2/core/common.h>
#include <caffe2/video/optical_flow.h>
#include <caffe2/video/video_decoder.h>
#include <opencv2/opencv.hpp>
#include <random>

#include <istream>
#include <ostream>

namespace caffe2 {

TORCH_API void ClipTransformRGB(
    const unsigned char* buffer_rgb,
    const int crop_size,
    const int length_rgb,
    const int channels_rgb,
    const int sampling_rate_rgb,
    const int height,
    const int width,
    const int h_off,
    const int w_off,
    const bool mirror_me,
    const std::vector<float>& mean_rgb,
    const std::vector<float>& inv_std_rgb,
    float* transformed_clip);

TORCH_API void ClipTransformOpticalFlow(
    const unsigned char* buffer_rgb,
    const int crop_size,
    const int length_of,
    const int channels_of,
    const int sampling_rate_of,
    const int height,
    const int width,
    const cv::Rect& rect,
    const int channels_rgb,
    const bool mirror_me,
    const int flow_alg_type,
    const int flow_data_type,
    const int frame_gap_of,
    const bool do_flow_aggregation,
    const std::vector<float>& mean_of,
    const std::vector<float>& inv_std_of,
    float* transformed_clip);

} // namespace caffe2

#endif // CAFFE2_VIDEO_VIDEO_IO_H_
@ -932,45 +932,6 @@ if(USE_REDIS)
  endif()
endif()


# ---[ OpenCV
if(USE_OPENCV)
  # OpenCV 4
  find_package(OpenCV 4 QUIET COMPONENTS core highgui imgproc imgcodecs optflow videoio video)
  if(NOT OpenCV_FOUND)
    # OpenCV 3
    find_package(OpenCV 3 QUIET COMPONENTS core highgui imgproc imgcodecs videoio video)
    if(NOT OpenCV_FOUND)
      # OpenCV 2
      find_package(OpenCV QUIET COMPONENTS core highgui imgproc)
    endif()
  endif()
  if(OpenCV_FOUND)
    include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS})
    list(APPEND Caffe2_DEPENDENCY_LIBS ${OpenCV_LIBS})
    if(MSVC AND USE_CUDA)
      list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${OpenCV_LIBS})
    endif()
    message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})")
  else()
    message(WARNING "Not compiling with OpenCV. Suppress this warning with -DUSE_OPENCV=OFF")
    caffe2_update_option(USE_OPENCV OFF)
  endif()
endif()

# ---[ FFMPEG
if(USE_FFMPEG)
  find_package(FFmpeg REQUIRED)
  if(FFMPEG_FOUND)
    message("Found FFMPEG/LibAV libraries")
    include_directories(SYSTEM ${FFMPEG_INCLUDE_DIR})
    list(APPEND Caffe2_DEPENDENCY_LIBS ${FFMPEG_LIBRARIES})
  else()
    message("Not compiling with FFmpeg. Suppress this warning with -DUSE_FFMPEG=OFF")
    caffe2_update_option(USE_FFMPEG OFF)
  endif()
endif()

if(USE_ITT)
  find_package(ITT)
  if(ITT_FOUND)
@ -1,71 +0,0 @@
# - Try to find ffmpeg libraries
# (libavcodec, libavformat, libavutil, libswscale)
# Once done this will define
#
# FFMPEG_FOUND - system has ffmpeg or libav
# FFMPEG_INCLUDE_DIR - the ffmpeg include directory
# FFMPEG_LIBRARIES - Link these to use ffmpeg
#

if (FFMPEG_LIBRARIES AND FFMPEG_INCLUDE_DIR)
  # in cache already
  set(FFMPEG_FOUND TRUE)
else (FFMPEG_LIBRARIES AND FFMPEG_INCLUDE_DIR)

  find_path(FFMPEG_AVCODEC_INCLUDE_DIR
    NAMES libavcodec/avcodec.h
    PATHS ${_FFMPEG_AVCODEC_INCLUDE_DIRS} /usr/include /usr/local/include /opt/local/include /sw/include
    PATH_SUFFIXES ffmpeg libav
  )

  find_library(FFMPEG_LIBAVCODEC
    NAMES avcodec
    PATHS ${_FFMPEG_AVCODEC_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
  )

  find_library(FFMPEG_LIBAVFORMAT
    NAMES avformat
    PATHS ${_FFMPEG_AVFORMAT_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
  )

  find_library(FFMPEG_LIBAVUTIL
    NAMES avutil
    PATHS ${_FFMPEG_AVUTIL_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
  )

  find_library(FFMPEG_LIBSWSCALE
    NAMES swscale
    PATHS ${_FFMPEG_SWSCALE_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
  )

  find_library(FFMPEG_LIBSWRESAMPLE
    NAMES swresample
    PATHS ${_FFMPEG_SWSCALE_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib
  )

  if (FFMPEG_LIBAVCODEC AND FFMPEG_LIBAVFORMAT)
    set(FFMPEG_FOUND TRUE)
  endif()

  if (FFMPEG_FOUND)
    set(FFMPEG_INCLUDE_DIR ${FFMPEG_AVCODEC_INCLUDE_DIR})

    set(FFMPEG_LIBRARIES
      ${FFMPEG_LIBAVCODEC}
      ${FFMPEG_LIBAVFORMAT}
      ${FFMPEG_LIBAVUTIL}
      ${FFMPEG_LIBSWSCALE}
      ${FFMPEG_LIBSWRESAMPLE}
    )

    if (NOT FFMPEG_FIND_QUIETLY)
      message(STATUS "Found FFMPEG or Libav: ${FFMPEG_LIBRARIES}, ${FFMPEG_INCLUDE_DIR}")
    endif (NOT FFMPEG_FIND_QUIETLY)
  else (FFMPEG_FOUND)
    if (FFMPEG_FIND_REQUIRED)
      message(FATAL_ERROR "Could not find libavcodec or libavformat or libavutil")
    endif (FFMPEG_FIND_REQUIRED)
  endif (FFMPEG_FOUND)

endif (FFMPEG_LIBRARIES AND FFMPEG_INCLUDE_DIR)
@ -128,7 +128,6 @@ function(caffe2_print_configuration_summary)
  message(STATUS "  USE_FBGEMM            : ${USE_FBGEMM}")
  message(STATUS "  USE_FAKELOWP          : ${USE_FAKELOWP}")
  message(STATUS "  USE_KINETO            : ${USE_KINETO}")
  message(STATUS "  USE_FFMPEG            : ${USE_FFMPEG}")
  message(STATUS "  USE_GFLAGS            : ${USE_GFLAGS}")
  message(STATUS "  USE_GLOG              : ${USE_GLOG}")
  message(STATUS "  USE_LEVELDB           : ${USE_LEVELDB}")
@ -164,10 +163,6 @@ function(caffe2_print_configuration_summary)
  message(STATUS "  USE_NUMPY             : ${USE_NUMPY}")
  message(STATUS "  USE_OBSERVERS         : ${USE_OBSERVERS}")
  message(STATUS "  USE_OPENCL            : ${USE_OPENCL}")
  message(STATUS "  USE_OPENCV            : ${USE_OPENCV}")
  if(${USE_OPENCV})
    message(STATUS "    OpenCV version      : ${OpenCV_VERSION}")
  endif()
  message(STATUS "  USE_OPENMP            : ${USE_OPENMP}")
  message(STATUS "  USE_TBB               : ${USE_TBB}")
  if(${USE_TBB})