mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Remove caffe2 from more build files (#125898)
Co-authored-by: Aaron Gokaslan <aaronGokaslan@gmail.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/125898 Approved by: https://github.com/Skylion007
This commit is contained in:
committed by
PyTorch MergeBot
parent
b620231378
commit
b9e7b35912
@ -44,11 +44,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ ${BUILD_ENVIRONMENT} == *"caffe2"* ]]; then
|
||||
echo "Caffe2 build is ON"
|
||||
export BUILD_CAFFE2=ON
|
||||
fi
|
||||
|
||||
if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then
|
||||
export ATEN_THREADING=TBB
|
||||
export USE_TBB=1
|
||||
|
@ -181,11 +181,7 @@ option(BUILD_BINARY "Build C++ binaries" OFF)
|
||||
option(BUILD_DOCS "Build Caffe2 documentation" OFF)
|
||||
option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON)
|
||||
option(BUILD_PYTHON "Build Python binaries" ON)
|
||||
option(BUILD_CAFFE2 "Master flag to build Caffe2" OFF)
|
||||
option(BUILD_LITE_INTERPRETER "Master flag to build Lite Interpreter" OFF)
|
||||
cmake_dependent_option(
|
||||
BUILD_CAFFE2_OPS "Build Caffe2 operators" ON
|
||||
"BUILD_CAFFE2" OFF)
|
||||
option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON)
|
||||
cmake_dependent_option(
|
||||
CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." ON
|
||||
@ -635,7 +631,6 @@ if(INTERN_BUILD_MOBILE)
|
||||
endif()
|
||||
set(BUILD_PYTHON OFF)
|
||||
set(BUILD_FUNCTORCH OFF)
|
||||
set(BUILD_CAFFE2_OPS OFF)
|
||||
set(USE_DISTRIBUTED OFF)
|
||||
set(NO_API ON)
|
||||
set(USE_FBGEMM OFF)
|
||||
@ -1208,13 +1203,6 @@ else()
|
||||
"shared libs.")
|
||||
endif()
|
||||
|
||||
# ---[ Modules
|
||||
# If master flag for buildling Caffe2 is disabled, we also disable the
|
||||
# build for Caffe2 related operator modules.
|
||||
if(BUILD_CAFFE2)
|
||||
add_subdirectory(modules)
|
||||
endif()
|
||||
|
||||
# ---[ Binaries
|
||||
# Binaries will be built after the Caffe2 main libraries and the modules
|
||||
# are built. For the binaries, they will be linked to the Caffe2 main
|
||||
|
@ -667,7 +667,6 @@ only interested in a specific component.
|
||||
- Working on a test binary? Run `(cd build && ninja bin/test_binary_name)` to
|
||||
rebuild only that test binary (without rerunning cmake). (Replace `ninja` with
|
||||
`make` if you don't have ninja installed).
|
||||
- Don't need Caffe2? Pass `BUILD_CAFFE2=0` to disable Caffe2 build.
|
||||
|
||||
On the initial build, you can also speed things up with the environment
|
||||
variables `DEBUG`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_CUDA`, `USE_FLASH_ATTENTION`, `USE_MEM_EFF_ATTENTION`, `BUILD_TEST`, `USE_FBGEMM`, `USE_NNPACK` and `USE_QNNPACK`.
|
||||
@ -1196,7 +1195,7 @@ build_with_asan()
|
||||
LDFLAGS="-stdlib=libstdc++" \
|
||||
CFLAGS="-fsanitize=address -fno-sanitize-recover=all -shared-libasan -pthread" \
|
||||
CXX_FLAGS="-pthread" \
|
||||
USE_CUDA=0 USE_OPENMP=0 BUILD_CAFFE2_OPS=0 USE_DISTRIBUTED=0 DEBUG=1 \
|
||||
USE_CUDA=0 USE_OPENMP=0 USE_DISTRIBUTED=0 DEBUG=1 \
|
||||
python setup.py develop
|
||||
}
|
||||
|
||||
|
@ -379,7 +379,7 @@ You can also pass the `CMAKE_VARS="..."` environment variable to specify additio
|
||||
See [setup.py](./setup.py) for the list of available variables.
|
||||
|
||||
```bash
|
||||
CMAKE_VARS="BUILD_CAFFE2=ON BUILD_CAFFE2_OPS=ON" make -f docker.Makefile
|
||||
make -f docker.Makefile
|
||||
```
|
||||
|
||||
### Building the Documentation
|
||||
|
@ -54,7 +54,7 @@ if(NOT BUILD_LITE_INTERPRETER)
|
||||
endif()
|
||||
EXCLUDE(ATen_CORE_SRCS "${ATen_CORE_SRCS}" ${ATen_CORE_TEST_SRCS})
|
||||
# Exclude TensorImpl_test.cpp if compiling without Caffe2
|
||||
if(NOT BUILD_CAFFE2 AND NOT BUILD_LITE_INTERPRETER)
|
||||
if(NOT BUILD_LITE_INTERPRETER)
|
||||
file(GLOB_RECURSE ATen_CORE_EXCLUDED_TEST_SRCS "core/TensorImpl_test.cpp")
|
||||
EXCLUDE(ATen_CORE_TEST_SRCS "${ATen_CORE_TEST_SRCS}" ${ATen_CORE_EXCLUDED_TEST_SRCS})
|
||||
endif()
|
||||
|
@ -123,16 +123,6 @@ list(APPEND ATen_XPU_TEST_SRCS
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/xpu_generator_test.cpp
|
||||
)
|
||||
|
||||
# Caffe2 specific tests
|
||||
if(BUILD_CAFFE2)
|
||||
list(APPEND ATen_CPU_TEST_SRCS
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/ExclusivelyOwned_test.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/tensor_interop_test.cpp)
|
||||
list(APPEND ATen_CUDA_TEST_SRCS
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cuda_tensor_interop_test.cpp)
|
||||
endif()
|
||||
|
||||
|
||||
# ---[ Send the lists to the parent scope.
|
||||
set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE)
|
||||
set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE)
|
||||
|
@ -7,16 +7,6 @@ if(INTERN_BUILD_MOBILE)
|
||||
return()
|
||||
endif()
|
||||
|
||||
if(BUILD_CAFFE2)
|
||||
caffe2_binary_target("at_launch_benchmark.cc")
|
||||
target_include_directories(at_launch_benchmark PUBLIC
|
||||
${CMAKE_BINARY_DIR}/aten/src)
|
||||
|
||||
caffe2_binary_target("intra_inter_benchmark.cc")
|
||||
target_include_directories(intra_inter_benchmark PUBLIC
|
||||
${CMAKE_BINARY_DIR}/aten/src)
|
||||
endif()
|
||||
|
||||
caffe2_binary_target("parallel_info.cc")
|
||||
target_include_directories(parallel_info PUBLIC
|
||||
${CMAKE_BINARY_DIR}/aten/src) # provides "ATen/TypeExtendedInterface.h" to ATen.h
|
||||
|
@ -279,7 +279,6 @@ def get_pt_preprocessor_flags():
|
||||
"-D_THP_CORE",
|
||||
"-DUSE_SCALARS",
|
||||
"-DNO_CUDNN_DESTROY_HANDLE",
|
||||
"-DBUILD_CAFFE2",
|
||||
]
|
||||
|
||||
if _is_build_mode_dev():
|
||||
|
@ -110,21 +110,11 @@ endif()
|
||||
add_subdirectory(core)
|
||||
add_subdirectory(serialize)
|
||||
add_subdirectory(utils)
|
||||
if(BUILD_CAFFE2 OR (NOT USE_FBGEMM))
|
||||
if(NOT USE_FBGEMM)
|
||||
add_subdirectory(perfkernels)
|
||||
endif()
|
||||
|
||||
# Skip modules that are not used by libtorch mobile yet.
|
||||
if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
|
||||
add_subdirectory(core/nomnigraph)
|
||||
if(USE_NVRTC)
|
||||
add_subdirectory(cuda_rtc)
|
||||
endif()
|
||||
if(BUILD_CAFFE2_OPS)
|
||||
endif()
|
||||
add_subdirectory(proto)
|
||||
endif()
|
||||
if(NOT BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
|
||||
if(NOT INTERN_BUILD_MOBILE)
|
||||
add_subdirectory(proto)
|
||||
endif()
|
||||
|
||||
@ -585,17 +575,10 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
|
||||
${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
|
||||
)
|
||||
|
||||
# Disable legacy import of building without Caffe2 support
|
||||
if(BUILD_CAFFE2)
|
||||
list(APPEND TORCH_SRCS
|
||||
${TORCH_SRC_DIR}/csrc/jit/serialization/import_legacy.cpp
|
||||
)
|
||||
else()
|
||||
set_source_files_properties(
|
||||
${TORCH_SRC_DIR}/csrc/jit/serialization/import.cpp
|
||||
PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT"
|
||||
)
|
||||
endif()
|
||||
set_source_files_properties(
|
||||
${TORCH_SRC_DIR}/csrc/jit/serialization/import.cpp
|
||||
PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT"
|
||||
)
|
||||
if(USE_DISTRIBUTED)
|
||||
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
|
||||
if(NOT WIN32)
|
||||
@ -809,11 +792,6 @@ if(HAVE_SOVERSION)
|
||||
VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
|
||||
endif()
|
||||
torch_compile_options(torch_cpu) # see cmake/public/utils.cmake
|
||||
if(BUILD_CAFFE2 AND NOT MSVC)
|
||||
# Caffe2 has too many signed-unsigned violation, but the framework is dead
|
||||
# So no point in fixing those
|
||||
target_compile_options(torch_cpu PRIVATE "-Wno-sign-compare")
|
||||
endif()
|
||||
|
||||
# Ignore Wdeprecated-XXX errors from third-party libraries
|
||||
if(NOT MSVC)
|
||||
@ -1921,14 +1899,6 @@ if(BUILD_TEST)
|
||||
endif()
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
# For special tests that explicitly uses dependencies, we add them here
|
||||
if(BUILD_CAFFE2 AND USE_MPI)
|
||||
target_link_libraries(mpi_test MPI::MPI_CXX)
|
||||
if(USE_CUDA)
|
||||
target_link_libraries(mpi_gpu_test MPI::MPI_CXX)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(MSVC)
|
||||
@ -1998,11 +1968,6 @@ if(BUILD_PYTHON)
|
||||
set_source_files_properties(${TORCH_SRC_DIR}/../caffe2/operators/box_with_nms_limit_op.cc PROPERTIES COMPILE_FLAGS -Wno-attributes)
|
||||
endif()
|
||||
|
||||
# ---[ Python.
|
||||
if(BUILD_CAFFE2)
|
||||
target_compile_definitions(torch PRIVATE BUILD_CAFFE2)
|
||||
endif()
|
||||
|
||||
# generated pb files are copied from build/caffe2 to caffe2
|
||||
# if we copied them back to build this would create a build cycle
|
||||
# consider removing the need for globs
|
||||
|
@ -2,5 +2,4 @@ import warnings
|
||||
from torch.onnx import _CAFFE2_ATEN_FALLBACK
|
||||
|
||||
if not _CAFFE2_ATEN_FALLBACK:
|
||||
warnings.warn("Caffe2 support is not fully enabled in this PyTorch build. "
|
||||
"Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.")
|
||||
warnings.warn("Caffe2 support is no longer present in PyTorch.")
|
||||
|
@ -1,68 +1,4 @@
|
||||
if(NOT BUILD_CAFFE2 OR INTERN_BUILD_MOBILE)
|
||||
list(APPEND Caffe2_CPU_SRCS
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
|
||||
)
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
|
||||
return()
|
||||
endif()
|
||||
|
||||
# ---[ GPU files
|
||||
# ------[ cuDNN
|
||||
if(USE_CUDNN)
|
||||
file(GLOB tmp *_cudnn.cc)
|
||||
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
|
||||
endif()
|
||||
# ------[ general GPU
|
||||
file(GLOB tmp *_gpu.cc)
|
||||
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
|
||||
# ------[ CUDA sources
|
||||
file(GLOB tmp *.cu)
|
||||
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
|
||||
# exclude test files
|
||||
file(GLOB tmp *_test.cc)
|
||||
exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp})
|
||||
|
||||
# ---[ general HIP files
|
||||
file(GLOB tmp hip/*.cc)
|
||||
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
|
||||
# ------[ HIP sources
|
||||
file(GLOB tmp hip/*.hip)
|
||||
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
|
||||
# exclude test files
|
||||
file(GLOB tmp hip/*_test.cc)
|
||||
exclude(Caffe2_HIP_SRCS "${Caffe2_HIP_SRCS}" ${tmp})
|
||||
|
||||
# ---[ CPU files.
|
||||
file(GLOB tmp *.cc)
|
||||
# Manually remove the cudnn files since we might be using USE_CUDNN=OFF
|
||||
# TODO: when we move to explicit file list, this would not be needed.
|
||||
file(GLOB tmp_cudnn *_cudnn.cc)
|
||||
exclude(tmp "${tmp}" ${tmp_cudnn})
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp})
|
||||
# exclude test files and gpu files
|
||||
file(GLOB tmp *_test.cc)
|
||||
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
|
||||
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS})
|
||||
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_HIP_SRCS})
|
||||
|
||||
# ---[ GPU test files
|
||||
file(GLOB tmp *_gpu_test.cc)
|
||||
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} ${tmp})
|
||||
|
||||
# ---[ HIP test files
|
||||
file(GLOB tmp hip/*_test.cc)
|
||||
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} ${tmp})
|
||||
|
||||
# ---[ CPU test files
|
||||
file(GLOB tmp *_test.cc)
|
||||
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp})
|
||||
exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_GPU_TEST_SRCS})
|
||||
exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_HIP_TEST_SRCS})
|
||||
|
||||
# ---[ Send the lists to the parent scope.
|
||||
list(APPEND Caffe2_CPU_SRCS
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
|
||||
)
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include "caffe2/core/blob_serialization.h"
|
||||
|
||||
#include <limits>
|
||||
#include <mutex>
|
||||
#include <sstream>
|
||||
#include <utility>
|
||||
@ -83,8 +84,7 @@ Range<T*> GetMutableTensorDataRange(
|
||||
size_t start,
|
||||
size_t numElements) {
|
||||
CAFFE_ENFORCE(
|
||||
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
||||
start + numElements <= tensor.numel(),
|
||||
static_cast<int64_t>(start + numElements) <= tensor.numel(),
|
||||
"Requested invalid mutable tensor range [",
|
||||
start,
|
||||
", ",
|
||||
@ -100,8 +100,7 @@ c10::ArrayRef<T> GetTensorDataRange(
|
||||
size_t start,
|
||||
size_t numElements) {
|
||||
CAFFE_ENFORCE(
|
||||
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
||||
start + numElements <= tensor.numel(),
|
||||
static_cast<int64_t>(start + numElements) <= tensor.numel(),
|
||||
"Requested invalid tensor range [",
|
||||
start,
|
||||
", ",
|
||||
@ -390,8 +389,7 @@ void TensorSerializer::SerializeWithOptions(
|
||||
// Poorman's IOBound ThreadPool
|
||||
SimpleQueue<size_t> chunkQueue;
|
||||
auto task = [&]() {
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
||||
size_t chunkStart;
|
||||
size_t chunkStart = std::numeric_limits<size_t>::max();
|
||||
while (chunkQueue.Pop(&chunkStart)) {
|
||||
processChunk(chunkStart);
|
||||
}
|
||||
@ -409,8 +407,7 @@ void TensorSerializer::SerializeWithOptions(
|
||||
VLOG(1) << "Serializing blob " << name;
|
||||
// Serialize whole vector. If vector is empty, it's shape still needs to be
|
||||
// serialized in empty proto
|
||||
for (size_t chunkBegin = 0;
|
||||
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
||||
for (int64_t chunkBegin = 0;
|
||||
chunkBegin < std::max(tensor.numel(), static_cast<int64_t>(1));
|
||||
chunkBegin += chunk_size) {
|
||||
VLOG(2) << "Starting a chunk at " << chunkBegin;
|
||||
@ -582,8 +579,7 @@ void SerializeTensorData(const SerializeParams<float>& params) {
|
||||
BlobSerializationOptions_FloatFormat_FLOAT_BFLOAT16) {
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
|
||||
std::unique_ptr<float[]> tmp_buffer;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
||||
const float* src;
|
||||
const float* src = nullptr;
|
||||
if (params.context.device() == CPU) {
|
||||
src = params.input.data();
|
||||
} else {
|
||||
@ -653,14 +649,12 @@ void TensorSerializer::Serialize(
|
||||
size_t chunkBegin,
|
||||
int32_t chunkSize) {
|
||||
CAFFE_ENFORCE(
|
||||
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
||||
chunkBegin <= input.numel(),
|
||||
static_cast<int64_t>(chunkBegin) <= input.numel(),
|
||||
"Chunk begin is out of tensor: ",
|
||||
chunkBegin,
|
||||
' ',
|
||||
input.numel());
|
||||
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
||||
if (chunkBegin + chunkSize > input.numel()) {
|
||||
if (static_cast<int64_t>(chunkBegin + chunkSize) > input.numel()) {
|
||||
chunkSize = input.numel() - chunkBegin;
|
||||
}
|
||||
|
||||
@ -1029,8 +1023,7 @@ DESERIALIZE_IMPL(float, FMT_BFLOAT16) {
|
||||
params.tensor_proto.raw_data().data());
|
||||
|
||||
// If we are on a big-endian machine, byte-swap the serialized data.
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
||||
const fbgemm::bfloat16* src;
|
||||
const fbgemm::bfloat16* src = nullptr;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
|
||||
std::unique_ptr<fbgemm::bfloat16[]> bswap_buffer;
|
||||
if (kIsLittleEndian) {
|
||||
@ -1045,8 +1038,7 @@ DESERIALIZE_IMPL(float, FMT_BFLOAT16) {
|
||||
// bfloat16 to float conversion.
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
|
||||
std::unique_ptr<float[]> tmp_buffer;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
|
||||
float* dest;
|
||||
float* dest = nullptr;
|
||||
if (params.context.device() == CPU) {
|
||||
dest = params.dest.data();
|
||||
} else {
|
||||
|
@ -1,8 +1,4 @@
|
||||
if(BUILD_CAFFE2)
|
||||
file(GLOB Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.proto")
|
||||
else()
|
||||
set(Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/torch.proto;${CMAKE_CURRENT_SOURCE_DIR}/caffe2.proto")
|
||||
endif()
|
||||
set(Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/torch.proto;${CMAKE_CURRENT_SOURCE_DIR}/caffe2.proto")
|
||||
|
||||
caffe2_protobuf_generate_cpp_py(Caffe2_PROTO_SRCS Caffe2_PROTO_HEADERS Caffe2_PROTO_PY ${Caffe2_PROTOBUF_FILES})
|
||||
|
||||
|
@ -14,8 +14,7 @@ import warnings
|
||||
try:
|
||||
from caffe2.proto import caffe2_pb2, metanet_pb2, torch_pb2
|
||||
except ImportError:
|
||||
warnings.warn('Caffe2 support is not enabled in this PyTorch build. '
|
||||
'Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.')
|
||||
warnings.warn('Caffe2 support is no longer present in PyTorch.')
|
||||
raise
|
||||
|
||||
try:
|
||||
|
@ -6,8 +6,7 @@ import warnings
|
||||
try:
|
||||
from caffe2.proto import caffe2_pb2
|
||||
except ImportError:
|
||||
warnings.warn('Caffe2 support is not enabled in this PyTorch build. '
|
||||
'Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.')
|
||||
warnings.warn('Caffe2 support is no longer present in PyTorch.')
|
||||
raise
|
||||
|
||||
# TODO: refactor & remove the following alias
|
||||
|
@ -1,100 +1,18 @@
|
||||
if(NOT BUILD_CAFFE2 OR INTERN_BUILD_MOBILE)
|
||||
list(APPEND Caffe2_CPU_SRCS
|
||||
utils/string_utils.cc
|
||||
utils/threadpool/ThreadPool.cc
|
||||
)
|
||||
|
||||
if(USE_PTHREADPOOL AND NOT USE_INTERNAL_PTHREADPOOL_IMPL)
|
||||
list(APPEND Caffe2_CPU_SRCS
|
||||
utils/threadpool/pthreadpool-cpp.cc
|
||||
utils/threadpool/thread_pool_guard.cpp
|
||||
)
|
||||
endif()
|
||||
|
||||
if(NOT BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
|
||||
list(APPEND Caffe2_CPU_SRCS
|
||||
utils/proto_wrap.cc
|
||||
)
|
||||
endif()
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
|
||||
return()
|
||||
endif()
|
||||
|
||||
list(APPEND Caffe2_CPU_SRCS
|
||||
utils/bench_utils.cc
|
||||
utils/cpuid.cc
|
||||
utils/math/broadcast.cc
|
||||
utils/math/elementwise.cc
|
||||
utils/math/reduce.cc
|
||||
utils/math/transpose.cc
|
||||
utils/math/utils.cc
|
||||
utils/math_cpu.cc
|
||||
utils/murmur_hash3.cc
|
||||
utils/proto_utils.cc
|
||||
utils/proto_wrap.cc
|
||||
utils/string_utils.cc
|
||||
utils/threadpool/ThreadPool.cc
|
||||
utils/signal_handler.cc
|
||||
utils/smart_tensor_printer.cc
|
||||
utils/string_utils.cc)
|
||||
)
|
||||
|
||||
if(USE_PTHREADPOOL)
|
||||
if(USE_PTHREADPOOL AND NOT USE_INTERNAL_PTHREADPOOL_IMPL)
|
||||
list(APPEND Caffe2_CPU_SRCS
|
||||
utils/threadpool/pthreadpool-cpp.cc
|
||||
utils/threadpool/thread_pool_guard.cpp)
|
||||
if(USE_INTERNAL_PTHREADPOOL_IMPL)
|
||||
list(APPEND Caffe2_CPU_SRCS
|
||||
utils/threadpool/pthreadpool.cc
|
||||
utils/threadpool/pthreadpool_impl.cc)
|
||||
endif()
|
||||
utils/threadpool/thread_pool_guard.cpp
|
||||
)
|
||||
endif()
|
||||
|
||||
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS}
|
||||
utils/math/broadcast.cu
|
||||
utils/math/elementwise.cu
|
||||
utils/math/reduce.cu
|
||||
utils/math/transpose.cu
|
||||
utils/math_gpu.cu
|
||||
)
|
||||
|
||||
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS}
|
||||
utils/math/hip/broadcast.hip
|
||||
utils/math/hip/elementwise.hip
|
||||
utils/math/hip/reduce.hip
|
||||
utils/math/hip/transpose.hip
|
||||
utils/hip/math_gpu.hip
|
||||
)
|
||||
|
||||
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS}
|
||||
utils/fixed_divisor_test.cc
|
||||
utils/math_test.cc
|
||||
utils/fatal_signal_asan_no_sig_test.cc
|
||||
utils/simple_queue_test.cc
|
||||
utils/proto_utils_test.cc
|
||||
utils/smart_tensor_printer_test.cc
|
||||
utils/cast_test.cc
|
||||
)
|
||||
|
||||
if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
|
||||
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS}
|
||||
utils/cpuid_test.cc
|
||||
)
|
||||
if(NOT INTERN_BUILD_MOBILE)
|
||||
list(APPEND Caffe2_CPU_SRCS
|
||||
utils/proto_wrap.cc
|
||||
)
|
||||
endif()
|
||||
|
||||
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS}
|
||||
utils/math_gpu_test.cc
|
||||
)
|
||||
|
||||
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS}
|
||||
utils/hip/math_gpu_test.cc
|
||||
utils/hip/math_blas_gpu_test.cc
|
||||
)
|
||||
|
||||
# TODO Once all source files are defined inside the local c10_utils_xxx targets,
|
||||
# it should be the job of the parent CMakeLists.txt to decide what to do with the target (i.e. link it to caffe2)
|
||||
# instead of us locally adding it to Caffe2_xxx variables.
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)
|
||||
|
@ -1685,9 +1685,6 @@ if(NOT INTERN_BUILD_MOBILE)
|
||||
if(MKLDNN_FOUND)
|
||||
set(AT_MKLDNN_ENABLED 1)
|
||||
include_directories(AFTER SYSTEM ${MKLDNN_INCLUDE_DIR})
|
||||
if(BUILD_CAFFE2_OPS)
|
||||
list(APPEND Caffe2_DEPENDENCY_LIBS caffe2::mkldnn)
|
||||
endif(BUILD_CAFFE2_OPS)
|
||||
else()
|
||||
message(WARNING "MKLDNN could not be found.")
|
||||
caffe2_update_option(USE_MKLDNN OFF)
|
||||
|
@ -23,8 +23,6 @@ function(caffe2_print_configuration_summary)
|
||||
message(STATUS "")
|
||||
|
||||
message(STATUS " TORCH_VERSION : ${TORCH_VERSION}")
|
||||
message(STATUS " BUILD_CAFFE2 : ${BUILD_CAFFE2}")
|
||||
message(STATUS " BUILD_CAFFE2_OPS : ${BUILD_CAFFE2_OPS}")
|
||||
message(STATUS " BUILD_STATIC_RUNTIME_BENCHMARK: ${BUILD_STATIC_RUNTIME_BENCHMARK}")
|
||||
message(STATUS " BUILD_BINARY : ${BUILD_BINARY}")
|
||||
message(STATUS " BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}")
|
||||
|
@ -80,9 +80,6 @@ else()
|
||||
# shared library.
|
||||
# TODO: this list might be incomplete.
|
||||
append_torchlib_if_found(c10)
|
||||
if(@BUILD_CAFFE2@)
|
||||
append_torchlib_if_found(Caffe2_perfkernels_avx512 Caffe2_perfkernels_avx2 Caffe2_perfkernels_avx)
|
||||
endif()
|
||||
|
||||
if(@USE_NNPACK@)
|
||||
append_torchlib_if_found(nnpack)
|
||||
|
@ -1,7 +0,0 @@
|
||||
project(modules CXX C)
|
||||
add_subdirectory(detectron)
|
||||
add_subdirectory(module_test)
|
||||
add_subdirectory(observers)
|
||||
|
||||
# Finally, set Caffe2_MODULES to parent scope.
|
||||
set(Caffe2_MODULES ${Caffe2_MODULES} PARENT_SCOPE)
|
@ -1,57 +0,0 @@
|
||||
file(GLOB Detectron_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc)
|
||||
file(GLOB Detectron_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
|
||||
file(GLOB_RECURSE Detectron_HIP_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.hip)
|
||||
|
||||
if(BUILD_CAFFE2_OPS)
|
||||
# Note(ilijar): Since Detectron ops currently have no
|
||||
# CPU implementation, we only build GPU ops for now.
|
||||
if(USE_CUDA)
|
||||
add_library(
|
||||
caffe2_detectron_ops_gpu SHARED
|
||||
${Detectron_CPU_SRCS}
|
||||
${Detectron_GPU_SRCS})
|
||||
|
||||
target_link_libraries(caffe2_detectron_ops_gpu PRIVATE torch)
|
||||
if(USE_OPENMP)
|
||||
target_link_libraries(caffe2_detectron_ops_gpu PRIVATE caffe2::openmp)
|
||||
endif()
|
||||
|
||||
if(USE_MKLDNN)
|
||||
target_link_libraries(caffe2_detectron_ops_gpu PRIVATE caffe2::mkldnn)
|
||||
endif()
|
||||
install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib)
|
||||
if(MSVC)
|
||||
install(FILES $<TARGET_PDB_FILE:caffe2_detectron_ops_gpu> DESTINATION lib OPTIONAL)
|
||||
endif()
|
||||
elseif(USE_ROCM)
|
||||
hip_include_directories(${Caffe2_HIP_INCLUDES})
|
||||
set_source_files_properties(${Detectron_HIP_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
|
||||
HIP_ADD_LIBRARY(
|
||||
caffe2_detectron_ops_hip SHARED
|
||||
${Detectron_CPU_SRCS}
|
||||
${Detectron_HIP_SRCS})
|
||||
target_compile_options(caffe2_detectron_ops_hip PRIVATE ${HIP_CXX_FLAGS})
|
||||
if(USE_MKLDNN)
|
||||
target_link_libraries(caffe2_detectron_ops_hip PRIVATE caffe2::mkldnn)
|
||||
endif()
|
||||
target_link_libraries(caffe2_detectron_ops_hip PRIVATE torch)
|
||||
install(TARGETS caffe2_detectron_ops_hip DESTINATION lib)
|
||||
elseif(NOT IOS_PLATFORM)
|
||||
add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS})
|
||||
if(HAVE_SOVERSION)
|
||||
set_target_properties(caffe2_detectron_ops PROPERTIES
|
||||
VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
|
||||
endif()
|
||||
target_link_libraries(caffe2_detectron_ops PRIVATE torch)
|
||||
if(USE_OPENMP)
|
||||
target_link_libraries(caffe2_detectron_ops PRIVATE caffe2::openmp)
|
||||
endif()
|
||||
if(USE_MKLDNN)
|
||||
target_link_libraries(caffe2_detectron_ops PRIVATE caffe2::mkldnn)
|
||||
endif()
|
||||
install(TARGETS caffe2_detectron_ops DESTINATION lib)
|
||||
if(MSVC)
|
||||
install(FILES $<TARGET_PDB_FILE:caffe2_detectron_ops> DESTINATION lib OPTIONAL)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
@ -1,83 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "modules/detectron/group_spatial_softmax_op.h"
|
||||
|
||||
#include "caffe2/operators/softmax_utils.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
REGISTER_CPU_OPERATOR(
|
||||
GroupSpatialSoftmax,
|
||||
GroupSpatialSoftmaxOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(
|
||||
GroupSpatialSoftmaxGradient,
|
||||
GroupSpatialSoftmaxGradientOp<float, CPUContext>);
|
||||
|
||||
OPERATOR_SCHEMA(GroupSpatialSoftmax)
|
||||
.NumInputs(1)
|
||||
.NumOutputs(1)
|
||||
.SetDoc(R"DOC(
|
||||
RetinaNet specific form of spatial softmax.
|
||||
|
||||
The input is assumed to be unnormalized scores (sometimes called 'logits')
|
||||
arranged in a 4D tensor with shape (N, C, H, W), where N is the number of
|
||||
elements in the batch, H and W are the height and width, and C = num_anchors *
|
||||
num_classes defines num_anchors 'groups' of softmax inputs, each of length
|
||||
num_classes. The softmax is applied to each group independently.
|
||||
|
||||
See: https://arxiv.org/abs/1708.02002 for details.
|
||||
)DOC")
|
||||
.Arg(
|
||||
"num_classes",
|
||||
"(int) default 81; number of classes in each softmax group.")
|
||||
.Input(
|
||||
0,
|
||||
"scores",
|
||||
"4D tensor of softmax inputs (called 'scores' or 'logits') with shape "
|
||||
"(N, C, H, W), where C = num_anchors * num_classes defines num_anchors "
|
||||
"groups of contiguous num_classes softmax inputs.")
|
||||
.Output(
|
||||
0,
|
||||
"probabilities",
|
||||
"4D tensor of softmax probabilities with shape (N, C, H, W), where "
|
||||
"C = num_anchors * num_classes, and softmax was applied to each of the "
|
||||
"num_anchors groups; within a group the num_classes values sum to 1.");
|
||||
|
||||
OPERATOR_SCHEMA(GroupSpatialSoftmaxGradient)
|
||||
.NumInputs(2)
|
||||
.NumOutputs(1)
|
||||
.Input(0, "scores", "See GroupSpatialSoftmax")
|
||||
.Input(
|
||||
1,
|
||||
"d_probabilities",
|
||||
"Gradient of forward output 0 (probabilities).")
|
||||
.Output(0, "d_scores", "Gradient of forward input 0 (scores).");
|
||||
|
||||
class GetGroupSpatialSoftmaxGradient : public GradientMakerBase {
|
||||
using GradientMakerBase::GradientMakerBase;
|
||||
vector<OperatorDef> GetGradientDefs() override {
|
||||
return SingleGradientDef(
|
||||
"GroupSpatialSoftmaxGradient",
|
||||
"",
|
||||
vector<string>{O(0), GO(0)},
|
||||
vector<string>{GI(0)});
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_GRADIENT(GroupSpatialSoftmax, GetGroupSpatialSoftmaxGradient);
|
||||
|
||||
} // namespace caffe2
|
@ -1,181 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <cfloat>
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "modules/detectron/group_spatial_softmax_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
|
||||
__global__ void GroupSpatialSoftmaxKernel(const int num, const int A, const int W,
|
||||
const int H, const float* Xdata, float* Pdata, const int num_classes) {
|
||||
// Loop through labels (N x A x H x W)
|
||||
CUDA_1D_KERNEL_LOOP(index, num * A * H * W) {
|
||||
int D = num_classes * A;
|
||||
int x = index % W;
|
||||
int y = (index / W) % H;
|
||||
int a = (index / (W * H)) % A;
|
||||
int i = index / W / H / A;
|
||||
|
||||
// Subtract max on each cell for numerical reasons
|
||||
float max_val = -FLT_MAX;
|
||||
for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
|
||||
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
||||
max_val = max(max_val, Xdata[idx]);
|
||||
}
|
||||
// Exponentiate
|
||||
float expsum = 0.0f;
|
||||
for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
|
||||
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
||||
float expx = exp(Xdata[idx] - max_val);
|
||||
Pdata[idx] = expx;
|
||||
expsum += expx;
|
||||
}
|
||||
|
||||
// Normalize
|
||||
for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
|
||||
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
||||
Pdata[idx] /= expsum;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void SumProbsKernel(const int N, const int A, const int W,
|
||||
const int H, const float* Ydata, const float* dYdata,
|
||||
float* sum_probs_data, const int num_classes) {
|
||||
CUDA_1D_KERNEL_LOOP(i, N * A * W * H) {
|
||||
int D = num_classes * A;
|
||||
int x = i % W;
|
||||
int y = (i / W) % H;
|
||||
int a = (i / (W * H)) % A;
|
||||
int n = i / (W * H * A);
|
||||
|
||||
sum_probs_data[i] = 0.0;
|
||||
for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
|
||||
int idx = n * (H * W * D) + c * (H * W) + y * W + x;
|
||||
sum_probs_data[i] += (Ydata[idx] * dYdata[idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void SubSumKernel(
|
||||
const int N, const int A, const int W, const int H,
|
||||
const float* sum_probs_data, float* dXdata, const int num_classes) {
|
||||
CUDA_1D_KERNEL_LOOP(i, N * (A * num_classes) * W * H) {
|
||||
int D = num_classes * A;
|
||||
int x = i % W;
|
||||
int y = (i / W) % H;
|
||||
int a = ((i / (W * H)) % D) / num_classes;
|
||||
int n = i / W / H / D;
|
||||
int idx = n * (H * W * A) + a * (H * W) + y * W + x;
|
||||
dXdata[i] = (dXdata[i] - sum_probs_data[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
// Forward pass: applies a softmax over each group of num_classes_ channels
// independently at every spatial (x, y) location of the NCHW logits.
// Input 0:  X, 4D logits of shape (N, A * num_classes_, H, W).
// Output 0: P, probabilities, same shape as X.
template <>
bool GroupSpatialSoftmaxOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Logits

  // Fix: validate the rank BEFORE reading individual dimensions; the
  // original checked ndim() only after four dim32() calls had already
  // dereferenced dims 0-3.
  TORCH_DCHECK_EQ(X.ndim(), 4);

  int N = X.dim32(0);
  int D = X.dim32(1);
  int H = X.dim32(2);
  int W = X.dim32(3);
  int A = D / num_classes_;  // number of independent softmax groups

  auto* P = Output(0, X.sizes(), at::dtype<float>()); // Probabilities from softmax

  const float* Xdata = X.data<float>();
  float* Pdata = P->mutable_data<float>();

  // Softmax for each x,y location; the kernel loops internally over
  // groups and spatial positions.
  GroupSpatialSoftmaxKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
                              0, context_.cuda_stream()>>>(
      N, A, W, H, Xdata, Pdata, num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
|
||||
|
||||
|
||||
// Backward pass for GroupSpatialSoftmax.
// Computes dX = Y * (dY - sum_j(Y_j * dY_j)) per softmax group at every
// spatial location (the standard softmax Jacobian-vector product),
// using dX itself as the in-place accumulator across the three steps below.
template<>
bool GroupSpatialSoftmaxGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& Y = Input(0); // Probabilities from softmax
  auto& dY = Input(1);

  TORCH_DCHECK_EQ(Y.ndim(), 4);

  int N = Y.dim32(0);
  int D = Y.dim32(1);
  int H = Y.dim32(2);
  int W = Y.dim32(3);
  int A = D / num_classes_;  // number of softmax groups

  auto* dX = Output(0, Y.sizes(), at::dtype<float>());

  // Scratch buffer holding sum_j(Y_j * dY_j) for each (n, a, y, x);
  // reallocated only when the batch geometry changes.
  if (sum_probs_.size() != N * A * H * W) {
    ReinitializeTensor(&sum_probs_, {N * A * H * W}, at::dtype<float>().device(CUDA));
  }

  const float* Ydata = Y.data<float>();
  const float* dYdata = dY.data<float>();
  float* dXdata = dX->mutable_data<float>();

  float* sum_probs_data = sum_probs_.mutable_data<float>();
  math::Set<float, CUDAContext>(
      sum_probs_.size(), 0.0f, sum_probs_data, &context_);

  // Complete math:
  // J_ij = h_i (delta_ij - h_j)
  // d x_i = sum_j d h_ij = sum_j J_ij * dy_j
  //       = sum_j h_i (delta_ij - h_j) * dy_j
  //       = h_i dy_i - (sum_j h_i h_j dy_j)
  //       = h_i dy_i - h_i sum_j h_j dy_j

  // Step 0: dx = dy
  context_.Copy<float, CUDAContext, CUDAContext>(Y.size(), dYdata, dXdata);

  // Step 1: s = Sum(dY[j] * Y[j])  (per group, per location)
  SumProbsKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS, 0,
                   context_.cuda_stream()>>>(
      N, A, W, H, Ydata, dYdata, sum_probs_data, num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Step 2: dX[i] = dX[i] - s
  SubSumKernel<<<CAFFE_GET_BLOCKS(Y.size()), CAFFE_CUDA_NUM_THREADS, 0,
                 context_.cuda_stream()>>>(
      N, A, W, H, sum_probs_.data<float>(), dXdata, num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Step 3: dX[i] = Y[i] * dX[i]
  math::Mul<float, CUDAContext>(Y.size(), dXdata, Ydata, dXdata, &context_);

  return true;
}
|
||||
|
||||
|
||||
// Register the CUDA implementations of the forward and backward operators.
REGISTER_CUDA_OPERATOR(GroupSpatialSoftmax,
                       GroupSpatialSoftmaxOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(GroupSpatialSoftmaxGradient,
                       GroupSpatialSoftmaxGradientOp<float, CUDAContext>);
|
||||
} // namespace caffe2
|
@ -1,76 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef GROUP_SPATIAL_SOFTMAX_OP_H_
|
||||
#define GROUP_SPATIAL_SOFTMAX_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Operator that applies a softmax independently over each group of
// `num_classes` channels at every spatial (x, y) location of an NCHW blob.
// Only the CUDA specialization implements RunOnDevice; the generic (CPU)
// version aborts.
template <typename T, class Context>
class GroupSpatialSoftmaxOp final : public Operator<Context> {
 public:
  GroupSpatialSoftmaxOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        num_classes_(this->template GetSingleArgument<int>("num_classes", 81)),
        order_(StringToStorageOrder(
            this->template GetSingleArgument<string>("order", "NCHW"))) {
    CAFFE_ENFORCE_EQ(
        order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  int num_classes_;     // channels per softmax group (default 81)
  StorageOrder order_;  // storage order; only NCHW is accepted
};
|
||||
|
||||
// Backward operator for GroupSpatialSoftmax. Only the CUDA specialization
// implements RunOnDevice; the generic (CPU) version aborts.
template <typename T, class Context>
class GroupSpatialSoftmaxGradientOp final : public Operator<Context> {
 public:
  GroupSpatialSoftmaxGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        num_classes_(this->template GetSingleArgument<int>("num_classes", 81)),
        order_(StringToStorageOrder(
            this->template GetSingleArgument<string>("order", "NCHW"))) {
    CAFFE_ENFORCE_EQ(
        order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  int num_classes_;     // channels per softmax group (default 81)
  StorageOrder order_;  // storage order; only NCHW is accepted
  Tensor sum_probs_;    // scratch buffer reused across runs by the CUDA impl
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // GROUP_SPATIAL_SOFTMAX_OP_H_
|
@ -1,106 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "ps_roi_pool_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Register CPU stubs; the actual forward/backward bodies are CUDA-only
// (the CPU RunOnDevice aborts with CAFFE_NOT_IMPLEMENTED).
REGISTER_CPU_OPERATOR(PSRoIPool, PSRoIPoolOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    PSRoIPoolGradient,
    PSRoIPoolGradientOp<float, CPUContext>);
|
||||
|
||||
// Declarative schema / documentation for the forward operator.
OPERATOR_SCHEMA(PSRoIPool)
    .NumInputs(2)
    .NumOutputs(2)
    .SetDoc(R"DOC(
Position Sensitive Region of Interest Pooling as used in R-FCN.
)DOC")
    .Arg(
        "spatial_scale",
        "(float) default 1.0; Spatial scale of the input feature map X "
        "relative to the input image. E.g., 0.0625 if X has a stride of 16 "
        "w.r.t. the input image.")
    .Arg(
        "group_size",
        "(int) default 1; pooled_h = pooled_w = group_size where pooled_{h,w} "
        "is the pooled output Y's height and width, respectively.")
    .Arg(
        "output_dim",
        "(int) default 1; number of channels in the pooled output, which might "
        "be the number of classes is used for classification or 4 if used for "
        "class agnostic bounding box regression.")
    .Input(
        0,
        "X",
        "4D position sensitive feature map input of shape (N, C, H, W), where "
        "C = group_size**2 * output_dim.")
    .Input(
        1,
        "RoIs",
        "2D input of shape (R, 5) specifying R RoIs with five columns "
        "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI "
        "coordinates are in the coordinate system of the input image.")
    .Output(
        0,
        "Y",
        "4D output of shape (R, output_dim, pooled_h, pooled_w). The r-th "
        "batch element is a pooled feature map cooresponding to the r-th RoI.")
    .Output(
        1,
        "argmaxes",
        "4D output of shape (R, output_dim, pooled_h, pooled_w). Same as Y, "
        "except it records the argmax indices rather than the max pooled "
        "values.");

// Schema for the backward operator; inputs mirror what the gradient maker
// below wires up (X, RoIs, forward output 1, dY).
OPERATOR_SCHEMA(PSRoIPoolGradient)
    .NumInputs(4)
    .NumOutputs(1)
    .Input(
        0,
        "X",
        "See PSRoIPool.")
    .Input(
        1,
        "RoIs",
        "See PSRoIPool.")
    .Input(
        2,
        "argmaxes",
        "See PSRoIPool.")
    .Input(
        3,
        "dY",
        "Gradient of forward output 0 (Y)")
    .Output(
        0,
        "dX",
        "Gradient of forward input 0 (X)");
|
||||
|
||||
// Gradient maker: wires PSRoIPoolGradient to consume X (I(0)), RoIs (I(1)),
// the forward mapping-channel output (O(1)) and the upstream gradient
// (GO(0)), producing the gradient of X (GI(0)).
class GetPSRoIPoolGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "PSRoIPoolGradient",
        "",
        vector<string>{I(0), I(1), O(1), GO(0)},
        vector<string>{GI(0)});
  }
};

REGISTER_GRADIENT(PSRoIPool, GetPSRoIPoolGradient);
|
||||
|
||||
} // namespace caffe2
|
@ -1,289 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Based on https://github.com/daijifeng001/caffe-rfcn/blob/r-fcn/src/caffe/layers/psroi_pooling_layer.cu
|
||||
//
|
||||
// ------------------------------------------------------------------
|
||||
// R-FCN
|
||||
// Copyright (c) 2016 Microsoft
|
||||
// Licensed under The MIT License [see r-fcn/LICENSE for details]
|
||||
// Written by Yi Li
|
||||
// ------------------------------------------------------------------
|
||||
//
|
||||
// COPYRIGHT
|
||||
//
|
||||
// All contributions by the University of California:
|
||||
// Copyright (c) 2014, 2015, The Regents of the University of California
|
||||
// (Regents)
|
||||
// All rights reserved.
|
||||
//
|
||||
// All other contributions:
|
||||
// Copyright (c) 2014, 2015, the respective contributors
|
||||
// All rights reserved.
|
||||
//
|
||||
// Caffe uses a shared copyright model: each contributor holds copyright over
|
||||
// their contributions to Caffe. The project versioning records all such
|
||||
// contribution and copyright details. If a contributor wants to further mark
|
||||
// their specific copyright on a particular contribution, they should indicate
|
||||
// their copyright solely in the commit message of the change when it is
|
||||
// committed.
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
// 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
// POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// CONTRIBUTION AGREEMENT
|
||||
//
|
||||
// By contributing to the BVLC/caffe repository through pull-request, comment,
|
||||
// or otherwise, the contributor releases their content to the
|
||||
// license and copyright terms herein.
|
||||
|
||||
#include <cfloat>
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "modules/detectron/ps_roi_pool_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
|
||||
// Type-dispatched device-side atomic add; only the float specialization is
// provided (wraps CUDA's native atomicAdd).
template <typename T>
inline __device__ T gpu_atomic_add(const T val, T* address);

template <>
inline __device__
float gpu_atomic_add(const float val, float* address) {
  return atomicAdd(address, val);
}
|
||||
|
||||
// Forward pass of position-sensitive RoI pooling (R-FCN).
// Each output element (n, ctop, ph, pw) average-pools one spatial bin of the
// input channel c = (ctop * group_size + ph) * group_size + pw, so every
// pooled cell reads from a different, position-specific channel. The chosen
// channel is recorded in mapping_channel for the backward pass.
template <typename T>
__global__ void PSRoIPoolForward(
    const int nthreads,
    const T* bottom_data,
    const T spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const T* bottom_rois,
    const int output_dim,
    const int group_size,
    T* top_data,
    int* mapping_channel) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // The output is in order (n, ctop, ph, pw)
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int ctop = (index / pooled_width / pooled_height) % output_dim;
    int n = index / pooled_width / pooled_height / output_dim;

    // [start, end) interval for spatial sampling.
    // Each RoI row is (batch_index, x1, y1, x2, y2) in input-image coords.
    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    T roi_start_w = static_cast<T>(
      roundf(offset_bottom_rois[1])) * spatial_scale;
    T roi_start_h = static_cast<T>(
      roundf(offset_bottom_rois[2])) * spatial_scale;
    T roi_end_w = static_cast<T>(
      roundf(offset_bottom_rois[3]) + 1.) * spatial_scale;
    T roi_end_h = static_cast<T>(
      roundf(offset_bottom_rois[4]) + 1.) * spatial_scale;

    // Force too small ROIs to be 1x1
    T roi_width = c10::cuda::compat::max(roi_end_w - roi_start_w, static_cast<T>(0.1)); // avoid 0
    T roi_height = c10::cuda::compat::max(roi_end_h - roi_start_h, static_cast<T>(0.1));

    // Compute w and h at bottom
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);

    // Add roi offsets and clip to input boundaries
    int hstart = floor(
      static_cast<T>(ph) * bin_size_h + roi_start_h);
    int wstart = floor(
      static_cast<T>(pw)* bin_size_w + roi_start_w);
    int hend = ceil(
      static_cast<T>(ph + 1) * bin_size_h + roi_start_h);
    int wend = ceil(
      static_cast<T>(pw + 1) * bin_size_w + roi_start_w);

    hstart = min(max(hstart, 0), height);
    hend = min(max(hend, 0), height);
    wstart = min(max(wstart, 0),width);
    wend = min(max(wend, 0), width);
    bool is_empty = (hend <= hstart) || (wend <= wstart);

    // Position-sensitive channel selection: bin (ph, pw) of class ctop
    // reads channel (ctop * group_size + ph) * group_size + pw.
    int gw = pw;
    int gh = ph;
    int c = (ctop * group_size + gh) * group_size + gw;

    const T* offset_bottom_data =
      bottom_data + (roi_batch_ind * channels + c) * height * width;
    T out_sum = 0;
    // Average-pool the bin.
    for (int h = hstart; h < hend; ++h){
      for (int w = wstart; w < wend; ++w){
        int bottom_index = h*width + w;
        out_sum += offset_bottom_data[bottom_index];
      }
    }

    T bin_area = (hend - hstart) * (wend - wstart);
    top_data[index] = is_empty ? 0. : out_sum / bin_area;
    // Record the source channel so the backward pass can route gradients.
    mapping_channel[index] = c;
  }
}
|
||||
|
||||
// Backward pass of position-sensitive RoI pooling: spreads each output
// gradient uniformly over the pixels of its bin (since the forward pass
// averaged them), into the channel recorded in mapping_channel.
template <typename T>
__global__ void PSRoIPoolBackward(
    const int nthreads,
    const T* top_diff,
    const int* mapping_channel,
    const int num_rois,
    const T spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int output_dim,
    T* bottom_diff,
    const T* bottom_rois) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // The output is in order (n, ctop, ph, pw)
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int n = index / pooled_width / pooled_height / output_dim;

    // [start, end) interval for spatial sampling; must recompute the same
    // bin geometry as the forward pass.
    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    T roi_start_w = static_cast<T>(
      roundf(offset_bottom_rois[1])) * spatial_scale;
    T roi_start_h = static_cast<T>(
      roundf(offset_bottom_rois[2])) * spatial_scale;
    T roi_end_w = static_cast<T>(
      roundf(offset_bottom_rois[3]) + 1.) * spatial_scale;
    T roi_end_h = static_cast<T>(
      roundf(offset_bottom_rois[4]) + 1.) * spatial_scale;

    // Force too small ROIs to be 1x1
    T roi_width = c10::cuda::compat::max(roi_end_w - roi_start_w, static_cast<T>(0.1)); //avoid 0
    T roi_height = c10::cuda::compat::max(roi_end_h - roi_start_h, static_cast<T>(0.1));

    // Compute w and h at bottom
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);

    int hstart = floor(
      static_cast<T>(ph)* bin_size_h + roi_start_h);
    int wstart = floor(
      static_cast<T>(pw)* bin_size_w + roi_start_w);
    int hend = ceil(
      static_cast<T>(ph + 1) * bin_size_h + roi_start_h);
    int wend = ceil(
      static_cast<T>(pw + 1) * bin_size_w + roi_start_w);
    // Add roi offsets and clip to input boundaries
    hstart = min(max(hstart, 0), height);
    hend = min(max(hend, 0), height);
    wstart = min(max(wstart, 0), width);
    wend = min(max(wend, 0), width);
    bool is_empty = (hend <= hstart) || (wend <= wstart);

    // Compute c at bottom: channel chosen by the forward pass.
    int c = mapping_channel[index];
    T* offset_bottom_diff =
      bottom_diff + (roi_batch_ind * channels + c) * height * width;
    T bin_area = (hend - hstart) * (wend - wstart);
    // Each pixel of the bin receives an equal share of the gradient;
    // atomics are needed because bins from different RoIs may overlap.
    T diff_val = is_empty ? 0. : top_diff[index] / bin_area;
    for (int h = hstart; h < hend; ++h){
      for (int w = wstart; w < wend; ++w){
        int bottom_index = h * width + w;
        gpu_atomic_add(diff_val, offset_bottom_diff + bottom_index);
      }
    }
  }
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Forward pass of PSRoIPool on the GPU: produces position-sensitively pooled
// features (output 0) and the per-cell source-channel map (output 1) that the
// backward pass consumes.
template<>
bool PSRoIPoolOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Input data to pool
  auto& R = Input(1); // RoIs

  if (R.size() == 0) {
    // Handle empty rois: emit correctly-shaped empty outputs and skip the
    // kernel launch. Fix: the original launched the kernel with a zero-sized
    // grid in this case; this also makes the op consistent with RoIPoolFOp.
    std::vector<int64_t> sizes = {0, output_dim_, pooled_height_, pooled_width_};
    /* auto* Y = */ Output(0, sizes, at::dtype<float>());
    /* auto* A = */ Output(1, sizes, at::dtype<int>());
    return true;
  }

  auto* Y = Output(0, {R.dim32(0), output_dim_, pooled_height_, pooled_width_}, at::dtype<float>()); // PSRoI pooled data
  auto* A = Output(1, Y->sizes(), at::dtype<int>()); // mapping_channel
  int output_size = Y->numel();
  // One thread per output element.
  PSRoIPoolForward<float><<<CAFFE_GET_BLOCKS(output_size),
                            CAFFE_CUDA_NUM_THREADS,
                            0, context_.cuda_stream()>>>(
      output_size, X.data<float>(), spatial_scale_, X.dim32(1), X.dim32(2),
      X.dim32(3), pooled_height_, pooled_width_, R.data<float>(), output_dim_,
      group_size_, Y->mutable_data<float>(), A->mutable_data<int>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
|
||||
|
||||
|
||||
// Backward pass of PSRoIPool: scatter-adds the bin-averaged output gradient
// back into the input gradient, routed by the forward pass's channel map.
template<>
bool PSRoIPoolGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);  // Input data to pool
  auto& R = Input(1);  // RoIs
  auto& A = Input(2);  // mapping channels
  auto& dY = Input(3); // Gradient of net w.r.t. output of "forward" op
                       // (aka "gradOutput")

  auto* dX = Output(0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to "forward" op
                                                       // (aka "gradInput")
  float* dXdata = dX->mutable_data<float>();

  // Gradients are accumulated with atomics, so dX must start from zero.
  math::Set<float, CUDAContext>(dX->size(), 0.f, dXdata, &context_);

  const int grad_count = dY.size();
  PSRoIPoolBackward<float><<<CAFFE_GET_BLOCKS(grad_count),
                             CAFFE_CUDA_NUM_THREADS,
                             0, context_.cuda_stream()>>>(
      grad_count, dY.data<float>(), A.data<int>(), R.dim32(0), spatial_scale_,
      X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_,
      output_dim_, dXdata, R.data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
|
||||
|
||||
|
||||
// Register the CUDA implementations of the forward and backward operators.
REGISTER_CUDA_OPERATOR(PSRoIPool,
                       PSRoIPoolOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(PSRoIPoolGradient,
                       PSRoIPoolGradientOp<float, CUDAContext>);
|
||||
} // namespace caffe2
|
@ -1,93 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef PS_ROI_POOL_OP_H_
|
||||
#define PS_ROI_POOL_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Position-sensitive RoI pooling operator (R-FCN). The pooled output is
// group_size x group_size; input X is expected to carry
// group_size^2 * output_dim channels. Only the CUDA specialization
// implements RunOnDevice; the generic (CPU) version aborts.
template <typename T, class Context>
class PSRoIPoolOp final : public Operator<Context> {
 public:
  PSRoIPoolOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        spatial_scale_(this->template GetSingleArgument<float>(
              "spatial_scale", 1.)),
        group_size_(this->template GetSingleArgument<int>("group_size", 1)),
        output_dim_(this->template GetSingleArgument<int>("output_dim", 1)) {
    TORCH_DCHECK_GT(spatial_scale_, 0);
    TORCH_DCHECK_GT(group_size_, 0);
    // Pooled output is square with side group_size.
    pooled_height_ = group_size_;
    pooled_width_ = group_size_;
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float spatial_scale_; // feature-map scale relative to the input image
  int group_size_;      // side length of the pooled output grid
  int output_dim_;      // number of pooled output channels
  int pooled_height_;   // == group_size_
  int pooled_width_;    // == group_size_
  int channels_;
  int height_;
  int width_;
};
|
||||
|
||||
// Backward operator for PSRoIPool; mirrors the forward op's arguments.
// Only the CUDA specialization implements RunOnDevice; the generic (CPU)
// version aborts.
template <typename T, class Context>
class PSRoIPoolGradientOp final : public Operator<Context> {
 public:
  PSRoIPoolGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        spatial_scale_(this->template GetSingleArgument<float>(
              "spatial_scale", 1.)),
        group_size_(this->template GetSingleArgument<int>("group_size", 1)),
        output_dim_(this->template GetSingleArgument<int>("output_dim", 1)) {
    TORCH_DCHECK_GT(spatial_scale_, 0);
    TORCH_DCHECK_GT(group_size_, 0);
    // Pooled output is square with side group_size.
    pooled_height_ = group_size_;
    pooled_width_ = group_size_;
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float spatial_scale_; // feature-map scale relative to the input image
  int group_size_;      // side length of the pooled output grid
  int output_dim_;      // number of pooled output channels
  int pooled_height_;   // == group_size_
  int pooled_width_;    // == group_size_
  int channels_;
  int height_;
  int width_;
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // PS_ROI_POOL_OP_H_
|
@ -1,99 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "roi_pool_f_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Register CPU stubs (actual bodies are CUDA-only) and declare the schemas.
REGISTER_CPU_OPERATOR(RoIPoolF, RoIPoolFOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(RoIPoolFGradient, RoIPoolFGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(RoIPoolF)
    .NumInputs(2)
    .NumOutputs(2)
    .SetDoc(R"DOC(
Region of Interest (RoI) pooling operation as used in Fast R-CNN.
)DOC")
    .Arg(
        "spatial_scale",
        "(float) default 1.0; Spatial scale of the input feature map X "
        "relative to the input image. E.g., 0.0625 if X has a stride of 16 "
        "w.r.t. the input image.")
    .Arg(
        "pooled_h",
        "(int) default 1; Pooled output Y's height.")
    .Arg(
        "pooled_w",
        "(int) default 1; Pooled output Y's width.")
    .Input(
        0,
        "X",
        "4D feature map input of shape (N, C, H, W).")
    .Input(
        1,
        "RoIs",
        "2D input of shape (R, 5) specifying R RoIs with five columns "
        "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI "
        "coordinates are in the coordinate system of the input image.")
    .Output(
        0,
        "Y",
        "4D output of shape (R, C, pooled_h, pooled_w). The r-th batch element "
        "is a pooled feature map cooresponding to the r-th RoI.")
    .Output(
        1,
        "argmaxes",
        "4D output of shape (R, C, pooled_h, pooled_w). Same as Y, except it "
        "records the argmax indices rather than the max pooled values.");

// Schema for the backward operator; inputs mirror what the gradient maker
// below wires up (X, RoIs, forward output 1, dY).
OPERATOR_SCHEMA(RoIPoolFGradient)
    .NumInputs(4)
    .NumOutputs(1)
    .Input(
        0,
        "X",
        "See RoIPoolF.")
    .Input(
        1,
        "RoIs",
        "See RoIPoolF.")
    .Input(
        2,
        "argmaxes",
        "See RoIPoolF.")
    .Input(
        3,
        "dY",
        "Gradient of forward output 0 (Y)")
    .Output(
        0,
        "dX",
        "Gradient of forward input 0 (X)");
|
||||
|
||||
// Gradient maker: wires RoIPoolFGradient to consume X (I(0)), RoIs (I(1)),
// the forward argmax output (O(1)) and the upstream gradient (GO(0)),
// producing the gradient of X (GI(0)).
class GetRoIPoolFGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "RoIPoolFGradient",
        "",
        vector<string>{I(0), I(1), O(1), GO(0)},
        vector<string>{GI(0)});
  }
};

REGISTER_GRADIENT(RoIPoolF, GetRoIPoolFGradient);
|
||||
|
||||
} // namespace caffe2
|
@ -1,187 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <cfloat>
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "modules/detectron/roi_pool_f_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
|
||||
// Type-dispatched device-side atomic add; only the float specialization is
// provided (wraps CUDA's native atomicAdd).
template <typename T>
inline __device__ T gpu_atomic_add(const T val, T* address);

template <>
inline __device__
float gpu_atomic_add(const float val, float* address) {
  return atomicAdd(address, val);
}
|
||||
|
||||
// Forward pass of max RoI pooling (Fast R-CNN): each output cell takes the
// max over its spatial bin and records the winning input index in
// argmax_data for the backward pass.
template <typename T>
__global__ void RoIPoolFForward(const int nthreads, const T* bottom_data,
    const T spatial_scale, const int channels, const int height,
    const int width, const int pooled_height, const int pooled_width,
    const T* bottom_rois, T* top_data, int* argmax_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    // Each RoI row is (batch_index, x1, y1, x2, y2) in input-image coords.
    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    int roi_start_w = roundf(offset_bottom_rois[1] * spatial_scale);
    int roi_start_h = roundf(offset_bottom_rois[2] * spatial_scale);
    int roi_end_w = roundf(offset_bottom_rois[3] * spatial_scale);
    int roi_end_h = roundf(offset_bottom_rois[4] * spatial_scale);

    // Force malformed ROIs to be 1x1
    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
    T bin_size_h = static_cast<T>(roi_height)
                   / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width)
                   / static_cast<T>(pooled_width);

    int hstart = static_cast<int>(floor(static_cast<T>(ph)
                                        * bin_size_h));
    int wstart = static_cast<int>(floor(static_cast<T>(pw)
                                        * bin_size_w));
    int hend = static_cast<int>(ceil(static_cast<T>(ph + 1)
                                     * bin_size_h));
    int wend = static_cast<int>(ceil(static_cast<T>(pw + 1)
                                     * bin_size_w));

    // Add roi offsets and clip to input boundaries
    hstart = min(max(hstart + roi_start_h, 0), height);
    hend = min(max(hend + roi_start_h, 0), height);
    wstart = min(max(wstart + roi_start_w, 0), width);
    wend = min(max(wend + roi_start_w, 0), width);
    bool is_empty = (hend <= hstart) || (wend <= wstart);

    // Define an empty pooling region to be zero
    T maxval = is_empty ? 0 : -FLT_MAX;
    // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
    int maxidx = -1;
    const T* offset_bottom_data =
      bottom_data + (roi_batch_ind * channels + c) * height * width;
    // Max-pool the bin, tracking the winning index.
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        int bottom_index = h * width + w;
        if (offset_bottom_data[bottom_index] > maxval) {
          maxval = offset_bottom_data[bottom_index];
          maxidx = bottom_index;
        }
      }
    }
    top_data[index] = maxval;
    argmax_data[index] = maxidx;
  }
}
|
||||
|
||||
// Backward pass of max RoI pooling: routes each output gradient to the
// single input element chosen by the forward argmax.
template <typename T>
__global__ void RoIPoolFBackward(const int nthreads, const T* top_diff,
    const int* argmax_data, const int num_rois, const T spatial_scale,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, T* bottom_diff,
    const T* bottom_rois) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    int bottom_offset = (roi_batch_ind * channels + c) * height * width;
    int top_offset = (n * channels + c) * pooled_height * pooled_width;
    const T* offset_top_diff = top_diff + top_offset;
    T* offset_bottom_diff = bottom_diff + bottom_offset;
    const int* offset_argmax_data = argmax_data + top_offset;

    int argmax = offset_argmax_data[ph * pooled_width + pw];
    // argmax == -1 marks an empty bin (nothing was pooled forward), so no
    // gradient flows. Atomic add because RoIs may overlap in the input.
    if (argmax != -1) {
      gpu_atomic_add(
        static_cast<T>(offset_top_diff[ph * pooled_width + pw]),
        offset_bottom_diff + argmax);
    }
  }
}
|
||||
|
||||
} // namespace
|
||||
|
||||
template<>
|
||||
bool RoIPoolFOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0); // Input data to pool
|
||||
auto& R = Input(1); // RoIs
|
||||
|
||||
if (R.size() == 0) {
|
||||
// Handle empty rois
|
||||
std::vector<int64_t> sizes = {0, X.dim32(1), pooled_height_, pooled_width_};
|
||||
/* auto* Y = */ Output(0, sizes, at::dtype<float>());
|
||||
/* auto* A = */ Output(1, sizes, at::dtype<int>());
|
||||
return true;
|
||||
}
|
||||
|
||||
auto* Y = Output(0, {R.dim32(0), X.dim32(1), pooled_height_, pooled_width_}, at::dtype<float>()); // RoI pooled data
|
||||
auto* A = Output(1, Y->sizes(), at::dtype<int>()); // argmaxes
|
||||
int output_size = Y->size();
|
||||
RoIPoolFForward<float><<<CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
output_size, X.data<float>(), spatial_scale_, X.dim32(1), X.dim32(2),
|
||||
X.dim32(3), pooled_height_, pooled_width_, R.data<float>(),
|
||||
Y->mutable_data<float>(), A->mutable_data<int>());
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
bool RoIPoolFGradientOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0); // Input data to pool
|
||||
auto& R = Input(1); // RoIs
|
||||
auto& A = Input(2); // argmaxes
|
||||
auto& dY = Input(3); // Gradient of net w.r.t. output of "forward" op
|
||||
// (aka "gradOutput")
|
||||
|
||||
auto* dX = Output(0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to "forward" op
|
||||
// (aka "gradInput")
|
||||
// Must zero-out dX before accumulating gradients
|
||||
math::Set<float, CUDAContext>(
|
||||
dX->size(), 0.f, dX->mutable_data<float>(), &context_);
|
||||
if (dY.size() > 0) { // Handle possibly empty gradient if there were no rois
|
||||
RoIPoolFBackward<float><<<CAFFE_GET_BLOCKS(dY.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
dY.size(), dY.data<float>(), A.data<int>(), R.dim32(0), spatial_scale_,
|
||||
X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_,
|
||||
dX->mutable_data<float>(), R.data<float>());
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
REGISTER_CUDA_OPERATOR(RoIPoolF,
|
||||
RoIPoolFOp<float, CUDAContext>);
|
||||
REGISTER_CUDA_OPERATOR(RoIPoolFGradient,
|
||||
RoIPoolFGradientOp<float, CUDAContext>);
|
||||
} // namespace caffe2
|
@ -1,81 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef ROI_POOL_F_OP_H_
|
||||
#define ROI_POOL_F_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename T, class Context>
|
||||
class RoIPoolFOp final : public Operator<Context> {
|
||||
public:
|
||||
RoIPoolFOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws),
|
||||
spatial_scale_(this->template GetSingleArgument<float>(
|
||||
"spatial_scale", 1.)),
|
||||
pooled_height_(this->template GetSingleArgument<int>("pooled_h", 1)),
|
||||
pooled_width_(this->template GetSingleArgument<int>("pooled_w", 1)) {
|
||||
TORCH_DCHECK_GT(spatial_scale_, 0);
|
||||
TORCH_DCHECK_GT(pooled_height_, 0);
|
||||
TORCH_DCHECK_GT(pooled_width_, 0);
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// No CPU implementation for now
|
||||
CAFFE_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
protected:
|
||||
float spatial_scale_;
|
||||
int pooled_height_;
|
||||
int pooled_width_;
|
||||
};
|
||||
|
||||
template <typename T, class Context>
|
||||
class RoIPoolFGradientOp final : public Operator<Context> {
|
||||
public:
|
||||
RoIPoolFGradientOp(const OperatorDef& def, Workspace* ws)
|
||||
: Operator<Context>(def, ws),
|
||||
spatial_scale_(this->template GetSingleArgument<float>(
|
||||
"spatial_scale", 1.)),
|
||||
pooled_height_(this->template GetSingleArgument<int>("pooled_h", 1)),
|
||||
pooled_width_(this->template GetSingleArgument<int>("pooled_w", 1)) {
|
||||
TORCH_DCHECK_GT(spatial_scale_, 0);
|
||||
TORCH_DCHECK_GT(pooled_height_, 0);
|
||||
TORCH_DCHECK_GT(pooled_width_, 0);
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// No CPU implementation for now
|
||||
CAFFE_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
protected:
|
||||
float spatial_scale_;
|
||||
int pooled_height_;
|
||||
int pooled_width_;
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // ROI_POOL_F_OP_H_
|
@ -1,81 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "sample_as_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
REGISTER_CPU_OPERATOR(SampleAs, SampleAsOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(SampleAsGradient, SampleAsGradientOp<float, CPUContext>);
|
||||
|
||||
OPERATOR_SCHEMA(SampleAs)
|
||||
.NumInputs(2)
|
||||
.NumOutputs(1)
|
||||
.SetDoc(R"DOC(
|
||||
Select the batch elements from input tensor X where the corresponding input
|
||||
label value is > 0.
|
||||
)DOC")
|
||||
.Input(
|
||||
0,
|
||||
"X",
|
||||
"Tensor of at least 1D shape (N, ...).")
|
||||
.Input(
|
||||
1,
|
||||
"labels",
|
||||
"Tensor of type int with 1D shape (N, ).")
|
||||
.Output(
|
||||
0,
|
||||
"Y",
|
||||
"Tensor with number of dims matching X, but with the length of dim 0 "
|
||||
"equal to the number of non-zero elements in labels. The batch items "
|
||||
"from X corresponding to the non-zero elements in labels are copied "
|
||||
"into Y.");
|
||||
|
||||
OPERATOR_SCHEMA(SampleAsGradient)
|
||||
.NumInputs(3)
|
||||
.NumOutputs(1)
|
||||
.Input(
|
||||
0,
|
||||
"X",
|
||||
"See SampleAs.")
|
||||
.Input(
|
||||
1,
|
||||
"labels",
|
||||
"See SampleAs."
|
||||
)
|
||||
.Input(
|
||||
2,
|
||||
"dY",
|
||||
"Gradient of forward output 0 (Y).")
|
||||
.Output(
|
||||
0,
|
||||
"dX",
|
||||
"Gradient of forward input 0 (X).");
|
||||
|
||||
class GetSampleAsGradient : public GradientMakerBase {
|
||||
using GradientMakerBase::GradientMakerBase;
|
||||
vector<OperatorDef> GetGradientDefs() override {
|
||||
return SingleGradientDef(
|
||||
"SampleAsGradient",
|
||||
"",
|
||||
vector<string>{I(0), I(1), GO(0)},
|
||||
vector<string>{GI(0)});
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_GRADIENT(SampleAs, GetSampleAsGradient);
|
||||
|
||||
} // namespace caffe2
|
@ -1,116 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* SampleAs by Kaiming He for Mask R-CNN
|
||||
X.dim32(0) = L.dim32(0)
|
||||
Y's output samples are the samples of X for which L > 0.
|
||||
*/
|
||||
#include <cfloat>
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "modules/detectron/sample_as_op.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <>
|
||||
bool SampleAsOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0); // Input data to be sliced
|
||||
auto& L = Input(1); // Target data that provide the identity
|
||||
|
||||
CAFFE_ENFORCE(
|
||||
X.dim32(0) == L.dim32(0),
|
||||
"X.dim32(0) must be equal to L.dim32(0)",
|
||||
"(",
|
||||
X.dim32(0),
|
||||
" vs. ",
|
||||
L.dim32(0),
|
||||
")");
|
||||
|
||||
// copy L to CPU:
|
||||
std::vector<int> labels(L.dim32(0));
|
||||
context_.CopyBytes<CUDAContext, CPUContext>(
|
||||
L.dim32(0) * sizeof(int), L.data<int>(), &labels[0]);
|
||||
// Make sure that the copy is finished
|
||||
context_.FinishDeviceComputation();
|
||||
|
||||
int count = 0;
|
||||
for (int i = 0; i < L.dim32(0); i++) {
|
||||
if (labels[i] > 0) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
assert(count > 0);
|
||||
|
||||
// resize Y
|
||||
vector<int64_t> out_shape(X.sizes().vec());
|
||||
out_shape[0] = count;
|
||||
auto* Y = Output(0, out_shape, at::dtype<float>()); // Sliced data (Y.dim32(0) = num of (L > 0))
|
||||
|
||||
const int len = X.size() / X.dim32(0);
|
||||
|
||||
float* output = Y->mutable_data<float>();
|
||||
for (int i = 0; i < L.dim32(0); i++) {
|
||||
if (labels[i] > 0) {
|
||||
context_.CopyBytes<CUDAContext, CUDAContext>(
|
||||
len * sizeof(float), X.data<float>() + i * len, output);
|
||||
output += len;
|
||||
} // if
|
||||
} // i
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool SampleAsGradientOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto& L = Input(1);
|
||||
auto& dY = Input(2);
|
||||
|
||||
|
||||
auto* dX = Output(0, X.sizes(), at::dtype<float>());
|
||||
|
||||
// copy L to CPU:
|
||||
std::vector<int> labels(L.dim32(0));
|
||||
context_.CopyBytes<CUDAContext, CPUContext>(
|
||||
L.dim32(0) * sizeof(int), L.data<int>(), &labels[0]);
|
||||
// Make sure that the copy is finished
|
||||
context_.FinishDeviceComputation();
|
||||
|
||||
// zero-out dX
|
||||
math::Set<float, CUDAContext>(
|
||||
dX->size(), 0.f, dX->mutable_data<float>(), &context_);
|
||||
|
||||
const int len = X.size() / X.dim32(0);
|
||||
|
||||
const float* input = dY.data<float>();
|
||||
for (int i = 0; i < L.dim32(0); i++) {
|
||||
if (labels[i] > 0) {
|
||||
context_.CopyBytes<CUDAContext, CUDAContext>(
|
||||
len * sizeof(float), input, dX->mutable_data<float>() + i * len);
|
||||
input += len;
|
||||
} // if
|
||||
} // i
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
REGISTER_CUDA_OPERATOR(SampleAs, SampleAsOp<float, CUDAContext>);
|
||||
REGISTER_CUDA_OPERATOR(
|
||||
SampleAsGradient,
|
||||
SampleAsGradientOp<float, CUDAContext>);
|
||||
} // namespace caffe2
|
@ -1,55 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef SAMPLE_AS_OP_H_
|
||||
#define SAMPLE_AS_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename T, class Context>
|
||||
class SampleAsOp final : public Operator<Context> {
|
||||
public:
|
||||
SampleAsOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws) {}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// No CPU implementation for now
|
||||
CAFFE_NOT_IMPLEMENTED;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, class Context>
|
||||
class SampleAsGradientOp final : public Operator<Context> {
|
||||
public:
|
||||
SampleAsGradientOp(const OperatorDef& def, Workspace* ws)
|
||||
: Operator<Context>(def, ws) {}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// No CPU implementation for now
|
||||
CAFFE_NOT_IMPLEMENTED;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // SAMPLE_AS_OP_H_
|
@ -1,107 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "select_smooth_l1_loss_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
REGISTER_CPU_OPERATOR(
|
||||
SelectSmoothL1Loss,
|
||||
SelectSmoothL1LossOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(
|
||||
SelectSmoothL1LossGradient,
|
||||
SelectSmoothL1LossGradientOp<float, CPUContext>);
|
||||
|
||||
OPERATOR_SCHEMA(SelectSmoothL1Loss)
|
||||
.NumInputs(4)
|
||||
.NumOutputs(1)
|
||||
.SetDoc(R"DOC(
|
||||
RetinaNet specific op for computing Smooth L1 Loss at select locations in a 4D
|
||||
tensor that encodes bounding box regression predictions.
|
||||
)DOC")
|
||||
.Arg(
|
||||
"beta",
|
||||
"(float) default 1.0; L2 to L1 transition point.")
|
||||
.Arg(
|
||||
"scale",
|
||||
"(float) default 1.0; multiply the loss by this scale factor.")
|
||||
.Input(
|
||||
0,
|
||||
"Y_hat",
|
||||
"4D tensor of bounding box regression predictions with shape "
|
||||
"(N, 4 * num_bbox_classes * num_anchors, H, W).")
|
||||
.Input(
|
||||
1,
|
||||
"Y",
|
||||
"2D tensor of labels shape (M, 4) for 4 contiguous channels starting "
|
||||
"at each of the M locations selected by the locations input.")
|
||||
.Input(
|
||||
2,
|
||||
"locations",
|
||||
"2D tensor of shape (M, 4) that identifies M 'select' locations "
|
||||
"encoded by the four columns: (n, c, y, x). The loss is computed on the "
|
||||
"four contiguous channel locations [c, c + 3] (inclusive).")
|
||||
.Input(
|
||||
3,
|
||||
"normalizer",
|
||||
"Scalar; the loss is divided by max(1, normalizer).")
|
||||
.Output(
|
||||
0,
|
||||
"loss",
|
||||
"Scalar loss.");
|
||||
|
||||
OPERATOR_SCHEMA(SelectSmoothL1LossGradient)
|
||||
.NumInputs(5)
|
||||
.NumOutputs(1)
|
||||
.Input(
|
||||
0,
|
||||
"Y_hat",
|
||||
"See SelectSmoothL1Loss.")
|
||||
.Input(
|
||||
1,
|
||||
"Y",
|
||||
"See SelectSmoothL1Loss.")
|
||||
.Input(
|
||||
2,
|
||||
"locations",
|
||||
"See SelectSmoothL1Loss.")
|
||||
.Input(
|
||||
3,
|
||||
"normalizer",
|
||||
"See SelectSmoothL1Loss.")
|
||||
.Input(
|
||||
4,
|
||||
"d_loss",
|
||||
"Gradient of forward output 0 (loss).")
|
||||
.Output(
|
||||
0,
|
||||
"d_Y_hat",
|
||||
"Gradient of forward input 0 (Y_hat).");
|
||||
|
||||
class GetSelectSmoothL1LossGradient : public GradientMakerBase {
|
||||
using GradientMakerBase::GradientMakerBase;
|
||||
vector<OperatorDef> GetGradientDefs() override {
|
||||
return SingleGradientDef(
|
||||
"SelectSmoothL1LossGradient",
|
||||
"",
|
||||
vector<string>{I(0), I(1), I(2), I(3), GO(0)},
|
||||
vector<string>{GI(0)});
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_GRADIENT(SelectSmoothL1Loss, GetSelectSmoothL1LossGradient);
|
||||
|
||||
} // namespace caffe2
|
@ -1,189 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "modules/detectron/select_smooth_l1_loss_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
__global__ void SelectSmoothL1Kernel(
|
||||
const int D, const int H, const int W,
|
||||
const int M, const float* Y_hat, const float* Y, const float* L, float* out,
|
||||
const float* S, const float beta) {
|
||||
// f(x) = 0.5 * x^2 / beta if |x| < beta
|
||||
// |x| - 0.5 * beta otherwise
|
||||
CUDA_1D_KERNEL_LOOP(i, M) {
|
||||
int n = L[i * 4];
|
||||
int c = L[i * 4 + 1];
|
||||
int y = L[i * 4 + 2];
|
||||
int x = L[i * 4 + 3];
|
||||
|
||||
for (int j = 0; j < 4; j++){
|
||||
// Y_hat: N x (A * CLS * 4) x H x W
|
||||
int ind = n * (D * H * W) + (c + j) * (H * W) + y * W + x;
|
||||
float y_hat = Y_hat[ind];
|
||||
float y = Y[i * 4 + j];
|
||||
float val = y_hat - y;
|
||||
float abs_val = c10::cuda::compat::abs(val);
|
||||
if (abs_val < beta) {
|
||||
out[ind] = (0.5 * val * val / beta) / c10::cuda::compat::max(S[0], static_cast<float>(1.0));
|
||||
} else {
|
||||
out[ind] = (abs_val - 0.5 * beta) / c10::cuda::compat::max(S[0], static_cast<float>(1.0));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__global__ void SelectSmoothL1GradientKernel(
|
||||
const int D, const int H, const int W,
|
||||
const int M,
|
||||
const float* Y_hat,
|
||||
const float* Y,
|
||||
const float* L,
|
||||
float* out,
|
||||
const float* d_loss_data,
|
||||
float norm,
|
||||
const float* S,
|
||||
float beta) {
|
||||
// f'(x) = x / beta if |x| < beta
|
||||
// = sign(x) otherwise
|
||||
// We also scale by norm * d_loss in this kernel for convenience
|
||||
CUDA_1D_KERNEL_LOOP(i, M) {
|
||||
int n = L[i * 4];
|
||||
int c = L[i * 4 + 1];
|
||||
int y = L[i * 4 + 2];
|
||||
int x = L[i * 4 + 3];
|
||||
float d_loss = *d_loss_data;
|
||||
|
||||
for (int j = 0; j < 4; j++) {
|
||||
int ind = n * (D * H * W) + (c + j) * (H * W) + y * W + x;
|
||||
float y_hat = Y_hat[ind];
|
||||
float y = Y[i * 4 + j];
|
||||
float val = y_hat - y;
|
||||
float abs_val = c10::cuda::compat::abs(val);
|
||||
if (abs_val < beta) {
|
||||
out[ind] = norm * d_loss * val / beta / c10::cuda::compat::max(S[0], static_cast<float>(1.0));
|
||||
} else {
|
||||
out[ind] = norm * d_loss * ((float(0) < val) - (val < float(0))) / c10::cuda::compat::max(S[0], static_cast<float>(1.0));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
|
||||
template<>
|
||||
bool SelectSmoothL1LossOp<float, CUDAContext>::RunOnDevice() {
|
||||
// bbox targets predictions, for example: N x (A * 4) H x W in cls-agnostic case
|
||||
auto& Y_hat = Input(0);
|
||||
// true targets: for example: M x 4 where M is the #fg boxes per fpn level
|
||||
auto& Y = Input(1);
|
||||
// locations of fg boxes: M x 4
|
||||
auto& L = Input(2);
|
||||
// total number of fg boxes across all FPN levels: scalar
|
||||
auto& S = Input(3);
|
||||
|
||||
|
||||
auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
|
||||
if (Y.size() == 0){
|
||||
math::Set<float, CUDAContext>(
|
||||
1, static_cast<float>(0), avg_loss->mutable_data<float>(), &context_);
|
||||
return true;
|
||||
}
|
||||
|
||||
int N = Y_hat.dim32(0);
|
||||
int D = Y_hat.dim32(1);
|
||||
int H = Y_hat.dim32(2);
|
||||
int W = Y_hat.dim32(3);
|
||||
|
||||
int M = Y.dim32(0);
|
||||
|
||||
// initialization
|
||||
buff_.ResizeLike(Y_hat);
|
||||
math::Set<float, CUDAContext>(
|
||||
1, static_cast<float>(0), avg_loss->mutable_data<float>(), &context_);
|
||||
math::Set<float, CUDAContext>(
|
||||
buff_.size(), 0.0, buff_.mutable_data<float>(), &context_);
|
||||
|
||||
// Element-wise smooth l1 loss
|
||||
// l := SelectSmoothL1((y_hat - y))
|
||||
SelectSmoothL1Kernel<<<CAFFE_GET_BLOCKS(buff_.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
D, H, W,
|
||||
M, Y_hat.data<float>(), Y.data<float>(),
|
||||
L.data<float>(), buff_.mutable_data<float>(),
|
||||
S.data<float>(), beta_);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
// Sum of all losses
|
||||
// al := sum_i l_i
|
||||
float* avg_loss_data = avg_loss->mutable_data<float>();
|
||||
math::Sum<float, CUDAContext>(
|
||||
buff_.size(), buff_.data<float>(), avg_loss_data, &context_);
|
||||
|
||||
// Average of input batch size
|
||||
math::Scale<float, float, CUDAContext>(
|
||||
1, scale_, avg_loss_data, avg_loss_data, &context_);
|
||||
return true;
|
||||
}
|
||||
|
||||
template<>
|
||||
bool SelectSmoothL1LossGradientOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& Y_hat = Input(0);
|
||||
auto& Y = Input(1);
|
||||
auto& L = Input(2);
|
||||
auto& S = Input(3);
|
||||
// Below is gradient of net w.r.t. avg_loss ("gradOutput"), should be all 1's
|
||||
auto& d_avg_loss = Input(4);
|
||||
|
||||
auto* d_Y_hat = Output(0, Y_hat.sizes(), at::dtype<float>()); // gradient of net w.r.t. Y_hat ("gradInput")
|
||||
math::Set<float, CUDAContext>(
|
||||
d_Y_hat->size(), 0.0, d_Y_hat->mutable_data<float>(), &context_);
|
||||
if (Y.size() == 0){
|
||||
return true;
|
||||
}
|
||||
|
||||
int N = Y_hat.dim32(0);
|
||||
int D = Y_hat.dim32(1);
|
||||
int H = Y_hat.dim32(2);
|
||||
int W = Y_hat.dim32(3);
|
||||
|
||||
int M = Y.dim32(0);
|
||||
// Element-wise weighted difference (can be used to ignore or reweight
|
||||
// specific components)
|
||||
// d := (y_hat - y)
|
||||
// d_Y_hat := d_avg_loss * SelectSmoothL1'((y_hat - y))
|
||||
|
||||
SelectSmoothL1GradientKernel<<<CAFFE_GET_BLOCKS(d_Y_hat->size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
D, H, W, M, Y_hat.data<float>(), Y.data<float>(),
|
||||
L.data<float>(), d_Y_hat->mutable_data<float>(),
|
||||
d_avg_loss.data<float>(), scale_, S.data<float>(), beta_);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
REGISTER_CUDA_OPERATOR(SelectSmoothL1Loss,
|
||||
SelectSmoothL1LossOp<float, CUDAContext>);
|
||||
REGISTER_CUDA_OPERATOR(SelectSmoothL1LossGradient,
|
||||
SelectSmoothL1LossGradientOp<float, CUDAContext>);
|
||||
} // namespace caffe2
|
@ -1,77 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef SELECT_SMOOTH_L1_LOSS_OP_H_
|
||||
#define SELECT_SMOOTH_L1_LOSS_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename T, class Context>
|
||||
class SelectSmoothL1LossOp final : public Operator<Context> {
|
||||
public:
|
||||
SelectSmoothL1LossOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws),
|
||||
beta_(this->template GetSingleArgument<float>("beta", 1.)),
|
||||
scale_(this->template GetSingleArgument<float>("scale", 1.)) {
|
||||
CAFFE_ENFORCE(beta_ > 0);
|
||||
CAFFE_ENFORCE(scale_ >= 0);
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// No CPU implementation for now
|
||||
CAFFE_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
protected:
|
||||
float beta_; // Transition point from L1 to L2 loss
|
||||
float scale_; // Scale the loss by scale_
|
||||
int dim_; // dimension for 1 anchor prediction
|
||||
Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
|
||||
};
|
||||
|
||||
template <typename T, class Context>
|
||||
class SelectSmoothL1LossGradientOp final : public Operator<Context> {
|
||||
public:
|
||||
SelectSmoothL1LossGradientOp(const OperatorDef& def, Workspace* ws)
|
||||
: Operator<Context>(def, ws),
|
||||
beta_(this->template GetSingleArgument<float>("beta", 1.)),
|
||||
scale_(this->template GetSingleArgument<float>("scale", 1.)) {
|
||||
CAFFE_ENFORCE(beta_ > 0);
|
||||
CAFFE_ENFORCE(scale_ >= 0);
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// No CPU implementation for now
|
||||
CAFFE_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
protected:
|
||||
float beta_; // Transition point from L1 to L2 loss
|
||||
float scale_; // Scale the loss by scale_
|
||||
int dim_; // dimension for 1 anchor prediction
|
||||
Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // SELECT_SMOOTH_L1_LOSS_OP_H_
|
@ -1,96 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "sigmoid_cross_entropy_loss_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
REGISTER_CPU_OPERATOR(
|
||||
SigmoidCrossEntropyLoss,
|
||||
SigmoidCrossEntropyLossOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(
|
||||
SigmoidCrossEntropyLossGradient,
|
||||
SigmoidCrossEntropyLossGradientOp<float, CPUContext>);
|
||||
|
||||
OPERATOR_SCHEMA(SigmoidCrossEntropyLoss)
|
||||
.NumInputs(2)
|
||||
.NumOutputs(1)
|
||||
.SetDoc(R"DOC(
|
||||
Compute sigmoid activations followed by averaged binary cross entropy loss. The
|
||||
target values may be in {-1, 0, 1}, where -1 indicates that the corresponding
|
||||
sample should be ignored and {0, 1} correspond to the binary classes 0 and 1. By
|
||||
default the loss is divided by the number of targets > -1 and then multiplied by
|
||||
the `scale` op argument. The divisive normalization may be disable by setting
|
||||
the op argument `normalize` to 0 (the multiplication by `scale` still takes
|
||||
effect).
|
||||
|
||||
This op fuses sigmoid and cross entropy for numerical stability in both forward
|
||||
and gradient computation.
|
||||
)DOC")
|
||||
.Arg(
|
||||
"scale",
|
||||
"(float) default 1.0; multiply the loss by this scale factor.")
|
||||
.Arg(
|
||||
"normalize",
|
||||
"(int) default 1; if true, divide the loss by the number of targets > "
|
||||
"-1.")
|
||||
.Input(
|
||||
0,
|
||||
"X",
|
||||
"Tensor of predicted logits (shape must be at least 1D).")
|
||||
.Input(
|
||||
1,
|
||||
"targets",
|
||||
"Tensor of targets of type int and same shape as logits X.")
|
||||
.Output(
|
||||
0,
|
||||
"loss",
|
||||
"Scalar loss.");
|
||||
|
||||
OPERATOR_SCHEMA(SigmoidCrossEntropyLossGradient)
|
||||
.NumInputs(3)
|
||||
.NumOutputs(1)
|
||||
.Input(
|
||||
0,
|
||||
"X",
|
||||
"See SigmoidCrossEntropyLoss.")
|
||||
.Input(
|
||||
1,
|
||||
"targets",
|
||||
"See SigmoidCrossEntropyLoss.")
|
||||
.Input(
|
||||
2,
|
||||
"d_loss",
|
||||
"Gradient of forward output 0 (loss).")
|
||||
.Output(
|
||||
0,
|
||||
"dX",
|
||||
"Gradient of forward input 0 (X).");
|
||||
|
||||
class GetSigmoidCrossEntropyLossGradient : public GradientMakerBase {
|
||||
using GradientMakerBase::GradientMakerBase;
|
||||
vector<OperatorDef> GetGradientDefs() override {
|
||||
return SingleGradientDef(
|
||||
"SigmoidCrossEntropyLossGradient",
|
||||
"",
|
||||
vector<string>{I(0), I(1), GO(0)},
|
||||
vector<string>{GI(0)});
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_GRADIENT(SigmoidCrossEntropyLoss, GetSigmoidCrossEntropyLossGradient);
|
||||
|
||||
} // namespace caffe2
|
@ -1,190 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "modules/detectron/sigmoid_cross_entropy_loss_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
__global__ void ElementwiseMaxKernel(const int n, float* data, const float a) {
|
||||
CUDA_1D_KERNEL_LOOP(index, n) {
|
||||
data[index] = (data[index] > a) ? data[index] : a;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void SigmoidCrossEntropyLossKernel(
|
||||
const int n,
|
||||
const float* logits,
|
||||
const int* targets,
|
||||
float* losses,
|
||||
float* counts) {
|
||||
CUDA_1D_KERNEL_LOOP(index, n) {
|
||||
if (targets[index] == -1) {
|
||||
losses[index] = 0.;
|
||||
counts[index] = 0.;
|
||||
} else {
|
||||
losses[index] =
|
||||
-1. * logits[index] * (targets[index] - (logits[index] >= 0)) +
|
||||
logf(
|
||||
1 +
|
||||
expf(logits[index] - 2 * logits[index] * (logits[index] >= 0)));
|
||||
counts[index] = 1.;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void SigmoidCrossEntropyLossGradientKernel(
|
||||
const int n,
|
||||
const float* logits,
|
||||
const int* targets,
|
||||
float* d_logits,
|
||||
float* counts) {
|
||||
CUDA_1D_KERNEL_LOOP(index, n) {
|
||||
if (targets[index] == -1) {
|
||||
d_logits[index] = 0.;
|
||||
counts[index] = 0.;
|
||||
} else {
|
||||
d_logits[index] = 1. / (1. + expf(-logits[index])) - targets[index];
|
||||
counts[index] = 1.;
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
bool SigmoidCrossEntropyLossOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto& T = Input(1);
|
||||
|
||||
|
||||
CAFFE_ENFORCE(
|
||||
X.size() == T.size(),
|
||||
"Logit and target must have the same size",
|
||||
"(",
|
||||
X.size(),
|
||||
" vs. ",
|
||||
T.size(),
|
||||
")");
|
||||
auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
|
||||
counts_.ResizeLike(X);
|
||||
losses_.ResizeLike(X);
|
||||
ReinitializeTensor(&normalizer_, vector<int64_t>(), at::dtype<float>().device(CUDA));
|
||||
SigmoidCrossEntropyLossKernel<<<
|
||||
CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
X.size(),
|
||||
X.data<float>(),
|
||||
T.data<int>(),
|
||||
losses_.mutable_data<float>(),
|
||||
counts_.mutable_data<float>());
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
float* avg_loss_data = avg_loss->mutable_data<float>();
|
||||
math::Sum<float, CUDAContext>(
|
||||
losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
|
||||
if (normalize_) {
|
||||
float* normalizer_data = normalizer_.mutable_data<float>();
|
||||
math::Sum<float, CUDAContext>(
|
||||
counts_.size(), counts_.data<float>(), normalizer_data, &context_);
|
||||
// Prevent division by zero is all counts are zero
|
||||
ElementwiseMaxKernel<<<
|
||||
CAFFE_GET_BLOCKS(normalizer_.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
math::Div<float, CUDAContext>(
|
||||
1, avg_loss_data, normalizer_data, avg_loss_data, &context_);
|
||||
}
|
||||
math::Scale<float, float, CUDAContext>(
|
||||
1, scale_, avg_loss_data, avg_loss_data, &context_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool SigmoidCrossEntropyLossGradientOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto& T = Input(1);
|
||||
auto& d_avg_loss = Input(2);
|
||||
|
||||
|
||||
auto* dX = Output(0, X.sizes(), at::dtype<float>());
|
||||
counts_.ResizeLike(X);
|
||||
ReinitializeTensor(&normalizer_, vector<int64_t>(), at::dtype<float>().device(CUDA));
|
||||
SigmoidCrossEntropyLossGradientKernel<<<
|
||||
CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
X.size(),
|
||||
X.data<float>(),
|
||||
T.data<int>(),
|
||||
dX->mutable_data<float>(),
|
||||
counts_.mutable_data<float>());
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
if (normalize_) {
|
||||
float* normalizer_data = normalizer_.mutable_data<float>();
|
||||
math::Sum<float, CUDAContext>(
|
||||
counts_.size(), counts_.data<float>(), normalizer_data, &context_);
|
||||
// Prevent division by zero is all counts are zero
|
||||
ElementwiseMaxKernel<<<
|
||||
CAFFE_GET_BLOCKS(normalizer_.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
math::Div<float, CUDAContext>(
|
||||
1,
|
||||
d_avg_loss.data<float>(),
|
||||
normalizer_data,
|
||||
normalizer_data,
|
||||
&context_);
|
||||
math::Scale<float, float, CUDAContext>(
|
||||
1, scale_, normalizer_data, normalizer_data, &context_);
|
||||
math::Scale<float, float, CUDAContext>(
|
||||
dX->size(),
|
||||
normalizer_data,
|
||||
dX->data<float>(),
|
||||
dX->mutable_data<float>(),
|
||||
&context_);
|
||||
} else {
|
||||
math::Scale<float, float, CUDAContext>(
|
||||
dX->size(),
|
||||
scale_,
|
||||
dX->data<float>(),
|
||||
dX->mutable_data<float>(),
|
||||
&context_);
|
||||
math::Scale<float, float, CUDAContext>(
|
||||
dX->size(),
|
||||
d_avg_loss.data<float>(),
|
||||
dX->data<float>(),
|
||||
dX->mutable_data<float>(),
|
||||
&context_);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
REGISTER_CUDA_OPERATOR(
|
||||
SigmoidCrossEntropyLoss,
|
||||
SigmoidCrossEntropyLossOp<float, CUDAContext>);
|
||||
REGISTER_CUDA_OPERATOR(
|
||||
SigmoidCrossEntropyLossGradient,
|
||||
SigmoidCrossEntropyLossGradientOp<float, CUDAContext>);
|
||||
} // namespace caffe2
|
@ -1,78 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef SIGMOID_CROSS_ENTROPY_LOSS_OP_H_
|
||||
#define SIGMOID_CROSS_ENTROPY_LOSS_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename T, class Context>
|
||||
class SigmoidCrossEntropyLossOp final : public Operator<Context> {
|
||||
public:
|
||||
SigmoidCrossEntropyLossOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws),
|
||||
scale_(this->template GetSingleArgument<float>("scale", 1.)),
|
||||
normalize_(this->template GetSingleArgument<int>("normalize", 1)) {
|
||||
CAFFE_ENFORCE(scale_ >= 0);
|
||||
CAFFE_ENFORCE(normalize_ == 0 || normalize_ == 1);
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// No CPU implementation for now
|
||||
CAFFE_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
protected:
|
||||
float scale_;
|
||||
int normalize_;
|
||||
Tensor losses_{Context::GetDeviceType()};
|
||||
Tensor counts_{Context::GetDeviceType()};
|
||||
Tensor normalizer_;
|
||||
};
|
||||
|
||||
template <typename T, class Context>
|
||||
class SigmoidCrossEntropyLossGradientOp final : public Operator<Context> {
|
||||
public:
|
||||
SigmoidCrossEntropyLossGradientOp(const OperatorDef& def, Workspace* ws)
|
||||
: Operator<Context>(def, ws),
|
||||
scale_(this->template GetSingleArgument<float>("scale", 1.)),
|
||||
normalize_(this->template GetSingleArgument<int>("normalize", 1)) {
|
||||
CAFFE_ENFORCE(scale_ >= 0);
|
||||
CAFFE_ENFORCE(normalize_ == 0 || normalize_ == 1);
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// No CPU implementation for now
|
||||
CAFFE_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
protected:
|
||||
float scale_;
|
||||
int normalize_;
|
||||
Tensor counts_{Context::GetDeviceType()};
|
||||
Tensor normalizer_;
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // SIGMOID_CROSS_ENTROPY_LOSS_OP_H_
|
@ -1,119 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "sigmoid_focal_loss_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
REGISTER_CPU_OPERATOR(SigmoidFocalLoss, SigmoidFocalLossOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(
|
||||
SigmoidFocalLossGradient,
|
||||
SigmoidFocalLossGradientOp<float, CPUContext>);
|
||||
|
||||
OPERATOR_SCHEMA(SigmoidFocalLoss)
|
||||
.NumInputs(3)
|
||||
.NumOutputs(1)
|
||||
.SetDoc(R"DOC(
|
||||
The binary form of Focal Loss designed for use in RetinaNet-like models.
|
||||
The input is assumed to be unnormalized scores (sometimes called 'logits')
|
||||
arranged in a 4D tensor with shape (N, C, H, W), where N is the number of
|
||||
elements in the batch, H and W are the height and width, and C = num_anchors *
|
||||
num_classes defines num_anchors 'groups' of logits, each of length
|
||||
num_classes. For the binary form of Focal Loss, num_classes does not include
|
||||
the background category. (So, for COCO, num_classes = 80, not 81.)
|
||||
|
||||
The binary form of focal loss is:
|
||||
|
||||
FL(p_t) = -alpha * (1 - p_t)**gamma * log(p_t),
|
||||
|
||||
where p = sigmoid(x), p_t = p or 1 - p depending on if the label is 1 or 0,
|
||||
respectively.
|
||||
|
||||
See: https://arxiv.org/abs/1708.02002 for details.
|
||||
)DOC")
|
||||
.Arg(
|
||||
"scale",
|
||||
"(float) default 1.0; multiply the loss by this scale factor.")
|
||||
.Arg(
|
||||
"alpha",
|
||||
"(float) default 0.25; Focal Loss's alpha hyper-parameter.")
|
||||
.Arg(
|
||||
"gamma",
|
||||
"(float) default 1.0; Focal Loss's gamma hyper-parameter.")
|
||||
.Arg(
|
||||
"num_classes",
|
||||
"(int) default 80; number of classes (excluding background).")
|
||||
.Input(
|
||||
0,
|
||||
"logits",
|
||||
"4D tensor of sigmoid inputs (called 'scores' or 'logits') with shape "
|
||||
"(N, C, H, W), where C = num_anchors * num_classes.")
|
||||
.Input(
|
||||
1,
|
||||
"labels",
|
||||
"4D tensor of labels with shape (N, num_anchors, H, W). Each entry is "
|
||||
"a class label in [0, num_classes - 1] (inclusive). The label "
|
||||
"identifies the one class that should have a sigmoid target of 1.")
|
||||
.Input(
|
||||
2,
|
||||
"normalizer",
|
||||
"Scalar; the loss is normalized by 1 / max(1, normalizer)."
|
||||
)
|
||||
.Output(
|
||||
0,
|
||||
"loss",
|
||||
"Scalar loss.");
|
||||
|
||||
OPERATOR_SCHEMA(SigmoidFocalLossGradient)
|
||||
.NumInputs(4)
|
||||
.NumOutputs(1)
|
||||
.Input(
|
||||
0,
|
||||
"logits",
|
||||
"See SigmoidFocalLoss.")
|
||||
.Input(
|
||||
1,
|
||||
"labels",
|
||||
"See SigmoidFocalLoss.")
|
||||
.Input(
|
||||
2,
|
||||
"normalizer",
|
||||
"See SigmoidFocalLoss.")
|
||||
.Input(
|
||||
3,
|
||||
"d_loss",
|
||||
"Gradient of forward output 0 (loss)")
|
||||
.Output(
|
||||
0,
|
||||
"d_logits",
|
||||
"Gradient of forward input 0 (logits)");
|
||||
|
||||
class GetSigmoidFocalLossGradient : public GradientMakerBase {
|
||||
using GradientMakerBase::GradientMakerBase;
|
||||
|
||||
vector<OperatorDef> GetGradientDefs() override {
|
||||
vector<string> blob_names{
|
||||
{I(0), I(1), I(2), GO(0)},
|
||||
};
|
||||
|
||||
return SingleGradientDef(
|
||||
"SigmoidFocalLossGradient", "", blob_names, vector<string>{GI(0)});
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_GRADIENT(SigmoidFocalLoss, GetSigmoidFocalLossGradient);
|
||||
|
||||
} // namespace caffe2
|
@ -1,185 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <cfloat>
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "modules/detectron/sigmoid_focal_loss_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
|
||||
__global__ void SigmoidFocalLossKernel(
|
||||
const int N, const int D, const int H, const int W, const float* logits,
|
||||
const int* targets, const float* weight_pos,
|
||||
const float gamma, const float alpha,
|
||||
const int num_classes, float* losses) {
|
||||
CUDA_1D_KERNEL_LOOP(i, N * D * H * W) {
|
||||
int x = i % W;
|
||||
int y = (i / W) % H;
|
||||
int c = (i / (W * H)) % D; // channel, here D is channel dim in input NxDxHxW
|
||||
int n = i / (W * H * D); // n in NxDxHxW
|
||||
|
||||
int A = D / num_classes; // num_anchors = A
|
||||
int a = c / num_classes; // current anchor out of A anchors in D = A * num_cls
|
||||
int d = c % num_classes; // current class
|
||||
int t = targets[n * (H * W * A) + a * (H * W) + y * W + x]; // target
|
||||
|
||||
// check whether the class is true class or not.
|
||||
// The target classes are in range 1 - 81 and the d is in range 0-80
|
||||
// because we predict A*80 dim, so for comparison purpose, compare t and (d+1)
|
||||
float c1 = (t == (d + 1));
|
||||
float c2 = (t != -1 & t != (d + 1));
|
||||
|
||||
float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
|
||||
float zn = (1.0 - alpha) / Np;
|
||||
float zp = alpha / Np;
|
||||
|
||||
// p = 1. / 1. + expf(-x)
|
||||
float p = 1. / (1. + expf(-logits[i]));
|
||||
|
||||
// (1 - p)**gamma * log(p) where
|
||||
float term1 = powf((1. - p), gamma) * logf(c10::cuda::compat::max(p, FLT_MIN));
|
||||
// p**gamma * log(1 - p)
|
||||
float term2 =
|
||||
powf(p, gamma) *
|
||||
(-1. * logits[i] * (logits[i] >= 0) -
|
||||
logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0))));
|
||||
|
||||
losses[i] = 0.0;
|
||||
losses[i] += -c1 * term1 * zp;
|
||||
losses[i] += -c2 * term2 * zn;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void SigmoidFocalLossGradientKernel(
|
||||
const int N, const int D, const int H, const int W, const float* logits,
|
||||
const int* targets, float* dX_data, const float* weight_pos,
|
||||
const float gamma, const float alpha, const int num_classes,
|
||||
const float* avg_loss) {
|
||||
CUDA_1D_KERNEL_LOOP(i, N * D * H * W) {
|
||||
float a_loss = avg_loss[0];
|
||||
int x = i % W;
|
||||
int y = (i / W) % H;
|
||||
int c = (i / (W * H)) % D;
|
||||
int n = i / (W * H * D);
|
||||
|
||||
int A = D / num_classes; // num_anchors
|
||||
int a = c / num_classes; // current anchor
|
||||
int d = c % num_classes; // current class
|
||||
|
||||
float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
|
||||
float zn = (1.0 - alpha) / Np;
|
||||
float zp = alpha / Np;
|
||||
int t = targets[n * (H * W * A) + a * (H * W) + y * W + x];
|
||||
|
||||
float c1 = (t == (d + 1));
|
||||
float c2 = (t != -1 & t != (d + 1));
|
||||
float p = 1. / (1. + expf(-logits[i]));
|
||||
|
||||
// (1-p)**g * (1 - p - g*p*log(p))
|
||||
float term1 =
|
||||
powf((1. - p), gamma) *
|
||||
(1. - p - (p * gamma * logf(c10::cuda::compat::max(p, FLT_MIN))));
|
||||
// (p**g) * (g*(1-p)*log(1-p) - p)
|
||||
float term2 =
|
||||
powf(p, gamma) *
|
||||
((-1. * logits[i] * (logits[i] >= 0) -
|
||||
logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) *
|
||||
(1. - p) * gamma - p);
|
||||
dX_data[i] = 0.0;
|
||||
dX_data[i] += -c1 * zp * term1;
|
||||
dX_data[i] += -c2 * zn * term2;
|
||||
dX_data[i] = dX_data[i] * a_loss;
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template<>
|
||||
bool SigmoidFocalLossOp<float, CUDAContext>::RunOnDevice() {
|
||||
// Input logits, for example: N x (A * 80) x H x W in cls-agnostic
|
||||
auto& X = Input(0);
|
||||
// Target, for example: N x A x H x W
|
||||
auto& T = Input(1);
|
||||
// Number of positive examples: scalar
|
||||
auto& wp = Input(2);
|
||||
// output avg Sigmoid focal loss as mentioned in RetinaNet paper
|
||||
|
||||
|
||||
int N = X.dim32(0);
|
||||
int D = X.dim32(1);
|
||||
int H = X.dim32(2);
|
||||
int W = X.dim32(3);
|
||||
|
||||
auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
|
||||
losses_.ResizeLike(X);
|
||||
float* avg_loss_data = avg_loss->mutable_data<float>();
|
||||
|
||||
SigmoidFocalLossKernel<<<CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
|
||||
N, D, H, W, X.data<float>(), T.data<int>(),
|
||||
wp.data<float>(), gamma_, alpha_, num_classes_,
|
||||
losses_.mutable_data<float>());
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
math::Sum<float, CUDAContext>(
|
||||
losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
|
||||
math::Scale<float, float, CUDAContext>(
|
||||
1, scale_, avg_loss_data, avg_loss_data, &context_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
bool SigmoidFocalLossGradientOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto& T = Input(1);
|
||||
auto& wp = Input(2);
|
||||
auto& d_avg_loss = Input(InputSize() - 1);
|
||||
|
||||
|
||||
// get input shape
|
||||
int N = X.dim32(0);
|
||||
int D = X.dim32(1);
|
||||
int H = X.dim32(2);
|
||||
int W = X.dim32(3);
|
||||
|
||||
auto* dX = Output(0, X.sizes(), at::dtype<float>());
|
||||
|
||||
SigmoidFocalLossGradientKernel<<<CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
|
||||
N, D, H, W, X.data<float>(), T.data<int>(), dX->mutable_data<float>(),
|
||||
wp.data<float>(), gamma_, alpha_, num_classes_,
|
||||
d_avg_loss.data<float>());
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
math::Scale<float, float, CUDAContext>(
|
||||
dX->size(),
|
||||
scale_,
|
||||
dX->data<float>(),
|
||||
dX->mutable_data<float>(),
|
||||
&context_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
REGISTER_CUDA_OPERATOR(SigmoidFocalLoss,
|
||||
SigmoidFocalLossOp<float, CUDAContext>);
|
||||
REGISTER_CUDA_OPERATOR(SigmoidFocalLossGradient,
|
||||
SigmoidFocalLossGradientOp<float, CUDAContext>);
|
||||
} // namespace caffe2
|
@ -1,83 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef SIGMOID_FOCAL_LOSS_OP_H_
|
||||
#define SIGMOID_FOCAL_LOSS_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename T, class Context>
|
||||
class SigmoidFocalLossOp final : public Operator<Context> {
|
||||
public:
|
||||
SigmoidFocalLossOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws),
|
||||
scale_(this->template GetSingleArgument<float>("scale", 1.)),
|
||||
num_classes_(this->template GetSingleArgument<int>("num_classes", 80)),
|
||||
gamma_(this->template GetSingleArgument<float>("gamma", 1.)),
|
||||
alpha_(this->template GetSingleArgument<float>("alpha", 0.25)) {
|
||||
CAFFE_ENFORCE(scale_ >= 0);
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// No CPU implementation for now
|
||||
CAFFE_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
protected:
|
||||
float scale_;
|
||||
int num_classes_;
|
||||
float gamma_;
|
||||
float alpha_;
|
||||
Tensor losses_{Context::GetDeviceType()};
|
||||
Tensor counts_{Context::GetDeviceType()};
|
||||
};
|
||||
|
||||
template <typename T, class Context>
|
||||
class SigmoidFocalLossGradientOp final : public Operator<Context> {
|
||||
public:
|
||||
SigmoidFocalLossGradientOp(const OperatorDef& def, Workspace* ws)
|
||||
: Operator<Context>(def, ws),
|
||||
scale_(this->template GetSingleArgument<float>("scale", 1.)),
|
||||
num_classes_(this->template GetSingleArgument<int>("num_classes", 80)),
|
||||
gamma_(this->template GetSingleArgument<float>("gamma", 1.)),
|
||||
alpha_(this->template GetSingleArgument<float>("alpha", 0.25)) {
|
||||
CAFFE_ENFORCE(scale_ >= 0);
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// No CPU implementation for now
|
||||
CAFFE_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
protected:
|
||||
float scale_;
|
||||
int num_classes_;
|
||||
float gamma_;
|
||||
float alpha_;
|
||||
Tensor counts_{Context::GetDeviceType()};
|
||||
Tensor weights_{Context::GetDeviceType()}; // unignored weights
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // SIGMOID_FOCAL_LOSS_OP_H_
|
@ -1,117 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "smooth_l1_loss_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
REGISTER_CPU_OPERATOR(SmoothL1Loss, SmoothL1LossOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(
|
||||
SmoothL1LossGradient,
|
||||
SmoothL1LossGradientOp<float, CPUContext>);
|
||||
|
||||
OPERATOR_SCHEMA(SmoothL1Loss)
|
||||
.NumInputs(4)
|
||||
.NumOutputs(1)
|
||||
.SetDoc(R"DOC(
|
||||
Smooth L1 Loss is a minor variation of Huber loss in which the point of
|
||||
transition between L2 loss and L1 loss is adjustable by a hyper-parameter beta:
|
||||
|
||||
SmoothL1(x) = 0.5 * x^2 / beta if |x| < beta
|
||||
|x| - 0.5 * beta otherwise.
|
||||
|
||||
SmoothL1 is used in Fast R-CNN and descendants as the loss function for bounding
|
||||
box regression.
|
||||
|
||||
The loss computed by this op has a flexible form:
|
||||
|
||||
scale / N * sum_i alpha_out[i] * SmoothL1(alpha_in[i] * (y_hat[i] - y[i])).
|
||||
|
||||
The weights alpha_in and alpha_out are called the "inside" and "outside"
|
||||
weights, respectively. The inside weights are typically set to either 0 or 1 to
|
||||
implement ignoring (when 0) certain samples. The outside weights can be used
|
||||
to implement a per-sample loss weight. The overall loss is scaled by scale / N,
|
||||
where N is the number of batch elements in the input predictions.
|
||||
)DOC")
|
||||
.Arg(
|
||||
"beta",
|
||||
"(float) default 1.0; L2 to L1 transition point.")
|
||||
.Arg(
|
||||
"scale",
|
||||
"(float) default 1.0; multiply the loss by this scale factor.")
|
||||
.Input(
|
||||
0,
|
||||
"Y_hat",
|
||||
"Tensor of predictions (at least 1D).")
|
||||
.Input(
|
||||
1,
|
||||
"Y",
|
||||
"Tensor of labels with the same shape as Y_hat.")
|
||||
.Input(
|
||||
2,
|
||||
"alpha_in",
|
||||
"Tensor of inside weights with the same shape as Y.")
|
||||
.Input(
|
||||
3,
|
||||
"alpha_out",
|
||||
"Tensor of outside weights with the same shape as Y.")
|
||||
.Output(
|
||||
0,
|
||||
"loss",
|
||||
"Scalar loss.");
|
||||
|
||||
OPERATOR_SCHEMA(SmoothL1LossGradient)
|
||||
.NumInputs(5)
|
||||
.NumOutputs(1)
|
||||
.Input(
|
||||
0,
|
||||
"Y_hat",
|
||||
"See SmoothL1Loss.")
|
||||
.Input(
|
||||
1,
|
||||
"Y",
|
||||
"See SmoothL1Loss.")
|
||||
.Input(
|
||||
2,
|
||||
"alpha_in",
|
||||
"See SmoothL1Loss.")
|
||||
.Input(
|
||||
3,
|
||||
"alpha_out",
|
||||
"See SmoothL1Loss.")
|
||||
.Input(
|
||||
4,
|
||||
"d_loss",
|
||||
"Gradient of forward output 0 (loss).")
|
||||
.Output(
|
||||
0,
|
||||
"d_Y_hat",
|
||||
"Gradient of forward input 0 (Y_hat).");
|
||||
|
||||
class GetSmoothL1LossGradient : public GradientMakerBase {
|
||||
using GradientMakerBase::GradientMakerBase;
|
||||
vector<OperatorDef> GetGradientDefs() override {
|
||||
return SingleGradientDef(
|
||||
"SmoothL1LossGradient",
|
||||
"",
|
||||
vector<string>{I(0), I(1), I(2), I(3), GO(0)},
|
||||
vector<string>{GI(0)});
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_GRADIENT(SmoothL1Loss, GetSmoothL1LossGradient);
|
||||
|
||||
} // namespace caffe2
|
@ -1,185 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "modules/detectron/smooth_l1_loss_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
template <typename T>
|
||||
__global__ void SmoothL1Kernel(
|
||||
const int n, const T* in, T* out, T beta) {
|
||||
// f(x) = 0.5 * x^2 / beta if |x| < beta
|
||||
// |x| - 0.5 * beta otherwise
|
||||
CUDA_1D_KERNEL_LOOP(index, n) {
|
||||
T val = in[index];
|
||||
T abs_val = c10::cuda::compat::abs(val);
|
||||
if (abs_val < beta) {
|
||||
out[index] = 0.5 * val * val / beta;
|
||||
} else {
|
||||
out[index] = abs_val - 0.5 * beta;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void SmoothL1GradientKernel(
|
||||
const int n,
|
||||
const T* in,
|
||||
T* out,
|
||||
const T* d_loss_data,
|
||||
T norm,
|
||||
T beta) {
|
||||
// f'(x) = x / beta if |x| < beta
|
||||
// = sign(x) otherwise
|
||||
// We also scale by norm * d_loss in this kernel for convenience
|
||||
CUDA_1D_KERNEL_LOOP(index, n) {
|
||||
T val = in[index];
|
||||
T abs_val = c10::cuda::compat::abs(val);
|
||||
T d_loss = *d_loss_data;
|
||||
if (abs_val < beta) {
|
||||
out[index] = norm * d_loss * val / beta;
|
||||
} else {
|
||||
out[index] = norm * d_loss * ((T(0) < val) - (val < T(0)));
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template<>
|
||||
bool SmoothL1LossOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& Y_hat = Input(0);
|
||||
auto& Y = Input(1);
|
||||
auto& alpha_in = Input(2);
|
||||
auto& alpha_out = Input(3);
|
||||
|
||||
|
||||
int N = Y.dim32(0);
|
||||
// Require the same number of elements along axis 0 (batch size), but
|
||||
// otherwise don't care about the shape (just the number of elements)
|
||||
CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0),
|
||||
"Y_hat and Y must have the same number of elements along axis 0");
|
||||
CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(),
|
||||
"Y_hat and Y must have the same number of elements");
|
||||
CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size());
|
||||
CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size());
|
||||
|
||||
auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
|
||||
buff_.ResizeLike(Y);
|
||||
|
||||
// Difference
|
||||
// d := y_hat - y
|
||||
math::Sub<float, CUDAContext>(
|
||||
Y.size(), Y_hat.data<float>(), Y.data<float>(),
|
||||
buff_.mutable_data<float>(), &context_);
|
||||
// Element-wise weighted difference (can be used to ignore or reweight
|
||||
// specific components)
|
||||
// d := alpha_in * (y_hat - y)
|
||||
math::Mul<float, CUDAContext>(
|
||||
buff_.size(), buff_.data<float>(), alpha_in.data<float>(),
|
||||
buff_.mutable_data<float>(), &context_);
|
||||
|
||||
// Element-wise smooth l1 loss
|
||||
// l := SmoothL1(alpha_in * (y_hat - y))
|
||||
SmoothL1Kernel<float>
|
||||
<<<CAFFE_GET_BLOCKS(buff_.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
buff_.size(), buff_.data<float>(), buff_.mutable_data<float>(),
|
||||
beta_);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
// Element-wise weighted smooth l1 loss (can be used to specify a per-element
|
||||
// loss weight)
|
||||
// l := alpha_out * SmoothL1(alpha_in * (y_hat - y))
|
||||
math::Mul<float, CUDAContext>(
|
||||
buff_.size(), buff_.data<float>(), alpha_out.data<float>(),
|
||||
buff_.mutable_data<float>(), &context_);
|
||||
// Sum of all losses
|
||||
// al := sum_i l_i
|
||||
float* avg_loss_data = avg_loss->mutable_data<float>();
|
||||
math::Sum<float, CUDAContext>(
|
||||
buff_.size(), buff_.data<float>(), avg_loss_data, &context_);
|
||||
// Average of input batch size
|
||||
// al := 1/N * al
|
||||
math::Scale<float, float, CUDAContext>(
|
||||
1, scale_ / N, avg_loss_data, avg_loss_data, &context_);
|
||||
return true;
|
||||
}
|
||||
|
||||
template<>
|
||||
bool SmoothL1LossGradientOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& Y_hat = Input(0);
|
||||
auto& Y = Input(1);
|
||||
auto& alpha_in = Input(2);
|
||||
auto& alpha_out = Input(3);
|
||||
auto& d_avg_loss = Input(4); // gradient of net w.r.t. avg_loss ("gradOutput")
|
||||
// We intentially don't compute gradients for Y, alpha_{in,out} since they
|
||||
// are not needed (can change in the future if desired)
|
||||
|
||||
int N = Y.dim32(0);
|
||||
// Require the same number of elements along axis 0 (batch size), but
|
||||
// otherwise don't care about the shape (just the number of elements)
|
||||
CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0),
|
||||
"Y_hat and Y must have the same number of elements along axis 0");
|
||||
CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(),
|
||||
"Y_hat and Y must have the same number of elements");
|
||||
CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size());
|
||||
CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size());
|
||||
CAFFE_ENFORCE_EQ(d_avg_loss.size(), 1);
|
||||
|
||||
auto* d_Y_hat = Output(0, Y_hat.sizes(), at::dtype<float>()); // gradient of net w.r.t. Y_hat ("gradInput")
|
||||
buff_.ResizeLike(Y);
|
||||
|
||||
// Difference
|
||||
// d := y_hat - y
|
||||
math::Sub<float, CUDAContext>(
|
||||
Y.size(), Y_hat.data<float>(), Y.data<float>(),
|
||||
buff_.mutable_data<float>(), &context_);
|
||||
// Element-wise weighted difference (can be used to ignore or reweight
|
||||
// specific components)
|
||||
// d := alpha_in * (y_hat - y)
|
||||
math::Mul<float, CUDAContext>(
|
||||
buff_.size(), buff_.data<float>(), alpha_in.data<float>(),
|
||||
buff_.mutable_data<float>(), &context_);
|
||||
// d_Y_hat := d_avg_loss / N * SmoothL1'(alpha_in * (y_hat - y))
|
||||
SmoothL1GradientKernel<float>
|
||||
<<<CAFFE_GET_BLOCKS(buff_.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
buff_.size(), buff_.data<float>(), d_Y_hat->mutable_data<float>(),
|
||||
d_avg_loss.data<float>(), scale_ / N, beta_);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
// Element-wise scale by alpha_in and alpha_out
|
||||
math::Mul<float, CUDAContext>(
|
||||
d_Y_hat->size(), d_Y_hat->data<float>(), alpha_in.data<float>(),
|
||||
d_Y_hat->mutable_data<float>(), &context_);
|
||||
math::Mul<float, CUDAContext>(
|
||||
d_Y_hat->size(), d_Y_hat->data<float>(), alpha_out.data<float>(),
|
||||
d_Y_hat->mutable_data<float>(), &context_);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
REGISTER_CUDA_OPERATOR(SmoothL1Loss,
|
||||
SmoothL1LossOp<float, CUDAContext>);
|
||||
REGISTER_CUDA_OPERATOR(SmoothL1LossGradient,
|
||||
SmoothL1LossGradientOp<float, CUDAContext>);
|
||||
} // namespace caffe2
|
@ -1,75 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef SMOOTH_L1_LOSS_OP_H_
|
||||
#define SMOOTH_L1_LOSS_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename T, class Context>
|
||||
class SmoothL1LossOp final : public Operator<Context> {
|
||||
public:
|
||||
SmoothL1LossOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws),
|
||||
beta_(this->template GetSingleArgument<float>("beta", 1.)),
|
||||
scale_(this->template GetSingleArgument<float>("scale", 1.)) {
|
||||
CAFFE_ENFORCE(beta_ > 0);
|
||||
CAFFE_ENFORCE(scale_ >= 0);
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// No CPU implementation for now
|
||||
CAFFE_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
protected:
|
||||
float beta_; // Transition point from L1 to L2 loss
|
||||
float scale_; // Scale the loss by scale_
|
||||
Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
|
||||
};
|
||||
|
||||
template <typename T, class Context>
|
||||
class SmoothL1LossGradientOp final : public Operator<Context> {
|
||||
public:
|
||||
SmoothL1LossGradientOp(const OperatorDef& def, Workspace* ws)
|
||||
: Operator<Context>(def, ws),
|
||||
beta_(this->template GetSingleArgument<float>("beta", 1.)),
|
||||
scale_(this->template GetSingleArgument<float>("scale", 1.)) {
|
||||
CAFFE_ENFORCE(beta_ > 0);
|
||||
CAFFE_ENFORCE(scale_ >= 0);
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// No CPU implementation for now
|
||||
CAFFE_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
protected:
|
||||
float beta_; // Transition point from L1 to L2 loss
|
||||
float scale_; // Scale the loss by scale_
|
||||
Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // SMOOTH_L1_LOSS_OP_H_
|
@ -1,104 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "modules/detectron/softmax_focal_loss_op.h"
|
||||
|
||||
#include "caffe2/operators/softmax_utils.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
REGISTER_CPU_OPERATOR(SoftmaxFocalLoss, SoftmaxFocalLossOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(
|
||||
SoftmaxFocalLossGradient,
|
||||
SoftmaxFocalLossGradientOp<float, CPUContext>);
|
||||
|
||||
OPERATOR_SCHEMA(SoftmaxFocalLoss)
|
||||
.NumInputs(3)
|
||||
.NumOutputs(2)
|
||||
.SetDoc(R"DOC(
|
||||
A multiclass form of Focal Loss designed for use in RetinaNet-like models.
|
||||
The input is assumed to be unnormalized scores (sometimes called 'logits')
|
||||
arranged in a 4D tensor with shape (N, C, H, W), where N is the number of
|
||||
elements in the batch, H and W are the height and width, and C = num_anchors *
|
||||
num_classes. The softmax is applied num_anchors times along the C axis.
|
||||
|
||||
The softmax version of focal loss is:
|
||||
|
||||
FL(p_t) = -alpha * (1 - p_t)**gamma * log(p_t),
|
||||
|
||||
where p_i = exp(s_i) / sum_j exp(s_j), t is the target (ground truth) class, and
|
||||
s_j is the unnormalized score for class j.
|
||||
|
||||
See: https://arxiv.org/abs/1708.02002 for details.
|
||||
)DOC")
|
||||
.Arg(
|
||||
"scale",
|
||||
"(float) default 1.0; multiply the loss by this scale factor.")
|
||||
.Arg("alpha", "(float) default 0.25; Focal Loss's alpha hyper-parameter.")
|
||||
.Arg("gamma", "(float) default 1.0; Focal Loss's gamma hyper-parameter.")
|
||||
.Arg(
|
||||
"num_classes",
|
||||
"(int) default 81; number of classes in each softmax group.")
|
||||
.Input(
|
||||
0,
|
||||
"scores",
|
||||
"4D tensor of softmax inputs (called 'scores' or 'logits') with shape "
|
||||
"(N, C, H, W), where C = num_anchors * num_classes defines num_anchors "
|
||||
"groups of contiguous num_classes softmax inputs.")
|
||||
.Input(
|
||||
1,
|
||||
"labels",
|
||||
"4D tensor of labels with shape (N, num_anchors, H, W). Each entry is "
|
||||
"a class label in [0, num_classes - 1] (inclusive).")
|
||||
.Input(
|
||||
2,
|
||||
"normalizer",
|
||||
"Scalar; the loss is normalized by 1 / max(1, normalizer).")
|
||||
.Output(0, "loss", "Scalar loss.")
|
||||
.Output(
|
||||
1,
|
||||
"probabilities",
|
||||
"4D tensor of softmax probabilities with shape (N, C, H, W), where "
|
||||
"C = num_anchors * num_classes, and softmax was applied to each of the "
|
||||
"num_anchors groups; within a group the num_classes values sum to 1.");
|
||||
|
||||
OPERATOR_SCHEMA(SoftmaxFocalLossGradient)
|
||||
.NumInputs(5)
|
||||
.NumOutputs(1)
|
||||
.Input(0, "scores", "See SoftmaxFocalLoss.")
|
||||
.Input(1, "labels", "See SoftmaxFocalLoss.")
|
||||
.Input(2, "normalizer", "See SoftmaxFocalLoss.")
|
||||
.Input(
|
||||
3,
|
||||
"probabilities",
|
||||
"Output 1 from SoftmaxFocalLoss; See SoftmaxFocalLoss.")
|
||||
.Input(4, "d_loss", "Gradient of forward output 0 (loss)")
|
||||
.Output(0, "d_scores", "Gradient of forward input 0 (scores)");
|
||||
|
||||
class GetSoftmaxFocalLossGradient : public GradientMakerBase {
|
||||
using GradientMakerBase::GradientMakerBase;
|
||||
vector<OperatorDef> GetGradientDefs() override {
|
||||
return SingleGradientDef(
|
||||
"SoftmaxFocalLossGradient",
|
||||
"",
|
||||
vector<string>{I(0), I(1), I(2), O(1), GO(0)},
|
||||
vector<string>{GI(0)});
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_GRADIENT(SoftmaxFocalLoss, GetSoftmaxFocalLossGradient);
|
||||
|
||||
} // namespace caffe2
|
@ -1,256 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <cfloat>
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "modules/detectron/softmax_focal_loss_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
|
||||
__global__ void SpatialSoftmaxKernel(const int N, const int A,
|
||||
const int H, const int W, const float* Xdata, float* Pdata,
|
||||
const int num_classes) {
|
||||
CUDA_1D_KERNEL_LOOP(index, N * A * H * W) {
|
||||
int D = num_classes * A;
|
||||
int x = index % W;
|
||||
int y = (index / W) % H;
|
||||
int a = (index / (W * H)) % A;
|
||||
int i = index / W / H / A;
|
||||
|
||||
// Subtract max on each cell for numerical reasons
|
||||
float max_val = -FLT_MAX;
|
||||
for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
|
||||
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
||||
max_val = max(max_val, Xdata[idx]);
|
||||
}
|
||||
// Exponentiate
|
||||
float expsum = 0.0f;
|
||||
for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
|
||||
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
||||
float expx = exp(Xdata[idx] - max_val);
|
||||
Pdata[idx] = expx;
|
||||
expsum += expx;
|
||||
}
|
||||
// Normalize
|
||||
for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
|
||||
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
||||
Pdata[idx] /= expsum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__global__ void SoftmaxFocalLossKernel(
|
||||
const int N, const int A, const int H, const int W,
|
||||
const float* Pdata, const int* targets, float* losses,
|
||||
const float* weight_pos, const float gamma, const float alpha,
|
||||
const int num_classes) {
|
||||
CUDA_1D_KERNEL_LOOP(i, N * A * H * W) {
|
||||
int D = A * num_classes;
|
||||
int x = i % W;
|
||||
int y = (i / W) % H;
|
||||
int a = (i / (W * H)) % A;
|
||||
int n = i / (W * H * A);
|
||||
const int label = static_cast<int>(targets[i]);
|
||||
|
||||
float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
|
||||
float z = (label == 0) * (1 - alpha) / Np +
|
||||
(label >= 1) * alpha / Np;
|
||||
|
||||
losses[i] = 0.0;
|
||||
if (label >= 0) {
|
||||
int offset = a * num_classes;
|
||||
int idx = n * (H * W * D) + (offset + label) * (H * W) + y * W + x;
|
||||
losses[i] =
|
||||
-(pow(1.0f - Pdata[idx], gamma) *
|
||||
log(c10::cuda::compat::max(Pdata[idx], FLT_MIN))) * z;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__global__ void SoftmaxFocalLossGradientWeightKernel(
|
||||
const int N, const int A, const int H, const int W,
|
||||
const float* Pdata, const int* targets, float* buff,
|
||||
const float* weight_pos, const float gamma, const float alpha,
|
||||
const int num_classes) {
|
||||
CUDA_1D_KERNEL_LOOP(i, N * A * H * W) {
|
||||
int D = A * num_classes;
|
||||
int x = i % W;
|
||||
int y = (i / W) % H;
|
||||
int a = (i / (W * H)) % A;
|
||||
int n = i / (W * H * A);
|
||||
const int label = static_cast<int>(targets[i]);
|
||||
float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
|
||||
float z = (label == 0) * (1 - alpha) / Np +
|
||||
(label >= 1) * alpha / Np;
|
||||
|
||||
buff[i] = 0.0;
|
||||
if (label >= 0) {
|
||||
int offset = a * num_classes;
|
||||
int idx = n * (H * W * D) + (offset + label) * (H * W) + y * W + x;
|
||||
float onemp = 1. - Pdata[idx];
|
||||
float p = Pdata[idx];
|
||||
buff[i] =
|
||||
(-pow(onemp, gamma) +
|
||||
gamma * pow(onemp, gamma - 1) * p * log(c10::cuda::compat::max(p, FLT_MIN))) * z;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__global__ void SoftmaxFocalLossGradientKernel(
|
||||
const int N, const int D, const int H, const int W,
|
||||
const float* Pdata, const int* targets, const float* buff,
|
||||
const float* d_loss_data, float* dX, const int num_classes) {
|
||||
CUDA_1D_KERNEL_LOOP(i, N * D * H * W) {
|
||||
int A = D / num_classes;
|
||||
int x = i % W;
|
||||
int y = (i / W) % H;
|
||||
int d = (i / (W * H)) % D;
|
||||
int a = d / num_classes;
|
||||
int c = d % num_classes;
|
||||
int n = i / (W * H * D);
|
||||
float d_loss = *d_loss_data;
|
||||
|
||||
int ind = n * (H * W * A) + a * (H * W) + y * W + x;
|
||||
const int label = static_cast<int>(targets[ind]);
|
||||
|
||||
float c1 = (label >= 0) * 1.0;
|
||||
float c2 = (label == c) * 1.0;
|
||||
dX[i] = 0.0;
|
||||
dX[i] = c1 * d_loss * buff[ind] * (c2 - Pdata[i]);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
template <>
|
||||
bool SoftmaxFocalLossOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0); // Logits
|
||||
auto& T = Input(1); // Labels
|
||||
auto& wp = Input(2); // num of foreground
|
||||
// average loss as output
|
||||
// softmax probability, going to be re-used in gradient
|
||||
|
||||
int N = X.dim32(0);
|
||||
int D = X.dim32(1);
|
||||
int H = X.dim32(2);
|
||||
int W = X.dim32(3);
|
||||
int A = D / num_classes_;
|
||||
|
||||
ReinitializeTensor(&losses_, {N * A * H * W}, at::dtype<float>().device(CUDA));
|
||||
auto* P = Output(1, {N * D * H * W}, at::dtype<float>());
|
||||
auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
|
||||
math::Set<float, CUDAContext>(
|
||||
avg_loss->size(), 0.f, avg_loss->mutable_data<float>(), &context_);
|
||||
math::Set<float, CUDAContext>(
|
||||
P->size(), 0.f, P->mutable_data<float>(), &context_);
|
||||
math::Set<float, CUDAContext>(
|
||||
losses_.size(), 0.f, losses_.mutable_data<float>(), &context_);
|
||||
TORCH_DCHECK_EQ(X.ndim(), 4);
|
||||
|
||||
const float* Xdata = X.data<float>();
|
||||
const float* Wdata = wp.data<float>();
|
||||
|
||||
|
||||
// Spatial Softmax Kernel
|
||||
SpatialSoftmaxKernel
|
||||
<<<CAFFE_GET_BLOCKS(N * A * H * W), CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
N, A, H, W, Xdata, P->mutable_data<float>(), num_classes_);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
// Compute loss for each x,y location
|
||||
const int* Tdata = T.data<int>();
|
||||
SoftmaxFocalLossKernel
|
||||
<<<CAFFE_GET_BLOCKS(N * A * H * W), CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
N, A, H, W, P->data<float>(), Tdata, losses_.mutable_data<float>(),
|
||||
Wdata, gamma_, alpha_, num_classes_);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
// sum the losses
|
||||
float* avg_loss_data = avg_loss->mutable_data<float>();
|
||||
math::Sum<float, CUDAContext>(
|
||||
losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
|
||||
math::Scale<float, float, CUDAContext>(
|
||||
1, scale_, avg_loss_data, avg_loss_data, &context_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
bool SoftmaxFocalLossGradientOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0); // Logits
|
||||
auto& T = Input(1); // Label
|
||||
auto& wp = Input(2); // num of foreground example
|
||||
auto& P = Input(3); // Softmax Probability
|
||||
auto& d_avg_loss = Input(4);
|
||||
|
||||
|
||||
int N = X.dim32(0);
|
||||
int D = X.dim32(1);
|
||||
int H = X.dim32(2);
|
||||
int W = X.dim32(3);
|
||||
int A = D / num_classes_;
|
||||
|
||||
ReinitializeTensor(&buff_, {N * A * H * W}, at::dtype<float>().device(CUDA));
|
||||
|
||||
auto* dX = Output(0, X.sizes(), at::dtype<float>()); // gradient wrt logits
|
||||
|
||||
const float* Xdata = X.data<float>();
|
||||
const int* Tdata = T.data<int>();
|
||||
const float* Pdata = P.data<float>();
|
||||
const float* Wdata = wp.data<float>();
|
||||
|
||||
|
||||
// Compute the weight for gradients
|
||||
SoftmaxFocalLossGradientWeightKernel
|
||||
<<<CAFFE_GET_BLOCKS(N * A * H * W), CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
N, A, H, W, Pdata, Tdata, buff_.mutable_data<float>(),
|
||||
Wdata, gamma_, alpha_, num_classes_);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
// Compute the gradient with the weights
|
||||
const float* Bdata = buff_.data<float>();
|
||||
SoftmaxFocalLossGradientKernel
|
||||
<<<CAFFE_GET_BLOCKS(N * D * H * W), CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
N, D, H, W, Pdata, Tdata, Bdata, d_avg_loss.data<float>(),
|
||||
dX->mutable_data<float>(), num_classes_);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
math::Scale<float, float, CUDAContext>(
|
||||
dX->size(),
|
||||
scale_,
|
||||
dX->data<float>(),
|
||||
dX->mutable_data<float>(),
|
||||
&context_);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
REGISTER_CUDA_OPERATOR(SoftmaxFocalLoss,
|
||||
SoftmaxFocalLossOp<float, CUDAContext>);
|
||||
REGISTER_CUDA_OPERATOR(SoftmaxFocalLossGradient,
|
||||
SoftmaxFocalLossGradientOp<float, CUDAContext>);
|
||||
} // namespace caffe2
|
@ -1,91 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef SOFTMAX_FOCAL_LOSS_OP_H_
|
||||
#define SOFTMAX_FOCAL_LOSS_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename T, class Context>
|
||||
class SoftmaxFocalLossOp final : public Operator<Context> {
|
||||
public:
|
||||
SoftmaxFocalLossOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws),
|
||||
scale_(this->template GetSingleArgument<float>("scale", 1.)),
|
||||
gamma_(this->template GetSingleArgument<float>("gamma", 1.)),
|
||||
alpha_(this->template GetSingleArgument<float>("alpha", 0.25)),
|
||||
num_classes_(this->template GetSingleArgument<int>("num_classes", 81)),
|
||||
order_(StringToStorageOrder(
|
||||
this->template GetSingleArgument<string>("order", "NCHW"))) {
|
||||
CAFFE_ENFORCE(scale_ >= 0);
|
||||
CAFFE_ENFORCE_EQ(
|
||||
order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// No CPU implementation for now
|
||||
CAFFE_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
protected:
|
||||
float scale_;
|
||||
float gamma_;
|
||||
float alpha_;
|
||||
int num_classes_;
|
||||
StorageOrder order_;
|
||||
Tensor losses_;
|
||||
};
|
||||
|
||||
template <typename T, class Context>
|
||||
class SoftmaxFocalLossGradientOp final : public Operator<Context> {
|
||||
public:
|
||||
SoftmaxFocalLossGradientOp(const OperatorDef& def, Workspace* ws)
|
||||
: Operator<Context>(def, ws),
|
||||
scale_(this->template GetSingleArgument<float>("scale", 1.)),
|
||||
gamma_(this->template GetSingleArgument<float>("gamma", 1.)),
|
||||
alpha_(this->template GetSingleArgument<float>("alpha", 0.25)),
|
||||
num_classes_(this->template GetSingleArgument<int>("num_classes", 81)),
|
||||
order_(StringToStorageOrder(
|
||||
this->template GetSingleArgument<string>("order", "NCHW"))) {
|
||||
CAFFE_ENFORCE(scale_ >= 0);
|
||||
CAFFE_ENFORCE_EQ(
|
||||
order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// No CPU implementation for now
|
||||
CAFFE_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
protected:
|
||||
float scale_;
|
||||
float gamma_;
|
||||
float alpha_;
|
||||
int num_classes_;
|
||||
StorageOrder order_;
|
||||
Tensor buff_;
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // SOFTMAX_FOCAL_LOSS_OP_H_
|
@ -1,79 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "spatial_narrow_as_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
REGISTER_CPU_OPERATOR(SpatialNarrowAs, SpatialNarrowAsOp<CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(
|
||||
SpatialNarrowAsGradient,
|
||||
SpatialNarrowAsGradientOp<CPUContext>);
|
||||
|
||||
OPERATOR_SCHEMA(SpatialNarrowAs)
|
||||
.NumInputs(2)
|
||||
.NumOutputs(1)
|
||||
.SetDoc(R"DOC(
|
||||
Reduces ("narrows") the spatial extent of A to that of B by removing rows and
|
||||
columns from the bottom and right.
|
||||
)DOC")
|
||||
.Input(
|
||||
0,
|
||||
"A",
|
||||
"3D or 4D input of shape (N, H0, W0) or (N, C, H0, W0).")
|
||||
.Input(
|
||||
1,
|
||||
"B",
|
||||
"3D or 4D input of shape (N, H1, W1) or (N, C, H1, W1), where H1 <= H0 "
|
||||
"and W1 <= W0.")
|
||||
.Output(
|
||||
0,
|
||||
"C",
|
||||
"Sub window of A containing rows [0, H1 - 1] (inclusive) and columns "
|
||||
"[0, W1 - 1] (inclusive).");
|
||||
|
||||
OPERATOR_SCHEMA(SpatialNarrowAsGradient)
|
||||
.NumInputs(3)
|
||||
.NumOutputs(1)
|
||||
.Input(
|
||||
0,
|
||||
"A",
|
||||
"See SpatialNarrowAs.")
|
||||
.Input(
|
||||
1,
|
||||
"B",
|
||||
"See SpatialNarrowAs.")
|
||||
.Input(
|
||||
2,
|
||||
"dC",
|
||||
"Gradient of forward output 0 (C).")
|
||||
.Output(
|
||||
0,
|
||||
"dA",
|
||||
"Gradient of forward input 0 (A)");
|
||||
|
||||
class SpatialNarrowAsGradient : public GradientMakerBase {
|
||||
using GradientMakerBase::GradientMakerBase;
|
||||
vector<OperatorDef> GetGradientDefs() override {
|
||||
return SingleGradientDef(
|
||||
"SpatialNarrowAsGradient", "",
|
||||
vector<string>{I(0), I(1), GO(0)},
|
||||
vector<string>{GI(0)});
|
||||
}
|
||||
};
|
||||
REGISTER_GRADIENT(SpatialNarrowAs, SpatialNarrowAsGradient);
|
||||
|
||||
} // namespace caffe2
|
@ -1,165 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "modules/detectron/spatial_narrow_as_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
template <typename T>
|
||||
__global__ void CopyKernel(
|
||||
const int N,
|
||||
const int C,
|
||||
const int in_H,
|
||||
const int in_W,
|
||||
const int out_H,
|
||||
const int out_W,
|
||||
const T* in_data,
|
||||
T* out_data) {
|
||||
CUDA_1D_KERNEL_LOOP(index, N * C * out_H * out_W) {
|
||||
int w = index % out_W;
|
||||
int h = (index / out_W) % out_H;
|
||||
int c = (index / out_W / out_H) % C;
|
||||
int n = (index / out_W / out_H / C);
|
||||
int in_index = n * C * in_H * in_W + c * in_H * in_W + h * in_W + w;
|
||||
int out_index = n * C * out_H * out_W + c * out_H * out_W + h * out_W + w;
|
||||
out_data[out_index] = in_data[in_index];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void CopyGradientKernel(
|
||||
const int N,
|
||||
const int C,
|
||||
const int in_H,
|
||||
const int in_W,
|
||||
const int out_H,
|
||||
const int out_W,
|
||||
const T* in_data,
|
||||
T* out_data) {
|
||||
CUDA_1D_KERNEL_LOOP(index, N * C * in_H * in_W) {
|
||||
int w = index % in_W;
|
||||
int h = (index / in_W) % in_H;
|
||||
int c = (index / in_W / in_H) % C;
|
||||
int n = (index / in_W / in_H / C);
|
||||
int in_index = n * C * in_H * in_W + c * in_H * in_W + h * in_W + w;
|
||||
int out_index = n * C * out_H * out_W + c * out_H * out_W + h * out_W + w;
|
||||
out_data[out_index] = in_data[in_index];
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
|
||||
template <>
|
||||
bool SpatialNarrowAsOp<CUDAContext>::RunOnDevice() {
|
||||
return DispatchHelper<TensorTypes<float_t, int32_t>>::call(this, Input(0));
|
||||
}
|
||||
|
||||
template <>
|
||||
template <typename T>
|
||||
bool SpatialNarrowAsOp<CUDAContext>::DoRunWithType() {
|
||||
// Narrows input 0 (A) spatially to match input 1 (B)
|
||||
auto& A = Input(0);
|
||||
auto& B = Input(1);
|
||||
|
||||
|
||||
CAFFE_ENFORCE_EQ(A.dim32(0), B.dim32(0), "Input dim 0 must be equal.");
|
||||
std::vector<int64_t> sizes;
|
||||
if (A.ndim() == B.ndim()) {
|
||||
CAFFE_ENFORCE_EQ(A.dim32(1), B.dim32(1), "Input dim 1 must be equal.");
|
||||
CAFFE_ENFORCE_GE(
|
||||
A.dim32(2), B.dim32(2), "Input 0 height must be >= input 1 height.");
|
||||
CAFFE_ENFORCE_GE(
|
||||
A.dim32(3), B.dim32(3), "Input 0 width must be >= input 1 width.");
|
||||
sizes = B.sizes().vec();
|
||||
} else {
|
||||
// For (N, H, W) case
|
||||
CAFFE_ENFORCE_EQ(A.ndim() - 1, B.ndim(), "Dimension mismatch.");
|
||||
CAFFE_ENFORCE_GE(
|
||||
A.dim32(2), B.dim32(1), "Input 0 height must be >= input 1 height.");
|
||||
CAFFE_ENFORCE_GE(
|
||||
A.dim32(3), B.dim32(2), "Input 0 width must be >= input 1 width.");
|
||||
sizes = {A.dim32(0), A.dim32(1), B.dim32(1), B.dim32(2)};
|
||||
}
|
||||
auto* C = Output(0, sizes, at::dtype<T>());
|
||||
int out_width = C->dim32(3);
|
||||
int out_height = C->dim32(2);
|
||||
int in_width = A.dim32(3);
|
||||
int in_height = A.dim32(2);
|
||||
|
||||
CopyKernel<T><<<
|
||||
CAFFE_GET_BLOCKS(C->size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
C->dim32(0),
|
||||
C->dim32(1),
|
||||
in_height,
|
||||
in_width,
|
||||
out_height,
|
||||
out_width,
|
||||
A.template data<T>(),
|
||||
C->template mutable_data<T>());
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool SpatialNarrowAsGradientOp<CUDAContext>::RunOnDevice() {
|
||||
return DispatchHelper<TensorTypes<float_t, int32_t>>::call(this, Input(0));
|
||||
}
|
||||
|
||||
template <>
|
||||
template <typename T>
|
||||
bool SpatialNarrowAsGradientOp<CUDAContext>::DoRunWithType() {
|
||||
auto& A = Input(0);
|
||||
auto& B = Input(1);
|
||||
auto& dC = Input(2); // Gradient of net w.r.t. output of forward op
|
||||
auto* dA = Output(0, A.sizes(), at::dtype<T>()); // Gradient of net w.r.t. input to forward op
|
||||
|
||||
math::Set<T, CUDAContext>(
|
||||
dA->size(), 0.f, dA->template mutable_data<T>(), &context_);
|
||||
int out_width = dA->dim32(3);
|
||||
int out_height = dA->dim32(2);
|
||||
int in_width = dC.dim32(3);
|
||||
int in_height = dC.dim32(2);
|
||||
|
||||
CopyGradientKernel<T><<<
|
||||
CAFFE_GET_BLOCKS(dC.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
dA->dim32(0),
|
||||
dA->dim32(1),
|
||||
in_height,
|
||||
in_width,
|
||||
out_height,
|
||||
out_width,
|
||||
dC.template data<T>(),
|
||||
dA->template mutable_data<T>());
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
REGISTER_CUDA_OPERATOR(SpatialNarrowAs, SpatialNarrowAsOp<CUDAContext>);
|
||||
REGISTER_CUDA_OPERATOR(
|
||||
SpatialNarrowAsGradient,
|
||||
SpatialNarrowAsGradientOp<CUDAContext>);
|
||||
} // namespace caffe2
|
@ -1,63 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef SPATIAL_NARROW_AS_OP_H_
|
||||
#define SPATIAL_NARROW_AS_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <class Context>
|
||||
class SpatialNarrowAsOp final : public Operator<Context> {
|
||||
public:
|
||||
SpatialNarrowAsOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws) {}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
USE_DISPATCH_HELPER;
|
||||
bool RunOnDevice() override {
|
||||
// No CPU implementation for now
|
||||
CAFFE_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
protected:
|
||||
template <typename T>
|
||||
bool DoRunWithType();
|
||||
};
|
||||
|
||||
template <class Context>
|
||||
class SpatialNarrowAsGradientOp final : public Operator<Context> {
|
||||
public:
|
||||
SpatialNarrowAsGradientOp(const OperatorDef& def, Workspace* ws)
|
||||
: Operator<Context>(def, ws) {}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
USE_DISPATCH_HELPER;
|
||||
bool RunOnDevice() override {
|
||||
// No CPU implementation for now
|
||||
CAFFE_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
protected:
|
||||
template <typename T>
|
||||
bool DoRunWithType();
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // SPATIAL_NARROW_AS_OP_H_
|
@ -1,83 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "upsample_nearest_op.h"
|
||||
#ifdef USE_MKLDNN
|
||||
#include "caffe2/ideep/operators/operator_fallback_ideep.h"
|
||||
#include "caffe2/ideep/utils/ideep_operator.h"
|
||||
#endif
|
||||
|
||||
namespace caffe2 {
|
||||
#ifdef USE_MKLDNN
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
UpsampleNearest,
|
||||
IDEEPFallbackOp<UpsampleNearestOp<float, CPUContext>>);
|
||||
#endif
|
||||
|
||||
REGISTER_CPU_OPERATOR(UpsampleNearest, UpsampleNearestOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(
|
||||
UpsampleNearestGradient,
|
||||
UpsampleNearestGradientOp<float, CPUContext>);
|
||||
|
||||
OPERATOR_SCHEMA(UpsampleNearest)
|
||||
.NumInputs(1)
|
||||
.NumOutputs(1)
|
||||
.SetDoc(R"DOC(
|
||||
Nearest neighbor upsampling operation. Implementation taken from THCUNN.
|
||||
)DOC")
|
||||
.Arg(
|
||||
"scale",
|
||||
"(int) default 2; integer upsampling factor.")
|
||||
.Input(
|
||||
0,
|
||||
"X",
|
||||
"4D feature map input of shape (N, C, H, W).")
|
||||
.Output(
|
||||
0,
|
||||
"Y",
|
||||
"4D feature map of shape (N, C, scale * H, scale * W); Values are "
|
||||
"neareast neighbor samples from X.");
|
||||
|
||||
OPERATOR_SCHEMA(UpsampleNearestGradient)
|
||||
.NumInputs(2)
|
||||
.NumOutputs(1)
|
||||
.Input(
|
||||
0,
|
||||
"X",
|
||||
"See UpsampleNearest.")
|
||||
.Input(
|
||||
1,
|
||||
"dY",
|
||||
"Gradient of forward output 0 (Y).")
|
||||
.Output(
|
||||
0,
|
||||
"dX",
|
||||
"Gradient of forward input 0 (X).");
|
||||
|
||||
class GetUpsampleNearestGradient : public GradientMakerBase {
|
||||
using GradientMakerBase::GradientMakerBase;
|
||||
vector<OperatorDef> GetGradientDefs() override {
|
||||
return SingleGradientDef(
|
||||
"UpsampleNearestGradient",
|
||||
"",
|
||||
vector<string>{I(0), GO(0)},
|
||||
vector<string>{GI(0)});
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_GRADIENT(UpsampleNearest, GetUpsampleNearestGradient);
|
||||
|
||||
} // namespace caffe2
|
@ -1,223 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Adapted from https://github.com/torch/cunn/blob/master/lib/THCUNN/SpatialUpSamplingNearest.cu
|
||||
*
|
||||
* Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
|
||||
* Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
|
||||
* Copyright (c) 2011-2013 NYU (Clement Farabet)
|
||||
* Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert,
|
||||
* Leon Bottou, Iain Melvin, Jason Weston)
|
||||
* Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
|
||||
* Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
|
||||
* Samy Bengio, Johnny Mariethoz)
|
||||
*
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the names of NEC Laboratories American and IDIAP Research
|
||||
* Institute nor the names of its contributors may be used to endorse or
|
||||
* promote products derived from this software without specific prior
|
||||
* written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "modules/detectron/upsample_nearest_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
__device__ int translate_idx(int ii, int d1, int d2, int d3, int scale_factor) {
|
||||
int x, y, z, w;
|
||||
w = ii % d3;
|
||||
ii = ii/d3;
|
||||
z = ii % d2;
|
||||
ii = ii/d2;
|
||||
y = ii % d1;
|
||||
ii = ii/d1;
|
||||
x = ii;
|
||||
w = w/scale_factor;
|
||||
z = z/scale_factor;
|
||||
d2 /= scale_factor;
|
||||
d3 /= scale_factor;
|
||||
return (((x*d1+y)*d2)+z)*d3+w;
|
||||
}
|
||||
|
||||
__device__ int translate_idx_inv(
|
||||
int ii, int d1, int d2, int d3, int scale_factor, int off_x, int off_y) {
|
||||
int x, y, z, w;
|
||||
w = ii % d3;
|
||||
ii = ii/d3;
|
||||
z = ii % d2;
|
||||
ii = ii/d2;
|
||||
y = ii % d1;
|
||||
ii = ii/d1;
|
||||
x = ii;
|
||||
w = w*scale_factor+off_x;
|
||||
z = z*scale_factor+off_y;
|
||||
d2 *= scale_factor;
|
||||
d3 *= scale_factor;
|
||||
return (((x*d1+y)*d2)+z)*d3+w;
|
||||
}
|
||||
|
||||
__global__ void upscale(const float *input, float *output, long no_elements,
|
||||
int scale_factor, int d1, int d2, int d3) {
|
||||
long ii = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
|
||||
if (ii >= no_elements) return;
|
||||
int ipidx = translate_idx(ii, d1, d2, d3, scale_factor);
|
||||
output[ii]=input[ipidx];
|
||||
}
|
||||
|
||||
__global__ void downscale(float *gradInput_data, const float *gradOutput_data,
|
||||
long no_elements, int scale_factor, int d1, int d2,
|
||||
int d3) {
|
||||
long ii = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
|
||||
if (ii >= no_elements) return;
|
||||
for (int i=0; i < scale_factor; i++){
|
||||
for(int j=0; j < scale_factor; j++){
|
||||
int ipidx = translate_idx_inv(ii, d1, d2, d3, scale_factor, i, j);
|
||||
gradInput_data[ii] += gradOutput_data[ipidx];
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template<>
|
||||
bool UpsampleNearestOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
|
||||
vector<int64_t> out_shape;
|
||||
for (int i = 0; i < X.ndim(); ++i) {
|
||||
out_shape.push_back(X.dim32(i));
|
||||
}
|
||||
out_shape[X.ndim() - 1] *= scale_;
|
||||
out_shape[X.ndim() - 2] *= scale_;
|
||||
Y->Resize(out_shape);
|
||||
|
||||
int d1;
|
||||
int d2;
|
||||
int d3;
|
||||
if (X.ndim() == 3) {
|
||||
d1 = Y->dim32(0);
|
||||
d2 = Y->dim32(1);
|
||||
d3 = Y->dim32(2);
|
||||
} else {
|
||||
d1 = Y->dim32(1);
|
||||
d2 = Y->dim32(2);
|
||||
d3 = Y->dim32(3);
|
||||
}
|
||||
long no_elements = Y->size();
|
||||
|
||||
const float *input_data = X.data<float>();
|
||||
float *output_data = Y->mutable_data<float>();
|
||||
|
||||
// cuda blocks & threads:
|
||||
long nthreads = 256;
|
||||
// Max number of blocks: http://en.wikipedia.org/wiki/CUDA
|
||||
// 65535 for SM 2.x, 2^32 -1 for >= 3.0
|
||||
// TODO: When we move to SM 3.5 we should update this
|
||||
long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
|
||||
long n_yblocks = (long)ceil(
|
||||
(float)no_elements / (float)(n_xblocks * nthreads));
|
||||
CAFFE_ENFORCE(n_yblocks <= 65535);
|
||||
dim3 blocks(n_xblocks, n_yblocks);
|
||||
dim3 threads(nthreads);
|
||||
|
||||
upscale<<<blocks, threads, 0, context_.cuda_stream()>>>(
|
||||
input_data, output_data, no_elements, scale_, d1, d2, d3);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
bool UpsampleNearestGradientOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0); // Original input to "forward" op
|
||||
auto& dY = Input(1); // Gradient of net w.r.t. output of "forward" op
|
||||
// (aka "gradOutput")
|
||||
auto* dX = Output(0); // Gradient of net w.r.t. input to "forward" op
|
||||
// (aka "gradInput")
|
||||
|
||||
dX->ResizeLike(X);
|
||||
float *gradInput_data = dX->mutable_data<float>();
|
||||
const float *gradOutput_data = dY.data<float>();
|
||||
|
||||
int d1;
|
||||
int d2;
|
||||
int d3;
|
||||
if (dX->ndim() == 3) {
|
||||
d1 = dX->dim32(0);
|
||||
d2 = dX->dim32(1);
|
||||
d3 = dX->dim32(2);
|
||||
} else {
|
||||
d1 = dX->dim32(1);
|
||||
d2 = dX->dim32(2);
|
||||
d3 = dX->dim32(3);
|
||||
}
|
||||
long no_elements = dX->size();
|
||||
|
||||
// cuda blocks & threads:
|
||||
long nthreads = 256;
|
||||
// Max number of blocks: http://en.wikipedia.org/wiki/CUDA
|
||||
// 65535 for SM 2.x, 2^32 -1 for >= 3.0
|
||||
// TODO: When we move to SM 3.5 we should update this
|
||||
long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
|
||||
long n_yblocks = (long)ceil(
|
||||
(float)no_elements / (float)(n_xblocks * nthreads));
|
||||
CAFFE_ENFORCE(n_yblocks <= 65535);
|
||||
dim3 blocks(n_xblocks, n_yblocks);
|
||||
dim3 threads(nthreads);
|
||||
|
||||
math::Set<float, CUDAContext>(no_elements, 0.f, gradInput_data, &context_);
|
||||
downscale<<<blocks, threads, 0, context_.cuda_stream()>>>(
|
||||
gradInput_data, gradOutput_data, no_elements, scale_, d1, d2, d3);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
REGISTER_CUDA_OPERATOR(UpsampleNearest,
|
||||
UpsampleNearestOp<float, CUDAContext>);
|
||||
REGISTER_CUDA_OPERATOR(UpsampleNearestGradient,
|
||||
UpsampleNearestGradientOp<float, CUDAContext>);
|
||||
} // namespace caffe2
|
@ -1,106 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef UPSAMPLE_NEAREST_OP_H_
|
||||
#define UPSAMPLE_NEAREST_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename T, class Context>
|
||||
class UpsampleNearestOp final : public Operator<Context> {
|
||||
public:
|
||||
UpsampleNearestOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws),
|
||||
scale_(this->template GetSingleArgument<int>("scale", 2)) {
|
||||
TORCH_DCHECK_GE(scale_, 1);
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
auto& X = Input(0);
|
||||
|
||||
auto out_shape = X.sizes().vec();
|
||||
out_shape[X.dim() - 1] *= scale_;
|
||||
out_shape[X.dim() - 2] *= scale_;
|
||||
auto* Y = Output(0, out_shape, at::dtype<T>());
|
||||
|
||||
int d1;
|
||||
int d2;
|
||||
int d3;
|
||||
if (X.dim() == 3) {
|
||||
d1 = Y->dim32(0);
|
||||
d2 = Y->dim32(1);
|
||||
d3 = Y->dim32(2);
|
||||
} else {
|
||||
d1 = Y->dim32(0) * Y->dim32(1);
|
||||
d2 = Y->dim32(2);
|
||||
d3 = Y->dim32(3);
|
||||
}
|
||||
|
||||
const T *input_data = X.template data<T>();
|
||||
T *output_data = Y->template mutable_data<T>();
|
||||
int scaled_d2 = d2 / scale_;
|
||||
int scaled_d3 = d3 / scale_;
|
||||
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for
|
||||
#endif
|
||||
for (int i = 0; i < d1; ++i) {
|
||||
for (int j = 0; j < d2; ++j) {
|
||||
for (int u = 0; u < d3; ++u) {
|
||||
int ii = (i * d2 + j) * d3 + u;
|
||||
int scaled_u = u / scale_;
|
||||
int scaled_j = j / scale_;
|
||||
int ipidx = ((i * scaled_d2) + scaled_j) * scaled_d3 + scaled_u;
|
||||
output_data[ii] = input_data[ipidx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
int scale_;
|
||||
};
|
||||
|
||||
template <typename T, class Context>
|
||||
class UpsampleNearestGradientOp final : public Operator<Context> {
|
||||
public:
|
||||
UpsampleNearestGradientOp(const OperatorDef& def, Workspace* ws)
|
||||
: Operator<Context>(def, ws),
|
||||
scale_(this->template GetSingleArgument<int>("scale", 2)) {
|
||||
TORCH_DCHECK_GE(scale_, 1);
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// No CPU implementation for now
|
||||
CAFFE_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
protected:
|
||||
int scale_;
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // UPSAMPLE_NEAREST_OP_H_
|
@ -1,42 +0,0 @@
|
||||
import unittest
|
||||
|
||||
import caffe2.python.hypothesis_test_util as hu
|
||||
import hypothesis.strategies as st
|
||||
import numpy as np
|
||||
from caffe2.python import core, dyndep
|
||||
from hypothesis import given, settings
|
||||
|
||||
|
||||
dyndep.InitOpsLibrary("@/caffe2/modules/detectron:detectron_ops")
|
||||
|
||||
|
||||
class TestUpsampleNearestOp(hu.HypothesisTestCase):
|
||||
@given(
|
||||
N=st.integers(1, 3),
|
||||
H=st.integers(10, 300),
|
||||
W=st.integers(10, 300),
|
||||
scale=st.integers(1, 3),
|
||||
**hu.gcs,
|
||||
)
|
||||
@settings(deadline=None, max_examples=20)
|
||||
def test_upsample_nearest_op(self, N, H, W, scale, gc, dc):
|
||||
C = 32
|
||||
X = np.random.randn(N, C, H, W).astype(np.float32)
|
||||
op = core.CreateOperator("UpsampleNearest", ["X"], ["Y"], scale=scale)
|
||||
|
||||
def ref(X):
|
||||
outH = H * scale
|
||||
outW = W * scale
|
||||
outH_idxs, outW_idxs = np.meshgrid(
|
||||
np.arange(outH), np.arange(outW), indexing="ij"
|
||||
)
|
||||
inH_idxs = (outH_idxs / scale).astype(np.int32)
|
||||
inW_idxs = (outW_idxs / scale).astype(np.int32)
|
||||
Y = X[:, :, inH_idxs, inW_idxs]
|
||||
return [Y]
|
||||
|
||||
self.assertReferenceChecks(device_option=gc, op=op, inputs=[X], reference=ref)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@ -1,23 +0,0 @@
|
||||
if(NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
|
||||
# If we are building the standalone module, we set the proper cmake variables.
|
||||
cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
|
||||
find_package(Caffe2 REQUIRED)
|
||||
set(BUILD_TEST ON)
|
||||
option(BUILD_SHARED_LIBS "Build shared libs." ON)
|
||||
endif()
|
||||
|
||||
if(BUILD_TEST AND NOT BUILD_LITE_INTERPRETER)
|
||||
add_library(
|
||||
caffe2_module_test_dynamic
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/module_test_dynamic.cc)
|
||||
|
||||
if(HAVE_SOVERSION)
|
||||
set_target_properties(caffe2_module_test_dynamic PROPERTIES
|
||||
VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
|
||||
endif()
|
||||
target_link_libraries(caffe2_module_test_dynamic torch_library)
|
||||
install(TARGETS caffe2_module_test_dynamic DESTINATION lib)
|
||||
if(MSVC AND BUILD_SHARED_LIBS)
|
||||
install(FILES $<TARGET_PDB_FILE:caffe2_module_test_dynamic> DESTINATION lib OPTIONAL)
|
||||
endif()
|
||||
endif()
|
@ -1,41 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/module.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
// An explicitly defined module, testing correctness when we dynamically link a
|
||||
// module
|
||||
CAFFE2_MODULE(caffe2_module_test_dynamic, "Dynamic module only used for testing.");
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
class Caffe2ModuleTestDynamicDummyOp : public OperatorBase {
|
||||
public:
|
||||
using OperatorBase::OperatorBase;
|
||||
bool Run(int /* unused */ /*stream_id*/) override {
|
||||
return true;
|
||||
}
|
||||
virtual string type() {
|
||||
return "base";
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(
|
||||
Caffe2ModuleTestDynamicDummy, Caffe2ModuleTestDynamicDummyOp);
|
||||
OPERATOR_SCHEMA(Caffe2ModuleTestDynamicDummy);
|
||||
|
||||
} // namespace caffe2
|
@ -1,32 +0,0 @@
|
||||
if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
|
||||
if(NOT USE_OBSERVERS)
|
||||
return()
|
||||
endif()
|
||||
else()
|
||||
cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
|
||||
project(caffe2_observers CXX)
|
||||
find_package(Caffe2 REQUIRED)
|
||||
option(BUILD_SHARED_LIBS "Build shared libs." ON)
|
||||
endif()
|
||||
|
||||
add_library(caffe2_observers
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/net_observer_reporter_print.cc"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/observer_config.cc"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/perf_observer.cc"
|
||||
)
|
||||
if(HAVE_SOVERSION)
|
||||
set_target_properties(caffe2_observers PROPERTIES
|
||||
VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
|
||||
endif()
|
||||
target_link_libraries(caffe2_observers PUBLIC torch_library)
|
||||
target_include_directories(caffe2_observers PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/..)
|
||||
target_compile_options(caffe2_observers PRIVATE "-DCAFFE2_BUILD_OBSERVER_LIB")
|
||||
install(TARGETS caffe2_observers DESTINATION lib)
|
||||
caffe2_interface_library(caffe2_observers caffe2_observers_library)
|
||||
if(MSVC AND BUILD_SHARED_LIBS)
|
||||
install(FILES $<TARGET_PDB_FILE:caffe2_observers> DESTINATION lib OPTIONAL)
|
||||
endif()
|
||||
|
||||
if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
|
||||
set(Caffe2_MODULES ${Caffe2_MODULES} caffe2_observers_library PARENT_SCOPE)
|
||||
endif()
|
@ -1,7 +0,0 @@
|
||||
#include "c10/macros/Macros.h"
|
||||
|
||||
#ifdef CAFFE2_BUILD_OBSERVER_LIB
|
||||
#define CAFFE2_OBSERVER_API C10_EXPORT
|
||||
#else
|
||||
#define CAFFE2_OBSERVER_API C10_IMPORT
|
||||
#endif
|
@ -1,38 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <map>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/net.h"
|
||||
#include "observers/macros.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
struct PerformanceInformation {
|
||||
// Analytic
|
||||
int64_t flops = 0;
|
||||
int64_t bytes_written = 0;
|
||||
int64_t bytes_read = 0;
|
||||
std::vector<TensorShape> tensor_shapes = {};
|
||||
std::vector<Argument> args = {};
|
||||
std::string engine = ""; // the engine used
|
||||
std::string type = ""; // the type of the operator
|
||||
// Measured
|
||||
double latency = 0;
|
||||
double cpuMilliseconds = 0;
|
||||
};
|
||||
|
||||
class CAFFE2_OBSERVER_API NetObserverReporter {
|
||||
public:
|
||||
virtual ~NetObserverReporter() = default;
|
||||
|
||||
/*
|
||||
Report the delay metric collected by the observer.
|
||||
The delays are saved in a map. The key is an identifier associated
|
||||
with the reported delay. The value is the delay value in float
|
||||
*/
|
||||
virtual void report(
|
||||
NetBase* net,
|
||||
std::map<std::string, PerformanceInformation>&) = 0;
|
||||
};
|
||||
}
|
@ -1,158 +0,0 @@
|
||||
#include "observers/net_observer_reporter_print.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
#include "caffe2/core/init.h"
|
||||
#include "observers/observer_config.h"
|
||||
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
const std::string NetObserverReporterPrint::IDENTIFIER = "Caffe2Observer ";
|
||||
static std::string get_op_args(PerformanceInformation p);
|
||||
static std::string get_tensor_shapes(PerformanceInformation p);
|
||||
static std::string sanatize(std::string json_s);
|
||||
|
||||
void NetObserverReporterPrint::report(
|
||||
NetBase* net,
|
||||
std::map<std::string, PerformanceInformation>& info) {
|
||||
// Not allowed to use json library
|
||||
std::vector<std::map<std::string, std::string>> caffe2_perf;
|
||||
|
||||
for (auto& p : info) {
|
||||
if ((p.first == "NET_DELAY") && (info.size() == 1)) {
|
||||
// for Net_delay perf
|
||||
caffe2_perf.push_back({{"type", "NET"},
|
||||
{"value", c10::to_string(p.second.latency * 1000)},
|
||||
{"unit", "us"},
|
||||
{"metric", "latency"}});
|
||||
caffe2_perf.push_back({{"type", "NET_"},
|
||||
{
|
||||
"value",
|
||||
c10::to_string(
|
||||
p.second.cpuMilliseconds /
|
||||
p.second.latency *
|
||||
100),
|
||||
},
|
||||
{"unit", "percent"},
|
||||
{"metric", "cpu_percent"}});
|
||||
} else if (p.first != "NET_DELAY") {
|
||||
// for operator perf
|
||||
std::string shape_str = get_tensor_shapes(p.second);
|
||||
std::string args_str = get_op_args(p.second);
|
||||
std::string type = p.first;
|
||||
caffe2_perf.push_back({{"type", type},
|
||||
{"value", c10::to_string(p.second.latency * 1000)},
|
||||
{"unit", "us"},
|
||||
{"metric", "latency"}});
|
||||
caffe2_perf.push_back({{"type", type},
|
||||
{
|
||||
"value",
|
||||
c10::to_string(
|
||||
p.second.cpuMilliseconds /
|
||||
p.second.latency *
|
||||
100),
|
||||
},
|
||||
{"unit", "percent"},
|
||||
{"metric", "cpu_percent"}});
|
||||
if (p.second.flops > 0) {
|
||||
caffe2_perf.push_back({{"type", type},
|
||||
{"value", c10::to_string(p.second.flops)},
|
||||
{"unit", "flop"},
|
||||
{"metric", "flops"}});
|
||||
}
|
||||
if (shape_str != "") {
|
||||
caffe2_perf.push_back({{"type", type},
|
||||
{"info_string", shape_str},
|
||||
{"unit", ""},
|
||||
{"metric", "tensor_shapes"}});
|
||||
}
|
||||
if (args_str != "") {
|
||||
caffe2_perf.push_back({{"type", type},
|
||||
{"info_string", args_str},
|
||||
{"unit", ""},
|
||||
{"metric", "op_args"}});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(modernize-loop-convert)
|
||||
for (auto it = caffe2_perf.begin(); it != caffe2_perf.end(); it++) {
|
||||
std::stringstream buffer;
|
||||
auto entry = *it;
|
||||
buffer << IDENTIFIER << "{";
|
||||
// NOLINTNEXTLINE(modernize-raw-string-literal)
|
||||
buffer << "\"type\": \"" << sanatize(entry["type"]) << "\","
|
||||
// NOLINTNEXTLINE(modernize-raw-string-literal)
|
||||
<< "\"unit\": \"" << sanatize(entry["unit"]) << "\","
|
||||
// NOLINTNEXTLINE(modernize-raw-string-literal)
|
||||
<< "\"metric\": \"" << sanatize(entry["metric"]) << "\",";
|
||||
if (entry.find("value") != entry.end()) {
|
||||
// NOLINTNEXTLINE(modernize-raw-string-literal)
|
||||
buffer << "\"value\": \"" << sanatize(entry["value"]) << "\"";
|
||||
} else if (entry.find("info_string") != entry.end()) {
|
||||
// NOLINTNEXTLINE(modernize-raw-string-literal)
|
||||
buffer << "\"info_string\": \"" << sanatize(entry["info_string"]) << "\"";
|
||||
}
|
||||
buffer << "}";
|
||||
LOG(INFO) << buffer.str();
|
||||
}
|
||||
}
|
||||
|
||||
static std::string get_tensor_shapes(PerformanceInformation p) {
|
||||
std::string shape_str;
|
||||
std::stringstream shape_stream;
|
||||
if (!p.tensor_shapes.empty()) {
|
||||
shape_stream << "[";
|
||||
for (const auto i : c10::irange(p.tensor_shapes.size())) {
|
||||
shape_stream << "[";
|
||||
for (int j = 0; j < p.tensor_shapes[i].dims_size(); j++) {
|
||||
shape_stream << p.tensor_shapes[i].dims(j) << ", ";
|
||||
}
|
||||
shape_stream << "], ";
|
||||
}
|
||||
shape_stream << "]";
|
||||
shape_str = shape_stream.str();
|
||||
} else {
|
||||
shape_str = "";
|
||||
}
|
||||
return shape_str;
|
||||
}
|
||||
|
||||
static std::string get_op_args(PerformanceInformation p) {
|
||||
std::string args_str;
|
||||
if (!p.args.empty()) {
|
||||
std::stringstream args;
|
||||
args << "[";
|
||||
for (const auto i : c10::irange(p.args.size())) {
|
||||
args << "{" << p.args[i].name() << ": ";
|
||||
if (p.args[i].has_i()) {
|
||||
args << p.args[i].i();
|
||||
} else if (p.args[i].has_s()) {
|
||||
args << p.args[i].s();
|
||||
} else if (p.args[i].has_n()) {
|
||||
args << &p.args[i].n();
|
||||
} else if (p.args[i].has_f()) {
|
||||
args << p.args[i].f();
|
||||
} else {
|
||||
args << "None";
|
||||
}
|
||||
args << "}, ";
|
||||
}
|
||||
args << "]";
|
||||
args_str = args.str();
|
||||
} else {
|
||||
args_str = "";
|
||||
}
|
||||
return args_str;
|
||||
}
|
||||
|
||||
static std::string sanatize(std::string json_s) {
|
||||
// Remove illegal characters from the name that would cause json string to
|
||||
// become invalid
|
||||
json_s.erase(std::remove(json_s.begin(), json_s.end(), '"'), json_s.end());
|
||||
json_s.erase(std::remove(json_s.begin(), json_s.end(), '\\'), json_s.end());
|
||||
return json_s;
|
||||
}
|
||||
}
|
@ -1,16 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "observers/macros.h"
|
||||
#include "observers/net_observer_reporter.h"
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
class CAFFE2_OBSERVER_API NetObserverReporterPrint : public NetObserverReporter {
|
||||
public:
|
||||
static const std::string IDENTIFIER;
|
||||
void report(NetBase* net, std::map<std::string, PerformanceInformation>&) override;
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
@ -1,12 +0,0 @@
|
||||
#include "observers/observer_config.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
int ObserverConfig::netInitSampleRate_ = 0;
|
||||
int ObserverConfig::netFollowupSampleRate_ = 0;
|
||||
int ObserverConfig::netFollowupSampleCount_ = 0;
|
||||
int ObserverConfig::operatorNetSampleRatio_ = 0;
|
||||
int ObserverConfig::skipIters_ = 0;
|
||||
unique_ptr<NetObserverReporter> ObserverConfig::reporter_ = nullptr;
|
||||
int ObserverConfig::marker_ = -1;
|
||||
}
|
@ -1,99 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "observers/macros.h"
|
||||
#include "observers/net_observer_reporter.h"
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
/*
|
||||
netInitSampleRate_ == 1 && operatorNetSampleRatio_ == 1 :
|
||||
Log operator metrics in every iteration
|
||||
netInitSampleRate_ == 1 && operatorNetSampleRatio_ == 0 :
|
||||
Log net metrics in every iterationn
|
||||
netInitSampleRate_ == n && netFollowupSampleRate_ == m &&
|
||||
netFollowupSampleCount == c && operatorNetSampleRatio_ == 1 :
|
||||
Log operator metrics first at odds of 1 / n. Once first logged,
|
||||
the following c logs are at odds of 1 / min(n, m). Then repeat
|
||||
netInitSampleRate_ == n && netFollowupSampleRate_ == m &&
|
||||
netFollowupSampleCount == c && operatorNetSampleRatio_ == 0 :
|
||||
Log net metrics first at odds of 1 / n. Once first logged,
|
||||
the following c logs are at odds of 1 / min(n, m). Then repeat
|
||||
netInitSampleRate_ == n && netFollowupSampleRate_ == m &&
|
||||
netFollowupSampleCount == c && operatorNetSampleRatio_ == o :
|
||||
Log net metrics first at odds of 1 / n. Once first logged,
|
||||
the following c logs are at odds of 1 / min(n, m), if the random number
|
||||
is multiples of o, log operator metrics instead. Then repeat
|
||||
skipIters_ == n: skip the first n iterations of the net.
|
||||
*/
|
||||
class CAFFE2_OBSERVER_API ObserverConfig {
|
||||
public:
|
||||
static void initSampleRate(
|
||||
int netInitSampleRate,
|
||||
int netFollowupSampleRate,
|
||||
int netFollowupSampleCount,
|
||||
int operatorNetSampleRatio,
|
||||
int skipIters) {
|
||||
CAFFE_ENFORCE(netFollowupSampleRate <= netInitSampleRate);
|
||||
CAFFE_ENFORCE(netFollowupSampleRate >= 1 || netInitSampleRate == 0);
|
||||
netInitSampleRate_ = netInitSampleRate;
|
||||
netFollowupSampleRate_ = netFollowupSampleRate;
|
||||
netFollowupSampleCount_ = netFollowupSampleCount;
|
||||
operatorNetSampleRatio_ = operatorNetSampleRatio;
|
||||
skipIters_ = skipIters;
|
||||
}
|
||||
static int getNetInitSampleRate() {
|
||||
return netInitSampleRate_;
|
||||
}
|
||||
static int getNetFollowupSampleRate() {
|
||||
return netFollowupSampleRate_;
|
||||
}
|
||||
static int getNetFollowupSampleCount() {
|
||||
return netFollowupSampleCount_;
|
||||
}
|
||||
static int getOpoeratorNetSampleRatio() {
|
||||
return operatorNetSampleRatio_;
|
||||
}
|
||||
static int getSkipIters() {
|
||||
return skipIters_;
|
||||
}
|
||||
static void setReporter(unique_ptr<NetObserverReporter> reporter) {
|
||||
reporter_ = std::move(reporter);
|
||||
}
|
||||
static NetObserverReporter* getReporter() {
|
||||
CAFFE_ENFORCE(reporter_);
|
||||
return reporter_.get();
|
||||
}
|
||||
static void setMarker(int marker) {
|
||||
marker_ = marker;
|
||||
}
|
||||
static int getMarker() {
|
||||
return marker_;
|
||||
}
|
||||
|
||||
private:
|
||||
/* The odds of log net metric initially or immediately after reset */
|
||||
static int netInitSampleRate_;
|
||||
|
||||
/* The odds of log net metric after log once after start of reset */
|
||||
static int netFollowupSampleRate_;
|
||||
|
||||
/* The number of follow up logs to be collected for odds of
|
||||
netFollowupSampleRate_ */
|
||||
static int netFollowupSampleCount_;
|
||||
|
||||
/* The odds to log the operator metric instead of the net metric.
|
||||
When the operator is logged the net is not logged. */
|
||||
static int operatorNetSampleRatio_;
|
||||
|
||||
/* skip the first few iterations */
|
||||
static int skipIters_;
|
||||
|
||||
static unique_ptr<NetObserverReporter> reporter_;
|
||||
|
||||
/* marker used in identifying the metrics in certain reporters */
|
||||
static int marker_;
|
||||
};
|
||||
|
||||
}
|
@ -1,330 +0,0 @@
|
||||
#include "observers/perf_observer.h"
|
||||
#include "observers/observer_config.h"
|
||||
#ifndef C10_MOBILE
|
||||
#include "caffe2/core/flags.h"
|
||||
#include "observers/net_observer_reporter_print.h"
|
||||
#endif
|
||||
|
||||
#include <random>
|
||||
// NOLINTNEXTLINE(modernize-deprecated-headers)
|
||||
#include <time.h>
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
#if defined(TARGET_OS_MAC) || \
|
||||
defined(TARGET_OS_IPHONE) || \
|
||||
defined(TARGET_IPHONE_SIMULATOR)
|
||||
#define _APPLE 1
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
#ifndef WIN32_LEAN_AND_MEAN
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#ifdef _APPLE
|
||||
#include <mach/mach_time.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#ifndef C10_MOBILE
|
||||
C10_DEFINE_int64(
|
||||
aiBench_netInitSampleRate,
|
||||
0,
|
||||
"One in N sampling rate for net delay");
|
||||
|
||||
C10_DEFINE_int64(
|
||||
aiBench_netFollowupSampleRate,
|
||||
0,
|
||||
"One in N sampling rate for net delay");
|
||||
|
||||
C10_DEFINE_int64(
|
||||
aiBench_netFollowupSampleCount,
|
||||
0,
|
||||
"control the following c logs");
|
||||
|
||||
C10_DEFINE_int64(
|
||||
aiBench_operatorNetSampleRatio,
|
||||
0,
|
||||
"One in N sampling rate for operator delay");
|
||||
|
||||
C10_DEFINE_int64(
|
||||
aiBench_skipIters,
|
||||
0,
|
||||
"skip the first N iterations of the net run");
|
||||
#endif
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
|
||||
bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) {
|
||||
AddGlobalNetObserverCreator([](NetBase* subject) {
|
||||
return std::make_unique<PerfNetObserver>(subject);
|
||||
});
|
||||
|
||||
#if !defined(C10_MOBILE)
|
||||
// for aibench usage
|
||||
caffe2::ObserverConfig::setReporter(
|
||||
std::make_unique<caffe2::NetObserverReporterPrint>());
|
||||
|
||||
caffe2::ObserverConfig::initSampleRate(
|
||||
FLAGS_aiBench_netInitSampleRate,
|
||||
FLAGS_aiBench_netFollowupSampleRate,
|
||||
FLAGS_aiBench_netFollowupSampleCount,
|
||||
FLAGS_aiBench_operatorNetSampleRatio,
|
||||
FLAGS_aiBench_skipIters);
|
||||
#endif
|
||||
|
||||
return true;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
#ifdef _WIN32
|
||||
double getTicksPerMillisecond() {
|
||||
static LARGE_INTEGER ticks_per_sec;
|
||||
if (!ticks_per_sec.QuadPart) {
|
||||
QueryPerformanceFrequency(&ticks_per_sec);
|
||||
if (!ticks_per_sec.QuadPart) {
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
return static_cast<double>(ticks_per_sec.QuadPart) / 1000.0;
|
||||
}
|
||||
#elif !defined _APPLE
|
||||
double getClockTimeMilliseconds(clockid_t clk_id) {
|
||||
int result;
|
||||
struct timespec tp;
|
||||
result = clock_gettime(clk_id, &tp);
|
||||
if (result == -1) {
|
||||
return 0.0;
|
||||
} else {
|
||||
return tp.tv_sec * 1000.0 + tp.tv_nsec / 1000000.0;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
double getWallClockTimeMilliseconds() {
|
||||
#ifdef _WIN32
|
||||
double ticks_per_ms = getTicksPerMillisecond();
|
||||
if (ticks_per_ms) {
|
||||
LARGE_INTEGER ticks;
|
||||
if (QueryPerformanceCounter(&ticks)) {
|
||||
return static_cast<double>(ticks.QuadPart) / ticks_per_ms;
|
||||
}
|
||||
}
|
||||
|
||||
return 0.0;
|
||||
#elif defined _APPLE
|
||||
static mach_timebase_info_data_t info;
|
||||
if (info.denom == 0) {
|
||||
mach_timebase_info(&info);
|
||||
}
|
||||
|
||||
uint64_t now = mach_absolute_time();
|
||||
now = now * info.numer / info.denom; // convert to nanoseconds
|
||||
return now / 1000000.0;
|
||||
#else
|
||||
return getClockTimeMilliseconds(CLOCK_MONOTONIC);
|
||||
#endif
|
||||
}
|
||||
|
||||
double getCpuTimeMilliseconds() {
|
||||
#ifdef _WIN32
|
||||
FILETIME creation_time;
|
||||
FILETIME exit_time;
|
||||
FILETIME kernel_time;
|
||||
FILETIME user_time;
|
||||
if (GetProcessTimes(
|
||||
GetCurrentProcess(),
|
||||
&creation_time,
|
||||
&exit_time,
|
||||
&kernel_time,
|
||||
&user_time)) {
|
||||
ULARGE_INTEGER kernel;
|
||||
ULARGE_INTEGER user;
|
||||
kernel.HighPart = kernel_time.dwHighDateTime;
|
||||
kernel.LowPart = kernel_time.dwLowDateTime;
|
||||
user.HighPart = user_time.dwHighDateTime;
|
||||
user.LowPart = user_time.dwLowDateTime;
|
||||
return (static_cast<double>(kernel.QuadPart) +
|
||||
static_cast<double>(user.QuadPart)) / 10000.0;
|
||||
}
|
||||
|
||||
return 0.0;
|
||||
#elif defined _APPLE
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
|
||||
struct rusage ru;
|
||||
if (getrusage(RUSAGE_SELF, &ru)) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
return ru.ru_utime.tv_sec * 1000.0
|
||||
+ ru.ru_utime.tv_usec / 1000.0
|
||||
+ ru.ru_stime.tv_sec * 1000.0
|
||||
+ ru.ru_stime.tv_usec / 1000.0;
|
||||
#else
|
||||
return getClockTimeMilliseconds(CLOCK_PROCESS_CPUTIME_ID);
|
||||
#endif
|
||||
}
|
||||
|
||||
REGISTER_CAFFE2_EARLY_INIT_FUNCTION(
|
||||
registerGlobalPerfNetObserverCreator,
|
||||
®isterGlobalPerfNetObserverCreator,
|
||||
"Caffe2 net global observer creator");
|
||||
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
|
||||
PerfNetObserver::PerfNetObserver(NetBase* subject_)
|
||||
: NetObserver(subject_), numRuns_(0) {}
|
||||
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
PerfNetObserver::~PerfNetObserver() {}
|
||||
|
||||
void PerfNetObserver::Start() {
|
||||
static int visitCount = 0;
|
||||
// Select whether to log the operator or the net.
|
||||
// We have one sample rate for the entire app.
|
||||
int netInitSampleRate = ObserverConfig::getNetInitSampleRate();
|
||||
int netFollowupSampleRate = ObserverConfig::getNetFollowupSampleRate();
|
||||
int netFollowupSampleCount = ObserverConfig::getNetFollowupSampleCount();
|
||||
int operatorNetSampleRatio = ObserverConfig::getOpoeratorNetSampleRatio();
|
||||
int skipIters = ObserverConfig::getSkipIters();
|
||||
int sampleRate = visitCount > 0 ? netFollowupSampleRate : netInitSampleRate;
|
||||
// NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
|
||||
if (skipIters <= static_cast<int>(numRuns_) && sampleRate > 0 && rand() % sampleRate == 0) {
|
||||
visitCount++;
|
||||
if (visitCount == netFollowupSampleCount) {
|
||||
visitCount = 0;
|
||||
}
|
||||
// NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
|
||||
if (operatorNetSampleRatio > 0 && rand() % operatorNetSampleRatio == 0) {
|
||||
logType_ = PerfNetObserver::OPERATOR_DELAY;
|
||||
} else {
|
||||
logType_ = PerfNetObserver::NET_DELAY;
|
||||
}
|
||||
} else {
|
||||
logType_ = PerfNetObserver::NONE;
|
||||
}
|
||||
numRuns_++;
|
||||
|
||||
if (logType_ == PerfNetObserver::OPERATOR_DELAY) {
|
||||
/* Always recreate new operator observers
|
||||
whenever we measure operator delay */
|
||||
const auto& operators = subject_->GetOperators();
|
||||
for (auto* op : operators) {
|
||||
observerMap_[op] = op->AttachObserver(
|
||||
std::make_unique<PerfOperatorObserver>(op, this));
|
||||
}
|
||||
}
|
||||
|
||||
wallMilliseconds_ = getWallClockTimeMilliseconds();
|
||||
cpuMilliseconds_ = getCpuTimeMilliseconds();
|
||||
}
|
||||
|
||||
void PerfNetObserver::Stop() {
|
||||
if (logType_ == PerfNetObserver::NONE) {
|
||||
return;
|
||||
}
|
||||
std::map<std::string, PerformanceInformation> info;
|
||||
PerformanceInformation net_perf;
|
||||
net_perf.cpuMilliseconds =
|
||||
getCpuTimeMilliseconds() - cpuMilliseconds_;
|
||||
net_perf.latency =
|
||||
getWallClockTimeMilliseconds() - wallMilliseconds_;
|
||||
|
||||
if (logType_ == PerfNetObserver::OPERATOR_DELAY) {
|
||||
const auto& operators = subject_->GetOperators();
|
||||
for (unsigned idx = 0; idx < operators.size(); ++idx) {
|
||||
const auto* op = operators[idx];
|
||||
auto name = getObserverName(op, static_cast<int>(idx));
|
||||
PerformanceInformation p;
|
||||
const PerfOperatorObserver* opObserver =
|
||||
static_cast<const PerfOperatorObserver*>(observerMap_[op]);
|
||||
p.latency = opObserver->getWallMilliseconds();
|
||||
p.cpuMilliseconds = opObserver->getCpuMilliseconds();
|
||||
p.engine = op->engine();
|
||||
p.type = op->type();
|
||||
p.tensor_shapes =
|
||||
static_cast<const PerfOperatorObserver*>(observerMap_[op])
|
||||
->getTensorShapes();
|
||||
|
||||
if (op->has_debug_def()) {
|
||||
// NOLINTNEXTLINE(performance-for-range-copy)
|
||||
for (auto arg : op->debug_def().arg()) {
|
||||
p.args.emplace_back(arg);
|
||||
}
|
||||
}
|
||||
|
||||
info.insert({name, p});
|
||||
}
|
||||
|
||||
/* clear all operator delay after use so that we don't spent time
|
||||
collecting the operator delay info in later runs */
|
||||
for (auto* op : operators) {
|
||||
op->DetachObserver(observerMap_[op]);
|
||||
}
|
||||
observerMap_.clear();
|
||||
}
|
||||
info.insert({"NET_DELAY", net_perf});
|
||||
ObserverConfig::getReporter()->report(subject_, info);
|
||||
}
|
||||
|
||||
caffe2::string PerfNetObserver::getObserverName(const OperatorBase* op, int idx)
|
||||
const {
|
||||
string opType = op->has_debug_def() ? op->debug_def().type() : "NO_TYPE";
|
||||
string displayName =
|
||||
(op->has_debug_def() ? op->debug_def().name().size()
|
||||
? op->debug_def().name()
|
||||
: (op->debug_def().output_size() ? op->debug_def().output(0)
|
||||
: "NO_OUTPUT")
|
||||
: "NO_DEF");
|
||||
caffe2::string name =
|
||||
"ID_" + c10::to_string(idx) + "_" + opType + "_" + displayName;
|
||||
return name;
|
||||
}
|
||||
|
||||
PerfOperatorObserver::PerfOperatorObserver(
|
||||
OperatorBase* op,
|
||||
PerfNetObserver* netObserver)
|
||||
: ObserverBase<OperatorBase>(op),
|
||||
netObserver_(netObserver),
|
||||
wallMilliseconds_(0),
|
||||
cpuMilliseconds_(0) {
|
||||
CAFFE_ENFORCE(netObserver_, "Observers can't operate outside of the net");
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
PerfOperatorObserver::~PerfOperatorObserver() {}
|
||||
|
||||
void PerfOperatorObserver::Start() {
|
||||
wallMilliseconds_ = getWallClockTimeMilliseconds();
|
||||
cpuMilliseconds_ = getCpuTimeMilliseconds();
|
||||
}
|
||||
|
||||
void PerfOperatorObserver::Stop() {
|
||||
/* Time from the start of the net minus the time spent on all other
|
||||
operators is the time spent on this operator */
|
||||
cpuMilliseconds_ =
|
||||
getCpuTimeMilliseconds() - cpuMilliseconds_;
|
||||
wallMilliseconds_ =
|
||||
getWallClockTimeMilliseconds() - wallMilliseconds_;
|
||||
tensor_shapes_ = subject_->InputTensorShapes();
|
||||
}
|
||||
|
||||
double PerfOperatorObserver::getWallMilliseconds() const {
|
||||
return wallMilliseconds_;
|
||||
}
|
||||
|
||||
double PerfOperatorObserver::getCpuMilliseconds() const {
|
||||
return cpuMilliseconds_;
|
||||
}
|
||||
|
||||
std::vector<TensorShape> PerfOperatorObserver::getTensorShapes() const {
|
||||
return tensor_shapes_;
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
@ -1,66 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/core/observer.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
#include "observers/macros.h"
|
||||
|
||||
#include <unordered_map>
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
double getClockTimeMilliseconds();
|
||||
|
||||
class CAFFE2_OBSERVER_API PerfNetObserver : public NetObserver {
|
||||
public:
|
||||
explicit PerfNetObserver(NetBase* subject_);
|
||||
virtual ~PerfNetObserver();
|
||||
|
||||
private:
|
||||
void Start() override;
|
||||
void Stop() override;
|
||||
|
||||
caffe2::string getObserverName(const OperatorBase* op, int idx) const;
|
||||
|
||||
private:
|
||||
enum LogType {
|
||||
NONE,
|
||||
OPERATOR_DELAY,
|
||||
NET_DELAY,
|
||||
};
|
||||
LogType logType_;
|
||||
unsigned int numRuns_;
|
||||
std::unordered_map<const OperatorBase*, const ObserverBase<OperatorBase>*>
|
||||
observerMap_;
|
||||
|
||||
double wallMilliseconds_;
|
||||
double cpuMilliseconds_;
|
||||
};
|
||||
|
||||
class PerfOperatorObserver : public ObserverBase<OperatorBase> {
|
||||
public:
|
||||
PerfOperatorObserver(OperatorBase* op, PerfNetObserver* netObserver);
|
||||
virtual ~PerfOperatorObserver();
|
||||
|
||||
double getWallMilliseconds() const;
|
||||
double getCpuMilliseconds() const;
|
||||
std::vector<TensorShape> getTensorShapes() const;
|
||||
|
||||
private:
|
||||
void Start() override;
|
||||
void Stop() override;
|
||||
|
||||
private:
|
||||
// Observer of a net that owns corresponding op. We make sure net is never
|
||||
// destructed while operator observer is still alive. First operator observer
|
||||
// gets destructed, then the op, then the net and its observer.
|
||||
// We do this trick in order to get access to net's name and other fields
|
||||
// without storing inside the operator observer. Each field is memory
|
||||
// costly here and a raw pointer is a cheapest sholution
|
||||
PerfNetObserver* netObserver_;
|
||||
double wallMilliseconds_;
|
||||
double cpuMilliseconds_;
|
||||
std::vector<TensorShape> tensor_shapes_;
|
||||
};
|
||||
} // namespace caffe2
|
6
setup.py
6
setup.py
@ -88,12 +88,6 @@
|
||||
# disables use of system-wide nccl (we will use our submoduled
|
||||
# copy in third_party/nccl)
|
||||
#
|
||||
# BUILD_CAFFE2_OPS=0
|
||||
# disable Caffe2 operators build
|
||||
#
|
||||
# BUILD_CAFFE2=0
|
||||
# disable Caffe2 build
|
||||
#
|
||||
# USE_IBVERBS
|
||||
# toggle features related to distributed support
|
||||
#
|
||||
|
@ -145,26 +145,6 @@ void validateBlock(
|
||||
"\n\nDefined at:\n" + getNodeStackTraceString(node))
|
||||
}
|
||||
} else {
|
||||
#ifdef BUILD_CAFFE2
|
||||
// Assuming this is a Caffe2 change as it only modifies an aten op
|
||||
// for operator_export_type == ONNX_ATEN_FALLBACK, which is a common
|
||||
// pattern for Caffe2-specific scenarios.
|
||||
if (node->kind() == aten::expand) {
|
||||
if (operator_export_type ==
|
||||
onnx_torch::OperatorExportTypes::ONNX_ATEN_FALLBACK) {
|
||||
WithInsertPoint guard(node);
|
||||
auto* new_node =
|
||||
b->owningGraph()->insertNode(b->owningGraph()->create(
|
||||
Symbol(::c10::aten::ATen),
|
||||
node->inputs(),
|
||||
node->outputs().size()));
|
||||
for (size_t i = 0; i < node->outputs().size(); ++i) {
|
||||
node->output(i)->replaceAllUsesWith(new_node->output(i));
|
||||
}
|
||||
new_node->s_(Symbol::fromQualString("attr::operator"), "expand");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (node->kind() == prim::PackPadded || node->kind() == prim::PadPacked) {
|
||||
if (operator_export_type !=
|
||||
onnx_torch::OperatorExportTypes::ONNX_FALLTHROUGH) {
|
||||
|
@ -293,10 +293,6 @@ void initONNXBindings(PyObject* module) {
|
||||
|
||||
onnx.attr("PRODUCER_VERSION") = py::str(TORCH_VERSION);
|
||||
|
||||
#ifdef BUILD_CAFFE2
|
||||
onnx.attr("_CAFFE2_ATEN_FALLBACK") = true;
|
||||
#else
|
||||
onnx.attr("_CAFFE2_ATEN_FALLBACK") = false;
|
||||
#endif
|
||||
}
|
||||
} // namespace torch::onnx
|
||||
|
@ -350,11 +350,6 @@ def export(
|
||||
%3 : Float = onnx::Mul(%2, %0)
|
||||
return (%3)
|
||||
|
||||
If PyTorch was built with Caffe2 (i.e. with ``BUILD_CAFFE2=1``), then
|
||||
Caffe2-specific behavior will be enabled, including special support
|
||||
for ops are produced by the modules described in
|
||||
`Quantization <https://pytorch.org/docs/stable/quantization.html>`_.
|
||||
|
||||
.. warning::
|
||||
|
||||
Models exported this way are probably runnable only by Caffe2.
|
||||
@ -1802,9 +1797,8 @@ def _add_output_to_block(block: _C.Block, value: _C.Value) -> int:
|
||||
def _should_aten_fallback(
|
||||
name: str, opset_version: int, operator_export_type: _C_onnx.OperatorExportTypes
|
||||
):
|
||||
# For BUILD_CAFFE2=0 builds, if domain=="aten" and operator_export_type==ONNX_ATEN,
|
||||
# For all builds, if domain=="aten" and operator_export_type==ONNX_ATEN,
|
||||
# an aten::ATen operator is created regardless of symbolics existence
|
||||
# For BUILD_CAFFE2=1, the same applies only if there is no symbolic available
|
||||
|
||||
is_exportable_aten_op = registration.registry.is_registered_op(name, opset_version)
|
||||
is_onnx_aten_export = operator_export_type == _C_onnx.OperatorExportTypes.ONNX_ATEN
|
||||
|
Reference in New Issue
Block a user