Remove caffe2 from more build files (#125898)

Co-authored-by: Aaron Gokaslan <aaronGokaslan@gmail.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125898
Approved by: https://github.com/Skylion007
This commit is contained in:
Richard Barnes
2024-05-13 18:37:54 +00:00
committed by PyTorch MergeBot
parent b620231378
commit b9e7b35912
70 changed files with 36 additions and 5361 deletions

View File

@ -44,11 +44,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
fi
fi
if [[ ${BUILD_ENVIRONMENT} == *"caffe2"* ]]; then
echo "Caffe2 build is ON"
export BUILD_CAFFE2=ON
fi
if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then
export ATEN_THREADING=TBB
export USE_TBB=1

View File

@ -181,11 +181,7 @@ option(BUILD_BINARY "Build C++ binaries" OFF)
option(BUILD_DOCS "Build Caffe2 documentation" OFF)
option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON)
option(BUILD_PYTHON "Build Python binaries" ON)
option(BUILD_CAFFE2 "Master flag to build Caffe2" OFF)
option(BUILD_LITE_INTERPRETER "Master flag to build Lite Interpreter" OFF)
cmake_dependent_option(
BUILD_CAFFE2_OPS "Build Caffe2 operators" ON
"BUILD_CAFFE2" OFF)
option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON)
cmake_dependent_option(
CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." ON
@ -635,7 +631,6 @@ if(INTERN_BUILD_MOBILE)
endif()
set(BUILD_PYTHON OFF)
set(BUILD_FUNCTORCH OFF)
set(BUILD_CAFFE2_OPS OFF)
set(USE_DISTRIBUTED OFF)
set(NO_API ON)
set(USE_FBGEMM OFF)
@ -1208,13 +1203,6 @@ else()
"shared libs.")
endif()
# ---[ Modules
# If master flag for building Caffe2 is disabled, we also disable the
# build for Caffe2 related operator modules.
if(BUILD_CAFFE2)
add_subdirectory(modules)
endif()
# ---[ Binaries
# Binaries will be built after the Caffe2 main libraries and the modules
# are built. For the binaries, they will be linked to the Caffe2 main

View File

@ -667,7 +667,6 @@ only interested in a specific component.
- Working on a test binary? Run `(cd build && ninja bin/test_binary_name)` to
rebuild only that test binary (without rerunning cmake). (Replace `ninja` with
`make` if you don't have ninja installed).
- Don't need Caffe2? Pass `BUILD_CAFFE2=0` to disable Caffe2 build.
On the initial build, you can also speed things up with the environment
variables `DEBUG`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_CUDA`, `USE_FLASH_ATTENTION`, `USE_MEM_EFF_ATTENTION`, `BUILD_TEST`, `USE_FBGEMM`, `USE_NNPACK` and `USE_QNNPACK`.
@ -1196,7 +1195,7 @@ build_with_asan()
LDFLAGS="-stdlib=libstdc++" \
CFLAGS="-fsanitize=address -fno-sanitize-recover=all -shared-libasan -pthread" \
CXX_FLAGS="-pthread" \
USE_CUDA=0 USE_OPENMP=0 BUILD_CAFFE2_OPS=0 USE_DISTRIBUTED=0 DEBUG=1 \
USE_CUDA=0 USE_OPENMP=0 USE_DISTRIBUTED=0 DEBUG=1 \
python setup.py develop
}

View File

@ -379,7 +379,7 @@ You can also pass the `CMAKE_VARS="..."` environment variable to specify additio
See [setup.py](./setup.py) for the list of available variables.
```bash
CMAKE_VARS="BUILD_CAFFE2=ON BUILD_CAFFE2_OPS=ON" make -f docker.Makefile
make -f docker.Makefile
```
### Building the Documentation

View File

@ -54,7 +54,7 @@ if(NOT BUILD_LITE_INTERPRETER)
endif()
EXCLUDE(ATen_CORE_SRCS "${ATen_CORE_SRCS}" ${ATen_CORE_TEST_SRCS})
# Exclude TensorImpl_test.cpp if compiling without Caffe2
if(NOT BUILD_CAFFE2 AND NOT BUILD_LITE_INTERPRETER)
if(NOT BUILD_LITE_INTERPRETER)
file(GLOB_RECURSE ATen_CORE_EXCLUDED_TEST_SRCS "core/TensorImpl_test.cpp")
EXCLUDE(ATen_CORE_TEST_SRCS "${ATen_CORE_TEST_SRCS}" ${ATen_CORE_EXCLUDED_TEST_SRCS})
endif()

View File

@ -123,16 +123,6 @@ list(APPEND ATen_XPU_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/xpu_generator_test.cpp
)
# Caffe2 specific tests
if(BUILD_CAFFE2)
list(APPEND ATen_CPU_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/ExclusivelyOwned_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/tensor_interop_test.cpp)
list(APPEND ATen_CUDA_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/cuda_tensor_interop_test.cpp)
endif()
# ---[ Send the lists to the parent scope.
set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE)
set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE)

View File

@ -7,16 +7,6 @@ if(INTERN_BUILD_MOBILE)
return()
endif()
if(BUILD_CAFFE2)
caffe2_binary_target("at_launch_benchmark.cc")
target_include_directories(at_launch_benchmark PUBLIC
${CMAKE_BINARY_DIR}/aten/src)
caffe2_binary_target("intra_inter_benchmark.cc")
target_include_directories(intra_inter_benchmark PUBLIC
${CMAKE_BINARY_DIR}/aten/src)
endif()
caffe2_binary_target("parallel_info.cc")
target_include_directories(parallel_info PUBLIC
${CMAKE_BINARY_DIR}/aten/src) # provides "ATen/TypeExtendedInterface.h" to ATen.h

View File

@ -279,7 +279,6 @@ def get_pt_preprocessor_flags():
"-D_THP_CORE",
"-DUSE_SCALARS",
"-DNO_CUDNN_DESTROY_HANDLE",
"-DBUILD_CAFFE2",
]
if _is_build_mode_dev():

View File

@ -110,21 +110,11 @@ endif()
add_subdirectory(core)
add_subdirectory(serialize)
add_subdirectory(utils)
if(BUILD_CAFFE2 OR (NOT USE_FBGEMM))
if(NOT USE_FBGEMM)
add_subdirectory(perfkernels)
endif()
# Skip modules that are not used by libtorch mobile yet.
if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
add_subdirectory(core/nomnigraph)
if(USE_NVRTC)
add_subdirectory(cuda_rtc)
endif()
if(BUILD_CAFFE2_OPS)
endif()
add_subdirectory(proto)
endif()
if(NOT BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
if(NOT INTERN_BUILD_MOBILE)
add_subdirectory(proto)
endif()
@ -585,17 +575,10 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
)
# Disable legacy import of building without Caffe2 support
if(BUILD_CAFFE2)
list(APPEND TORCH_SRCS
${TORCH_SRC_DIR}/csrc/jit/serialization/import_legacy.cpp
)
else()
set_source_files_properties(
${TORCH_SRC_DIR}/csrc/jit/serialization/import.cpp
PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT"
)
endif()
set_source_files_properties(
${TORCH_SRC_DIR}/csrc/jit/serialization/import.cpp
PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT"
)
if(USE_DISTRIBUTED)
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
if(NOT WIN32)
@ -809,11 +792,6 @@ if(HAVE_SOVERSION)
VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
endif()
torch_compile_options(torch_cpu) # see cmake/public/utils.cmake
if(BUILD_CAFFE2 AND NOT MSVC)
# Caffe2 has too many signed-unsigned violation, but the framework is dead
# So no point in fixing those
target_compile_options(torch_cpu PRIVATE "-Wno-sign-compare")
endif()
# Ignore Wdeprecated-XXX errors from third-party libraries
if(NOT MSVC)
@ -1921,14 +1899,6 @@ if(BUILD_TEST)
endif()
endforeach()
endif()
# For special tests that explicitly uses dependencies, we add them here
if(BUILD_CAFFE2 AND USE_MPI)
target_link_libraries(mpi_test MPI::MPI_CXX)
if(USE_CUDA)
target_link_libraries(mpi_gpu_test MPI::MPI_CXX)
endif()
endif()
endif()
if(MSVC)
@ -1998,11 +1968,6 @@ if(BUILD_PYTHON)
set_source_files_properties(${TORCH_SRC_DIR}/../caffe2/operators/box_with_nms_limit_op.cc PROPERTIES COMPILE_FLAGS -Wno-attributes)
endif()
# ---[ Python.
if(BUILD_CAFFE2)
target_compile_definitions(torch PRIVATE BUILD_CAFFE2)
endif()
# generated pb files are copied from build/caffe2 to caffe2
# if we copied them back to build this would create a build cycle
# consider removing the need for globs

View File

@ -2,5 +2,4 @@ import warnings
from torch.onnx import _CAFFE2_ATEN_FALLBACK
if not _CAFFE2_ATEN_FALLBACK:
warnings.warn("Caffe2 support is not fully enabled in this PyTorch build. "
"Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.")
warnings.warn("Caffe2 support is no longer present in PyTorch.")

View File

@ -1,68 +1,4 @@
if(NOT BUILD_CAFFE2 OR INTERN_BUILD_MOBILE)
list(APPEND Caffe2_CPU_SRCS
"${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
return()
endif()
# ---[ GPU files
# ------[ cuDNN
if(USE_CUDNN)
file(GLOB tmp *_cudnn.cc)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
endif()
# ------[ general GPU
file(GLOB tmp *_gpu.cc)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
# ------[ CUDA sources
file(GLOB tmp *.cu)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
# exclude test files
file(GLOB tmp *_test.cc)
exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp})
# ---[ general HIP files
file(GLOB tmp hip/*.cc)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
# ------[ HIP sources
file(GLOB tmp hip/*.hip)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
# exclude test files
file(GLOB tmp hip/*_test.cc)
exclude(Caffe2_HIP_SRCS "${Caffe2_HIP_SRCS}" ${tmp})
# ---[ CPU files.
file(GLOB tmp *.cc)
# Manually remove the cudnn files since we might be using USE_CUDNN=OFF
# TODO: when we move to explicit file list, this would not be needed.
file(GLOB tmp_cudnn *_cudnn.cc)
exclude(tmp "${tmp}" ${tmp_cudnn})
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp})
# exclude test files and gpu files
file(GLOB tmp *_test.cc)
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS})
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_HIP_SRCS})
# ---[ GPU test files
file(GLOB tmp *_gpu_test.cc)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} ${tmp})
# ---[ HIP test files
file(GLOB tmp hip/*_test.cc)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} ${tmp})
# ---[ CPU test files
file(GLOB tmp *_test.cc)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp})
exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_GPU_TEST_SRCS})
exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_HIP_TEST_SRCS})
# ---[ Send the lists to the parent scope.
list(APPEND Caffe2_CPU_SRCS
"${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)

View File

@ -1,5 +1,6 @@
#include "caffe2/core/blob_serialization.h"
#include <limits>
#include <mutex>
#include <sstream>
#include <utility>
@ -83,8 +84,7 @@ Range<T*> GetMutableTensorDataRange(
size_t start,
size_t numElements) {
CAFFE_ENFORCE(
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
start + numElements <= tensor.numel(),
static_cast<int64_t>(start + numElements) <= tensor.numel(),
"Requested invalid mutable tensor range [",
start,
", ",
@ -100,8 +100,7 @@ c10::ArrayRef<T> GetTensorDataRange(
size_t start,
size_t numElements) {
CAFFE_ENFORCE(
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
start + numElements <= tensor.numel(),
static_cast<int64_t>(start + numElements) <= tensor.numel(),
"Requested invalid tensor range [",
start,
", ",
@ -390,8 +389,7 @@ void TensorSerializer::SerializeWithOptions(
// Poorman's IOBound ThreadPool
SimpleQueue<size_t> chunkQueue;
auto task = [&]() {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
size_t chunkStart;
size_t chunkStart = std::numeric_limits<size_t>::max();
while (chunkQueue.Pop(&chunkStart)) {
processChunk(chunkStart);
}
@ -409,8 +407,7 @@ void TensorSerializer::SerializeWithOptions(
VLOG(1) << "Serializing blob " << name;
// Serialize whole vector. If vector is empty, its shape still needs to be
// serialized in empty proto
for (size_t chunkBegin = 0;
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
for (int64_t chunkBegin = 0;
chunkBegin < std::max(tensor.numel(), static_cast<int64_t>(1));
chunkBegin += chunk_size) {
VLOG(2) << "Starting a chunk at " << chunkBegin;
@ -582,8 +579,7 @@ void SerializeTensorData(const SerializeParams<float>& params) {
BlobSerializationOptions_FloatFormat_FLOAT_BFLOAT16) {
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
std::unique_ptr<float[]> tmp_buffer;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
const float* src;
const float* src = nullptr;
if (params.context.device() == CPU) {
src = params.input.data();
} else {
@ -653,14 +649,12 @@ void TensorSerializer::Serialize(
size_t chunkBegin,
int32_t chunkSize) {
CAFFE_ENFORCE(
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
chunkBegin <= input.numel(),
static_cast<int64_t>(chunkBegin) <= input.numel(),
"Chunk begin is out of tensor: ",
chunkBegin,
' ',
input.numel());
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
if (chunkBegin + chunkSize > input.numel()) {
if (static_cast<int64_t>(chunkBegin + chunkSize) > input.numel()) {
chunkSize = input.numel() - chunkBegin;
}
@ -1029,8 +1023,7 @@ DESERIALIZE_IMPL(float, FMT_BFLOAT16) {
params.tensor_proto.raw_data().data());
// If we are on a big-endian machine, byte-swap the serialized data.
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
const fbgemm::bfloat16* src;
const fbgemm::bfloat16* src = nullptr;
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
std::unique_ptr<fbgemm::bfloat16[]> bswap_buffer;
if (kIsLittleEndian) {
@ -1045,8 +1038,7 @@ DESERIALIZE_IMPL(float, FMT_BFLOAT16) {
// bfloat16 to float conversion.
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
std::unique_ptr<float[]> tmp_buffer;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
float* dest;
float* dest = nullptr;
if (params.context.device() == CPU) {
dest = params.dest.data();
} else {

View File

@ -1,8 +1,4 @@
if(BUILD_CAFFE2)
file(GLOB Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.proto")
else()
set(Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/torch.proto;${CMAKE_CURRENT_SOURCE_DIR}/caffe2.proto")
endif()
set(Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/torch.proto;${CMAKE_CURRENT_SOURCE_DIR}/caffe2.proto")
caffe2_protobuf_generate_cpp_py(Caffe2_PROTO_SRCS Caffe2_PROTO_HEADERS Caffe2_PROTO_PY ${Caffe2_PROTOBUF_FILES})

View File

@ -14,8 +14,7 @@ import warnings
try:
from caffe2.proto import caffe2_pb2, metanet_pb2, torch_pb2
except ImportError:
warnings.warn('Caffe2 support is not enabled in this PyTorch build. '
'Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.')
warnings.warn('Caffe2 support is no longer present in PyTorch.')
raise
try:

View File

@ -6,8 +6,7 @@ import warnings
try:
from caffe2.proto import caffe2_pb2
except ImportError:
warnings.warn('Caffe2 support is not enabled in this PyTorch build. '
'Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.')
warnings.warn('Caffe2 support is no longer present in PyTorch.')
raise
# TODO: refactor & remove the following alias

View File

@ -1,100 +1,18 @@
if(NOT BUILD_CAFFE2 OR INTERN_BUILD_MOBILE)
list(APPEND Caffe2_CPU_SRCS
utils/string_utils.cc
utils/threadpool/ThreadPool.cc
)
if(USE_PTHREADPOOL AND NOT USE_INTERNAL_PTHREADPOOL_IMPL)
list(APPEND Caffe2_CPU_SRCS
utils/threadpool/pthreadpool-cpp.cc
utils/threadpool/thread_pool_guard.cpp
)
endif()
if(NOT BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
list(APPEND Caffe2_CPU_SRCS
utils/proto_wrap.cc
)
endif()
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
return()
endif()
list(APPEND Caffe2_CPU_SRCS
utils/bench_utils.cc
utils/cpuid.cc
utils/math/broadcast.cc
utils/math/elementwise.cc
utils/math/reduce.cc
utils/math/transpose.cc
utils/math/utils.cc
utils/math_cpu.cc
utils/murmur_hash3.cc
utils/proto_utils.cc
utils/proto_wrap.cc
utils/string_utils.cc
utils/threadpool/ThreadPool.cc
utils/signal_handler.cc
utils/smart_tensor_printer.cc
utils/string_utils.cc)
)
if(USE_PTHREADPOOL)
if(USE_PTHREADPOOL AND NOT USE_INTERNAL_PTHREADPOOL_IMPL)
list(APPEND Caffe2_CPU_SRCS
utils/threadpool/pthreadpool-cpp.cc
utils/threadpool/thread_pool_guard.cpp)
if(USE_INTERNAL_PTHREADPOOL_IMPL)
list(APPEND Caffe2_CPU_SRCS
utils/threadpool/pthreadpool.cc
utils/threadpool/pthreadpool_impl.cc)
endif()
utils/threadpool/thread_pool_guard.cpp
)
endif()
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS}
utils/math/broadcast.cu
utils/math/elementwise.cu
utils/math/reduce.cu
utils/math/transpose.cu
utils/math_gpu.cu
)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS}
utils/math/hip/broadcast.hip
utils/math/hip/elementwise.hip
utils/math/hip/reduce.hip
utils/math/hip/transpose.hip
utils/hip/math_gpu.hip
)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS}
utils/fixed_divisor_test.cc
utils/math_test.cc
utils/fatal_signal_asan_no_sig_test.cc
utils/simple_queue_test.cc
utils/proto_utils_test.cc
utils/smart_tensor_printer_test.cc
utils/cast_test.cc
)
if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS}
utils/cpuid_test.cc
)
if(NOT INTERN_BUILD_MOBILE)
list(APPEND Caffe2_CPU_SRCS
utils/proto_wrap.cc
)
endif()
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS}
utils/math_gpu_test.cc
)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS}
utils/hip/math_gpu_test.cc
utils/hip/math_blas_gpu_test.cc
)
# TODO Once all source files are defined inside the local c10_utils_xxx targets,
# it should be the job of the parent CMakeLists.txt to decide what to do with the target (i.e. link it to caffe2)
# instead of us locally adding it to Caffe2_xxx variables.
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)

View File

@ -1685,9 +1685,6 @@ if(NOT INTERN_BUILD_MOBILE)
if(MKLDNN_FOUND)
set(AT_MKLDNN_ENABLED 1)
include_directories(AFTER SYSTEM ${MKLDNN_INCLUDE_DIR})
if(BUILD_CAFFE2_OPS)
list(APPEND Caffe2_DEPENDENCY_LIBS caffe2::mkldnn)
endif(BUILD_CAFFE2_OPS)
else()
message(WARNING "MKLDNN could not be found.")
caffe2_update_option(USE_MKLDNN OFF)

View File

@ -23,8 +23,6 @@ function(caffe2_print_configuration_summary)
message(STATUS "")
message(STATUS " TORCH_VERSION : ${TORCH_VERSION}")
message(STATUS " BUILD_CAFFE2 : ${BUILD_CAFFE2}")
message(STATUS " BUILD_CAFFE2_OPS : ${BUILD_CAFFE2_OPS}")
message(STATUS " BUILD_STATIC_RUNTIME_BENCHMARK: ${BUILD_STATIC_RUNTIME_BENCHMARK}")
message(STATUS " BUILD_BINARY : ${BUILD_BINARY}")
message(STATUS " BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}")

View File

@ -80,9 +80,6 @@ else()
# shared library.
# TODO: this list might be incomplete.
append_torchlib_if_found(c10)
if(@BUILD_CAFFE2@)
append_torchlib_if_found(Caffe2_perfkernels_avx512 Caffe2_perfkernels_avx2 Caffe2_perfkernels_avx)
endif()
if(@USE_NNPACK@)
append_torchlib_if_found(nnpack)

View File

@ -1,7 +0,0 @@
project(modules CXX C)
add_subdirectory(detectron)
add_subdirectory(module_test)
add_subdirectory(observers)
# Finally, set Caffe2_MODULES to parent scope.
set(Caffe2_MODULES ${Caffe2_MODULES} PARENT_SCOPE)

View File

@ -1,57 +0,0 @@
file(GLOB Detectron_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc)
file(GLOB Detectron_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
file(GLOB_RECURSE Detectron_HIP_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.hip)
if(BUILD_CAFFE2_OPS)
# Note(ilijar): Since Detectron ops currently have no
# CPU implementation, we only build GPU ops for now.
if(USE_CUDA)
add_library(
caffe2_detectron_ops_gpu SHARED
${Detectron_CPU_SRCS}
${Detectron_GPU_SRCS})
target_link_libraries(caffe2_detectron_ops_gpu PRIVATE torch)
if(USE_OPENMP)
target_link_libraries(caffe2_detectron_ops_gpu PRIVATE caffe2::openmp)
endif()
if(USE_MKLDNN)
target_link_libraries(caffe2_detectron_ops_gpu PRIVATE caffe2::mkldnn)
endif()
install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib)
if(MSVC)
install(FILES $<TARGET_PDB_FILE:caffe2_detectron_ops_gpu> DESTINATION lib OPTIONAL)
endif()
elseif(USE_ROCM)
hip_include_directories(${Caffe2_HIP_INCLUDES})
set_source_files_properties(${Detectron_HIP_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
HIP_ADD_LIBRARY(
caffe2_detectron_ops_hip SHARED
${Detectron_CPU_SRCS}
${Detectron_HIP_SRCS})
target_compile_options(caffe2_detectron_ops_hip PRIVATE ${HIP_CXX_FLAGS})
if(USE_MKLDNN)
target_link_libraries(caffe2_detectron_ops_hip PRIVATE caffe2::mkldnn)
endif()
target_link_libraries(caffe2_detectron_ops_hip PRIVATE torch)
install(TARGETS caffe2_detectron_ops_hip DESTINATION lib)
elseif(NOT IOS_PLATFORM)
add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS})
if(HAVE_SOVERSION)
set_target_properties(caffe2_detectron_ops PROPERTIES
VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
endif()
target_link_libraries(caffe2_detectron_ops PRIVATE torch)
if(USE_OPENMP)
target_link_libraries(caffe2_detectron_ops PRIVATE caffe2::openmp)
endif()
if(USE_MKLDNN)
target_link_libraries(caffe2_detectron_ops PRIVATE caffe2::mkldnn)
endif()
install(TARGETS caffe2_detectron_ops DESTINATION lib)
if(MSVC)
install(FILES $<TARGET_PDB_FILE:caffe2_detectron_ops> DESTINATION lib OPTIONAL)
endif()
endif()
endif()

View File

@ -1,83 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "modules/detectron/group_spatial_softmax_op.h"
#include "caffe2/operators/softmax_utils.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(
GroupSpatialSoftmax,
GroupSpatialSoftmaxOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
GroupSpatialSoftmaxGradient,
GroupSpatialSoftmaxGradientOp<float, CPUContext>);
OPERATOR_SCHEMA(GroupSpatialSoftmax)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
RetinaNet specific form of spatial softmax.
The input is assumed to be unnormalized scores (sometimes called 'logits')
arranged in a 4D tensor with shape (N, C, H, W), where N is the number of
elements in the batch, H and W are the height and width, and C = num_anchors *
num_classes defines num_anchors 'groups' of softmax inputs, each of length
num_classes. The softmax is applied to each group independently.
See: https://arxiv.org/abs/1708.02002 for details.
)DOC")
.Arg(
"num_classes",
"(int) default 81; number of classes in each softmax group.")
.Input(
0,
"scores",
"4D tensor of softmax inputs (called 'scores' or 'logits') with shape "
"(N, C, H, W), where C = num_anchors * num_classes defines num_anchors "
"groups of contiguous num_classes softmax inputs.")
.Output(
0,
"probabilities",
"4D tensor of softmax probabilities with shape (N, C, H, W), where "
"C = num_anchors * num_classes, and softmax was applied to each of the "
"num_anchors groups; within a group the num_classes values sum to 1.");
OPERATOR_SCHEMA(GroupSpatialSoftmaxGradient)
.NumInputs(2)
.NumOutputs(1)
.Input(0, "scores", "See GroupSpatialSoftmax")
.Input(
1,
"d_probabilities",
"Gradient of forward output 0 (probabilities).")
.Output(0, "d_scores", "Gradient of forward input 0 (scores).");
class GetGroupSpatialSoftmaxGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
"GroupSpatialSoftmaxGradient",
"",
vector<string>{O(0), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(GroupSpatialSoftmax, GetGroupSpatialSoftmaxGradient);
} // namespace caffe2

View File

@ -1,181 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/group_spatial_softmax_op.h"
namespace caffe2 {
namespace {
__global__ void GroupSpatialSoftmaxKernel(const int num, const int A, const int W,
const int H, const float* Xdata, float* Pdata, const int num_classes) {
// Loop through labels (N x A x H x W)
CUDA_1D_KERNEL_LOOP(index, num * A * H * W) {
int D = num_classes * A;
int x = index % W;
int y = (index / W) % H;
int a = (index / (W * H)) % A;
int i = index / W / H / A;
// Subtract max on each cell for numerical reasons
float max_val = -FLT_MAX;
for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
max_val = max(max_val, Xdata[idx]);
}
// Exponentiate
float expsum = 0.0f;
for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
float expx = exp(Xdata[idx] - max_val);
Pdata[idx] = expx;
expsum += expx;
}
// Normalize
for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
Pdata[idx] /= expsum;
}
}
}
__global__ void SumProbsKernel(const int N, const int A, const int W,
const int H, const float* Ydata, const float* dYdata,
float* sum_probs_data, const int num_classes) {
CUDA_1D_KERNEL_LOOP(i, N * A * W * H) {
int D = num_classes * A;
int x = i % W;
int y = (i / W) % H;
int a = (i / (W * H)) % A;
int n = i / (W * H * A);
sum_probs_data[i] = 0.0;
for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
int idx = n * (H * W * D) + c * (H * W) + y * W + x;
sum_probs_data[i] += (Ydata[idx] * dYdata[idx]);
}
}
}
__global__ void SubSumKernel(
const int N, const int A, const int W, const int H,
const float* sum_probs_data, float* dXdata, const int num_classes) {
CUDA_1D_KERNEL_LOOP(i, N * (A * num_classes) * W * H) {
int D = num_classes * A;
int x = i % W;
int y = (i / W) % H;
int a = ((i / (W * H)) % D) / num_classes;
int n = i / W / H / D;
int idx = n * (H * W * A) + a * (H * W) + y * W + x;
dXdata[i] = (dXdata[i] - sum_probs_data[idx]);
}
}
} // namespace
template <>
bool GroupSpatialSoftmaxOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0); // Logits
int N = X.dim32(0);
int D = X.dim32(1);
int H = X.dim32(2);
int W = X.dim32(3);
int A = D / num_classes_;
auto* P = Output(0, X.sizes(), at::dtype<float>()); // Probabilities from softmax
TORCH_DCHECK_EQ(X.ndim(), 4);
const float* Xdata = X.data<float>();
float* Pdata = P->mutable_data<float>();
// Softmax for each x,y location
GroupSpatialSoftmaxKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
N, A, W, H, Xdata, Pdata, num_classes_);
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
template<>
bool GroupSpatialSoftmaxGradientOp<float, CUDAContext>::RunOnDevice() {
auto& Y = Input(0); // Probabilities from softmax
auto& dY = Input(1);
TORCH_DCHECK_EQ(Y.ndim(), 4);
int N = Y.dim32(0);
int D = Y.dim32(1);
int H = Y.dim32(2);
int W = Y.dim32(3);
int A = D / num_classes_;
auto* dX = Output(0, Y.sizes(), at::dtype<float>());
if (sum_probs_.size() != N * A * H * W) {
ReinitializeTensor(&sum_probs_, {N * A * H * W}, at::dtype<float>().device(CUDA));
}
const float* Ydata = Y.data<float>();
const float* dYdata = dY.data<float>();
float* dXdata = dX->mutable_data<float>();
float* sum_probs_data = sum_probs_.mutable_data<float>();
math::Set<float, CUDAContext>(
sum_probs_.size(), 0.0f, sum_probs_data, &context_);
// Complete math:
// J_ij = h_i (delta_ij - h_j)
// d x_i = sum_j d h_ij = sum_j J_ij * dy_j
// = sum_j h_i (delta_ij - h_j) * dy_j
// = h_i dy_i - (sum_j h_i h_j dy_j)
// = h_i dy_i - h_i sum_j h_j dy_j
// Step 0: dx = dy
context_.Copy<float, CUDAContext, CUDAContext>(Y.size(), dYdata, dXdata);
// Step 1: s = Sum(dY[j] * Y[j])
SumProbsKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS, 0,
context_.cuda_stream()>>>(
N, A, W, H, Ydata, dYdata, sum_probs_data, num_classes_);
C10_CUDA_KERNEL_LAUNCH_CHECK();
// Step 2: dX[i] = dX[i] - s
SubSumKernel<<<CAFFE_GET_BLOCKS(Y.size()), CAFFE_CUDA_NUM_THREADS, 0,
context_.cuda_stream()>>>(
N, A, W, H, sum_probs_.data<float>(), dXdata, num_classes_);
C10_CUDA_KERNEL_LAUNCH_CHECK();
// Step 3: dX[i] = Y[i] * dX[i]
math::Mul<float, CUDAContext>(Y.size(), dXdata, Ydata, dXdata, &context_);
return true;
}
REGISTER_CUDA_OPERATOR(GroupSpatialSoftmax,
GroupSpatialSoftmaxOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(GroupSpatialSoftmaxGradient,
GroupSpatialSoftmaxGradientOp<float, CUDAContext>);
} // namespace caffe2

View File

@ -1,76 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef GROUP_SPATIAL_SOFTMAX_OP_H_
#define GROUP_SPATIAL_SOFTMAX_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
template <typename T, class Context>
class GroupSpatialSoftmaxOp final : public Operator<Context> {
public:
GroupSpatialSoftmaxOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
num_classes_(this->template GetSingleArgument<int>("num_classes", 81)),
order_(StringToStorageOrder(
this->template GetSingleArgument<string>("order", "NCHW"))) {
CAFFE_ENFORCE_EQ(
order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override {
// No CPU implementation for now
CAFFE_NOT_IMPLEMENTED;
}
protected:
int num_classes_;
StorageOrder order_;
};
template <typename T, class Context>
class GroupSpatialSoftmaxGradientOp final : public Operator<Context> {
public:
GroupSpatialSoftmaxGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
num_classes_(this->template GetSingleArgument<int>("num_classes", 81)),
order_(StringToStorageOrder(
this->template GetSingleArgument<string>("order", "NCHW"))) {
CAFFE_ENFORCE_EQ(
order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override {
// No CPU implementation for now
CAFFE_NOT_IMPLEMENTED;
}
protected:
int num_classes_;
StorageOrder order_;
Tensor sum_probs_;
};
} // namespace caffe2
#endif // GROUP_SPATIAL_SOFTMAX_OP_H_

View File

@ -1,106 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "ps_roi_pool_op.h"
namespace caffe2 {
// CPU registrations attach the schemas below; note that the CPU
// RunOnDevice of both ops is CAFFE_NOT_IMPLEMENTED (see ps_roi_pool_op.h),
// so only the CUDA build actually executes.
REGISTER_CPU_OPERATOR(PSRoIPool, PSRoIPoolOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    PSRoIPoolGradient,
    PSRoIPoolGradientOp<float, CPUContext>);
// Forward schema: (X, RoIs) -> (Y, argmaxes). "argmaxes" here records the
// position-sensitive channel index per output element (see the CUDA kernel's
// mapping_channel), consumed by the gradient op.
OPERATOR_SCHEMA(PSRoIPool)
    .NumInputs(2)
    .NumOutputs(2)
    .SetDoc(R"DOC(
Position Sensitive Region of Interest Pooling as used in R-FCN.
)DOC")
    .Arg(
        "spatial_scale",
        "(float) default 1.0; Spatial scale of the input feature map X "
        "relative to the input image. E.g., 0.0625 if X has a stride of 16 "
        "w.r.t. the input image.")
    .Arg(
        "group_size",
        "(int) default 1; pooled_h = pooled_w = group_size where pooled_{h,w} "
        "is the pooled output Y's height and width, respectively.")
    .Arg(
        "output_dim",
        "(int) default 1; number of channels in the pooled output, which might "
        "be the number of classes is used for classification or 4 if used for "
        "class agnostic bounding box regression.")
    .Input(
        0,
        "X",
        "4D position sensitive feature map input of shape (N, C, H, W), where "
        "C = group_size**2 * output_dim.")
    .Input(
        1,
        "RoIs",
        "2D input of shape (R, 5) specifying R RoIs with five columns "
        "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI "
        "coordinates are in the coordinate system of the input image.")
    .Output(
        0,
        "Y",
        "4D output of shape (R, output_dim, pooled_h, pooled_w). The r-th "
        "batch element is a pooled feature map cooresponding to the r-th RoI.")
    .Output(
        1,
        "argmaxes",
        "4D output of shape (R, output_dim, pooled_h, pooled_w). Same as Y, "
        "except it records the argmax indices rather than the max pooled "
        "values.");
// Gradient schema: (X, RoIs, argmaxes, dY) -> dX. X is needed only for its
// shape; argmaxes routes each output gradient back to its source channel.
OPERATOR_SCHEMA(PSRoIPoolGradient)
    .NumInputs(4)
    .NumOutputs(1)
    .Input(
        0,
        "X",
        "See PSRoIPool.")
    .Input(
        1,
        "RoIs",
        "See PSRoIPool.")
    .Input(
        2,
        "argmaxes",
        "See PSRoIPool.")
    .Input(
        3,
        "dY",
        "Gradient of forward output 0 (Y)")
    .Output(
        0,
        "dX",
        "Gradient of forward input 0 (X)");
// Gradient maker for PSRoIPool: the backward op consumes the forward op's
// data input, the RoIs, the recorded channel mapping (forward output 1),
// and the incoming output gradient; it produces the gradient w.r.t. X.
class GetPSRoIPoolGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    vector<string> grad_inputs{I(0), I(1), O(1), GO(0)};
    vector<string> grad_outputs{GI(0)};
    return SingleGradientDef(
        "PSRoIPoolGradient", "", grad_inputs, grad_outputs);
  }
};
REGISTER_GRADIENT(PSRoIPool, GetPSRoIPoolGradient);
} // namespace caffe2

View File

@ -1,289 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Based on https://github.com/daijifeng001/caffe-rfcn/blob/r-fcn/src/caffe/layers/psroi_pooling_layer.cu
//
// ------------------------------------------------------------------
// R-FCN
// Copyright (c) 2016 Microsoft
// Licensed under The MIT License [see r-fcn/LICENSE for details]
// Written by Yi Li
// ------------------------------------------------------------------
//
// COPYRIGHT
//
// All contributions by the University of California:
// Copyright (c) 2014, 2015, The Regents of the University of California
// (Regents)
// All rights reserved.
//
// All other contributions:
// Copyright (c) 2014, 2015, the respective contributors
// All rights reserved.
//
// Caffe uses a shared copyright model: each contributor holds copyright over
// their contributions to Caffe. The project versioning records all such
// contribution and copyright details. If a contributor wants to further mark
// their specific copyright on a particular contribution, they should indicate
// their copyright solely in the commit message of the change when it is
// committed.
//
// LICENSE
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// CONTRIBUTION AGREEMENT
//
// By contributing to the BVLC/caffe repository through pull-request, comment,
// or otherwise, the contributor releases their content to the
// license and copyright terms herein.
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/ps_roi_pool_op.h"
namespace caffe2 {
namespace {
// Declaration-only primary template: only explicit specializations link.
template <typename T>
inline __device__ T gpu_atomic_add(const T val, T* address);
// float specialization: thin wrapper over CUDA's hardware atomicAdd.
template <>
inline __device__
float gpu_atomic_add(const float val, float* address) {
  return atomicAdd(address, val);
}
// Position-sensitive RoI pooling, forward pass. One thread per output
// element (n, ctop, ph, pw): each thread averages bottom_data over its
// pooling bin, reading from the position-sensitive input channel
// c = (ctop * group_size + ph) * group_size + pw, and records c in
// mapping_channel so the backward pass can route gradients.
// bottom_rois rows are (batch_index, x1, y1, x2, y2) in image coordinates.
template <typename T>
__global__ void PSRoIPoolForward(
    const int nthreads,
    const T* bottom_data,
    const T spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const T* bottom_rois,
    const int output_dim,
    const int group_size,
    T* top_data,
    int* mapping_channel) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // The output is in order (n, ctop, ph, pw)
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int ctop = (index / pooled_width / pooled_height) % output_dim;
    int n = index / pooled_width / pooled_height / output_dim;
    // [start, end) interval for spatial sampling; RoI coords are rounded to
    // integers, then scaled from image space to feature-map space.
    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    T roi_start_w = static_cast<T>(
        roundf(offset_bottom_rois[1])) * spatial_scale;
    T roi_start_h = static_cast<T>(
        roundf(offset_bottom_rois[2])) * spatial_scale;
    T roi_end_w = static_cast<T>(
        roundf(offset_bottom_rois[3]) + 1.) * spatial_scale;
    T roi_end_h = static_cast<T>(
        roundf(offset_bottom_rois[4]) + 1.) * spatial_scale;
    // Force too small ROIs to be 1x1
    T roi_width = c10::cuda::compat::max(roi_end_w - roi_start_w, static_cast<T>(0.1)); // avoid 0
    T roi_height = c10::cuda::compat::max(roi_end_h - roi_start_h, static_cast<T>(0.1));
    // Compute w and h at bottom
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);
    // Add roi offsets and clip to input boundaries
    int hstart = floor(
        static_cast<T>(ph) * bin_size_h + roi_start_h);
    int wstart = floor(
        static_cast<T>(pw)* bin_size_w + roi_start_w);
    int hend = ceil(
        static_cast<T>(ph + 1) * bin_size_h + roi_start_h);
    int wend = ceil(
        static_cast<T>(pw + 1) * bin_size_w + roi_start_w);
    hstart = min(max(hstart, 0), height);
    hend = min(max(hend, 0), height);
    wstart = min(max(wstart, 0),width);
    wend = min(max(wend, 0), width);
    bool is_empty = (hend <= hstart) || (wend <= wstart);
    // Position-sensitive channel selection: bin (gh, gw) of class ctop reads
    // its own dedicated input channel.
    int gw = pw;
    int gh = ph;
    int c = (ctop * group_size + gh) * group_size + gw;
    const T* offset_bottom_data =
        bottom_data + (roi_batch_ind * channels + c) * height * width;
    // Average-pool the bin.
    T out_sum = 0;
    for (int h = hstart; h < hend; ++h){
      for (int w = wstart; w < wend; ++w){
        int bottom_index = h*width + w;
        out_sum += offset_bottom_data[bottom_index];
      }
    }
    T bin_area = (hend - hstart) * (wend - wstart);
    // Empty bins yield 0 (bin_area would be <= 0 there).
    top_data[index] = is_empty ? 0. : out_sum / bin_area;
    mapping_channel[index] = c;
  }
}
// Position-sensitive RoI pooling, backward pass. One thread per top (output)
// element: each thread recomputes its forward pooling bin and scatters
// top_diff[index] / bin_area uniformly over the bin into bottom_diff, on the
// channel recorded by the forward pass (mapping_channel). Bins of different
// RoIs can overlap in bottom_diff, hence the atomic adds.
// NOTE(review): num_rois is accepted but unused in this kernel body.
template <typename T>
__global__ void PSRoIPoolBackward(
    const int nthreads,
    const T* top_diff,
    const int* mapping_channel,
    const int num_rois,
    const T spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int output_dim,
    T* bottom_diff,
    const T* bottom_rois) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // The output is in order (n, ctop, ph, pw)
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int n = index / pooled_width / pooled_height / output_dim;
    // [start, end) interval for spatial sampling -- must match the forward
    // kernel's bin computation exactly.
    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    T roi_start_w = static_cast<T>(
        roundf(offset_bottom_rois[1])) * spatial_scale;
    T roi_start_h = static_cast<T>(
        roundf(offset_bottom_rois[2])) * spatial_scale;
    T roi_end_w = static_cast<T>(
        roundf(offset_bottom_rois[3]) + 1.) * spatial_scale;
    T roi_end_h = static_cast<T>(
        roundf(offset_bottom_rois[4]) + 1.) * spatial_scale;
    // Force too small ROIs to be 1x1
    T roi_width = c10::cuda::compat::max(roi_end_w - roi_start_w, static_cast<T>(0.1)); //avoid 0
    T roi_height = c10::cuda::compat::max(roi_end_h - roi_start_h, static_cast<T>(0.1));
    // Compute w and h at bottom
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);
    int hstart = floor(
        static_cast<T>(ph)* bin_size_h + roi_start_h);
    int wstart = floor(
        static_cast<T>(pw)* bin_size_w + roi_start_w);
    int hend = ceil(
        static_cast<T>(ph + 1) * bin_size_h + roi_start_h);
    int wend = ceil(
        static_cast<T>(pw + 1) * bin_size_w + roi_start_w);
    // Add roi offsets and clip to input boundaries
    hstart = min(max(hstart, 0), height);
    hend = min(max(hend, 0), height);
    wstart = min(max(wstart, 0), width);
    wend = min(max(wend, 0), width);
    bool is_empty = (hend <= hstart) || (wend <= wstart);
    // Compute c at bottom: the channel the forward pass pooled from.
    int c = mapping_channel[index];
    T* offset_bottom_diff =
        bottom_diff + (roi_batch_ind * channels + c) * height * width;
    // Gradient of an average: distribute evenly over the bin.
    T bin_area = (hend - hstart) * (wend - wstart);
    T diff_val = is_empty ? 0. : top_diff[index] / bin_area;
    for (int h = hstart; h < hend; ++h){
      for (int w = wstart; w < wend; ++w){
        int bottom_index = h * width + w;
        gpu_atomic_add(diff_val, offset_bottom_diff + bottom_index);
      }
    }
  }
}
} // namespace
// CUDA forward: allocates Y of shape (R, output_dim, pooled_h, pooled_w) and
// the matching int channel map A, then launches one thread per Y element.
// NOTE(review): unlike RoIPoolF below, an empty RoIs input (R.dim32(0) == 0)
// is not special-cased here -- confirm callers never pass empty RoIs.
template<>
bool PSRoIPoolOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Input data to pool
  auto& R = Input(1); // RoIs
  auto* Y = Output(0, {R.dim32(0), output_dim_, pooled_height_, pooled_width_}, at::dtype<float>()); // PSRoI pooled data
  auto* A = Output(1, Y->sizes(), at::dtype<int>()); // mapping_channel
  int output_size = Y->numel();
  PSRoIPoolForward<float><<<CAFFE_GET_BLOCKS(output_size),
                            CAFFE_CUDA_NUM_THREADS,
                            0, context_.cuda_stream()>>>(
      output_size, X.data<float>(), spatial_scale_, X.dim32(1), X.dim32(2),
      X.dim32(3), pooled_height_, pooled_width_, R.data<float>(), output_dim_,
      group_size_, Y->mutable_data<float>(), A->mutable_data<int>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
// CUDA backward: zero-fills dX, then launches one thread per dY element to
// scatter gradients back through the recorded channel mapping (atomic adds
// accumulate overlapping RoI bins).
template<>
bool PSRoIPoolGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X  = Input(0);  // Input data to pool
  auto& R  = Input(1);  // RoIs
  auto& A  = Input(2);  // mapping channels
  auto& dY = Input(3);  // Gradient of net w.r.t. output of "forward" op
                        // (aka "gradOutput")
  auto* dX = Output(0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to "forward" op
                                                       // (aka "gradInput")
  // Must zero-out dX before accumulating gradients: the kernel only adds.
  math::Set<float, CUDAContext>(
      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
  PSRoIPoolBackward<float><<<CAFFE_GET_BLOCKS(dY.size()),
                             CAFFE_CUDA_NUM_THREADS,
                             0, context_.cuda_stream()>>>(
      dY.size(), dY.data<float>(), A.data<int>(), R.dim32(0), spatial_scale_,
      X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_,
      output_dim_, dX->mutable_data<float>(), R.data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
REGISTER_CUDA_OPERATOR(PSRoIPool,
                       PSRoIPoolOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(PSRoIPoolGradient,
                       PSRoIPoolGradientOp<float, CUDAContext>);
} // namespace caffe2

View File

@ -1,93 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef PS_ROI_POOL_OP_H_
#define PS_ROI_POOL_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Position Sensitive RoI Pooling (R-FCN). Pools a (group_size x group_size)
// grid per RoI, each bin from its own dedicated input channel. CPU execution
// is unimplemented; the CUDA specialization performs the work.
template <typename T, class Context>
class PSRoIPoolOp final : public Operator<Context> {
 public:
  PSRoIPoolOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        // Feature map stride relative to the image, e.g. 1/16 = 0.0625.
        spatial_scale_(this->template GetSingleArgument<float>(
            "spatial_scale", 1.)),
        group_size_(this->template GetSingleArgument<int>("group_size", 1)),
        output_dim_(this->template GetSingleArgument<int>("output_dim", 1)) {
    TORCH_DCHECK_GT(spatial_scale_, 0);
    TORCH_DCHECK_GT(group_size_, 0);
    // The pooled output is always square: group_size x group_size.
    pooled_height_ = group_size_;
    pooled_width_ = group_size_;
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }
 protected:
  float spatial_scale_;
  int group_size_;
  int output_dim_;
  int pooled_height_;
  int pooled_width_;
  // NOTE(review): channels_/height_/width_ are never assigned in this header;
  // presumably vestigial -- confirm against the .cu implementation.
  int channels_;
  int height_;
  int width_;
};
// Gradient of PSRoIPool; takes the same arguments as the forward op so that
// the backward kernel can recompute identical pooling bins. CPU execution is
// unimplemented; see the CUDA specialization.
template <typename T, class Context>
class PSRoIPoolGradientOp final : public Operator<Context> {
 public:
  PSRoIPoolGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        spatial_scale_(this->template GetSingleArgument<float>(
            "spatial_scale", 1.)),
        group_size_(this->template GetSingleArgument<int>("group_size", 1)),
        output_dim_(this->template GetSingleArgument<int>("output_dim", 1)) {
    TORCH_DCHECK_GT(spatial_scale_, 0);
    TORCH_DCHECK_GT(group_size_, 0);
    // Pooled output is square: group_size x group_size (matches forward op).
    pooled_height_ = group_size_;
    pooled_width_ = group_size_;
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }
 protected:
  float spatial_scale_;
  int group_size_;
  int output_dim_;
  int pooled_height_;
  int pooled_width_;
  // NOTE(review): channels_/height_/width_ never assigned here; presumably
  // vestigial -- confirm against the .cu implementation.
  int channels_;
  int height_;
  int width_;
};
} // namespace caffe2
#endif // PS_ROI_POOL_OP_H_

View File

@ -1,99 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "roi_pool_f_op.h"
namespace caffe2 {
// CPU registrations attach the schemas; the CPU RunOnDevice of both ops is
// CAFFE_NOT_IMPLEMENTED (see roi_pool_f_op.h) -- CUDA-only in practice.
REGISTER_CPU_OPERATOR(RoIPoolF, RoIPoolFOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(RoIPoolFGradient, RoIPoolFGradientOp<float, CPUContext>);
// Forward schema: (X, RoIs) -> (Y, argmaxes); argmaxes feeds the gradient op.
OPERATOR_SCHEMA(RoIPoolF)
    .NumInputs(2)
    .NumOutputs(2)
    .SetDoc(R"DOC(
Region of Interest (RoI) pooling operation as used in Fast R-CNN.
)DOC")
    .Arg(
        "spatial_scale",
        "(float) default 1.0; Spatial scale of the input feature map X "
        "relative to the input image. E.g., 0.0625 if X has a stride of 16 "
        "w.r.t. the input image.")
    .Arg(
        "pooled_h",
        "(int) default 1; Pooled output Y's height.")
    .Arg(
        "pooled_w",
        "(int) default 1; Pooled output Y's width.")
    .Input(
        0,
        "X",
        "4D feature map input of shape (N, C, H, W).")
    .Input(
        1,
        "RoIs",
        "2D input of shape (R, 5) specifying R RoIs with five columns "
        "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI "
        "coordinates are in the coordinate system of the input image.")
    .Output(
        0,
        "Y",
        "4D output of shape (R, C, pooled_h, pooled_w). The r-th batch element "
        "is a pooled feature map cooresponding to the r-th RoI.")
    .Output(
        1,
        "argmaxes",
        "4D output of shape (R, C, pooled_h, pooled_w). Same as Y, except it "
        "records the argmax indices rather than the max pooled values.")
// Gradient schema: (X, RoIs, argmaxes, dY) -> dX.
;
OPERATOR_SCHEMA(RoIPoolFGradient)
    .NumInputs(4)
    .NumOutputs(1)
    .Input(
        0,
        "X",
        "See RoIPoolF.")
    .Input(
        1,
        "RoIs",
        "See RoIPoolF.")
    .Input(
        2,
        "argmaxes",
        "See RoIPoolF.")
    .Input(
        3,
        "dY",
        "Gradient of forward output 0 (Y)")
    .Output(
        0,
        "dX",
        "Gradient of forward input 0 (X)");
// Gradient maker for RoIPoolF: the backward op needs the forward data input,
// the RoIs, the argmax map (forward output 1), and the output gradient; it
// emits the gradient w.r.t. the data input only.
class GetRoIPoolFGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    vector<string> grad_inputs{I(0), I(1), O(1), GO(0)};
    vector<string> grad_outputs{GI(0)};
    return SingleGradientDef(
        "RoIPoolFGradient", "", grad_inputs, grad_outputs);
  }
};
REGISTER_GRADIENT(RoIPoolF, GetRoIPoolFGradient);
} // namespace caffe2

View File

@ -1,187 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/roi_pool_f_op.h"
namespace caffe2 {
namespace {
// Declaration-only primary template: only explicit specializations link.
template <typename T>
inline __device__ T gpu_atomic_add(const T val, T* address);
// float specialization: thin wrapper over CUDA's hardware atomicAdd.
template <>
inline __device__
float gpu_atomic_add(const float val, float* address) {
  return atomicAdd(address, val);
}
// Fast R-CNN RoI max pooling, forward pass. One thread per output element
// (n, c, ph, pw): each thread max-pools bottom_data over its bin and records
// the flat (h * width + w) index of the max in argmax_data; -1 marks an
// empty bin so the backward pass propagates nothing for it.
// bottom_rois rows are (batch_index, x1, y1, x2, y2) in image coordinates.
template <typename T>
__global__ void RoIPoolFForward(const int nthreads, const T* bottom_data,
    const T spatial_scale, const int channels, const int height,
    const int width, const int pooled_height, const int pooled_width,
    const T* bottom_rois, T* top_data, int* argmax_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    // Scale RoI from image to feature-map coordinates (rounded to ints).
    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    int roi_start_w = roundf(offset_bottom_rois[1] * spatial_scale);
    int roi_start_h = roundf(offset_bottom_rois[2] * spatial_scale);
    int roi_end_w = roundf(offset_bottom_rois[3] * spatial_scale);
    int roi_end_h = roundf(offset_bottom_rois[4] * spatial_scale);
    // Force malformed ROIs to be 1x1
    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
    T bin_size_h = static_cast<T>(roi_height)
                   / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width)
                   / static_cast<T>(pooled_width);
    int hstart = static_cast<int>(floor(static_cast<T>(ph)
                                        * bin_size_h));
    int wstart = static_cast<int>(floor(static_cast<T>(pw)
                                        * bin_size_w));
    int hend = static_cast<int>(ceil(static_cast<T>(ph + 1)
                                     * bin_size_h));
    int wend = static_cast<int>(ceil(static_cast<T>(pw + 1)
                                     * bin_size_w));
    // Add roi offsets and clip to input boundaries
    hstart = min(max(hstart + roi_start_h, 0), height);
    hend = min(max(hend + roi_start_h, 0), height);
    wstart = min(max(wstart + roi_start_w, 0), width);
    wend = min(max(wend + roi_start_w, 0), width);
    bool is_empty = (hend <= hstart) || (wend <= wstart);
    // Define an empty pooling region to be zero
    T maxval = is_empty ? 0 : -FLT_MAX;
    // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
    int maxidx = -1;
    const T* offset_bottom_data =
        bottom_data + (roi_batch_ind * channels + c) * height * width;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        int bottom_index = h * width + w;
        if (offset_bottom_data[bottom_index] > maxval) {
          maxval = offset_bottom_data[bottom_index];
          maxidx = bottom_index;
        }
      }
    }
    top_data[index] = maxval;
    argmax_data[index] = maxidx;
  }
}
// Fast R-CNN RoI max pooling, backward pass. One thread per top (output)
// element: the gradient flows only to the input position that won the max
// in the forward pass (argmax_data); -1 means an empty bin, so nothing is
// propagated. Atomic adds are needed because overlapping RoIs can select
// the same input position.
// NOTE(review): num_rois is accepted but unused in this kernel body.
template <typename T>
__global__ void RoIPoolFBackward(const int nthreads, const T* top_diff,
    const int* argmax_data, const int num_rois, const T spatial_scale,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, T* bottom_diff,
    const T* bottom_rois) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    int bottom_offset = (roi_batch_ind * channels + c) * height * width;
    int top_offset = (n * channels + c) * pooled_height * pooled_width;
    const T* offset_top_diff = top_diff + top_offset;
    T* offset_bottom_diff = bottom_diff + bottom_offset;
    const int* offset_argmax_data = argmax_data + top_offset;
    int argmax = offset_argmax_data[ph * pooled_width + pw];
    if (argmax != -1) {
      gpu_atomic_add(
          static_cast<T>(offset_top_diff[ph * pooled_width + pw]),
          offset_bottom_diff + argmax);
    }
  }
}
} // namespace
// CUDA forward: special-cases an empty RoI set (emits 0-row outputs without
// launching), otherwise allocates Y/argmaxes of shape
// (R, C, pooled_h, pooled_w) and launches one thread per output element.
template<>
bool RoIPoolFOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Input data to pool
  auto& R = Input(1); // RoIs
  if (R.size() == 0) {
    // Handle empty rois
    std::vector<int64_t> sizes = {0, X.dim32(1), pooled_height_, pooled_width_};
    /* auto* Y = */ Output(0, sizes, at::dtype<float>());
    /* auto* A = */ Output(1, sizes, at::dtype<int>());
    return true;
  }
  auto* Y = Output(0, {R.dim32(0), X.dim32(1), pooled_height_, pooled_width_}, at::dtype<float>()); // RoI pooled data
  auto* A = Output(1, Y->sizes(), at::dtype<int>()); // argmaxes
  int output_size = Y->size();
  RoIPoolFForward<float><<<CAFFE_GET_BLOCKS(output_size),
                           CAFFE_CUDA_NUM_THREADS,
                           0, context_.cuda_stream()>>>(
      output_size, X.data<float>(), spatial_scale_, X.dim32(1), X.dim32(2),
      X.dim32(3), pooled_height_, pooled_width_, R.data<float>(),
      Y->mutable_data<float>(), A->mutable_data<int>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
// CUDA backward: zero-fills dX (the kernel only accumulates), then routes
// each output gradient to its forward argmax position. Skips the launch
// entirely when dY is empty (no RoIs in the forward pass).
template<>
bool RoIPoolFGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X  = Input(0);  // Input data to pool
  auto& R  = Input(1);  // RoIs
  auto& A  = Input(2);  // argmaxes
  auto& dY = Input(3);  // Gradient of net w.r.t. output of "forward" op
                        // (aka "gradOutput")
  auto* dX = Output(0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to "forward" op
                                                       // (aka "gradInput")
  // Must zero-out dX before accumulating gradients
  math::Set<float, CUDAContext>(
      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
  if (dY.size() > 0) { // Handle possibly empty gradient if there were no rois
    RoIPoolFBackward<float><<<CAFFE_GET_BLOCKS(dY.size()),
                              CAFFE_CUDA_NUM_THREADS,
                              0, context_.cuda_stream()>>>(
        dY.size(), dY.data<float>(), A.data<int>(), R.dim32(0), spatial_scale_,
        X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_,
        dX->mutable_data<float>(), R.data<float>());
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
  return true;
}
REGISTER_CUDA_OPERATOR(RoIPoolF,
                       RoIPoolFOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(RoIPoolFGradient,
                       RoIPoolFGradientOp<float, CUDAContext>);
} // namespace caffe2

View File

@ -1,81 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef ROI_POOL_F_OP_H_
#define ROI_POOL_F_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Fast R-CNN RoI max pooling. CPU execution is unimplemented; the CUDA
// specialization performs the work. Arguments: spatial_scale (>0),
// pooled_h (>0), pooled_w (>0).
template <typename T, class Context>
class RoIPoolFOp final : public Operator<Context> {
 public:
  RoIPoolFOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        // Feature map stride relative to the image, e.g. 1/16 = 0.0625.
        spatial_scale_(this->template GetSingleArgument<float>(
            "spatial_scale", 1.)),
        pooled_height_(this->template GetSingleArgument<int>("pooled_h", 1)),
        pooled_width_(this->template GetSingleArgument<int>("pooled_w", 1)) {
    TORCH_DCHECK_GT(spatial_scale_, 0);
    TORCH_DCHECK_GT(pooled_height_, 0);
    TORCH_DCHECK_GT(pooled_width_, 0);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }
 protected:
  float spatial_scale_;
  int pooled_height_;
  int pooled_width_;
};
// Gradient of RoIPoolF; same arguments as the forward op so the backward
// kernel can address the pooled grid identically. CPU execution is
// unimplemented; see the CUDA specialization.
template <typename T, class Context>
class RoIPoolFGradientOp final : public Operator<Context> {
 public:
  RoIPoolFGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        spatial_scale_(this->template GetSingleArgument<float>(
            "spatial_scale", 1.)),
        pooled_height_(this->template GetSingleArgument<int>("pooled_h", 1)),
        pooled_width_(this->template GetSingleArgument<int>("pooled_w", 1)) {
    TORCH_DCHECK_GT(spatial_scale_, 0);
    TORCH_DCHECK_GT(pooled_height_, 0);
    TORCH_DCHECK_GT(pooled_width_, 0);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }
 protected:
  float spatial_scale_;
  int pooled_height_;
  int pooled_width_;
};
} // namespace caffe2
#endif // ROI_POOL_F_OP_H_

View File

@ -1,81 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sample_as_op.h"
namespace caffe2 {
// CPU registrations attach the schemas; both CPU RunOnDevice bodies are
// CAFFE_NOT_IMPLEMENTED (see sample_as_op.h) -- CUDA-only in practice.
REGISTER_CPU_OPERATOR(SampleAs, SampleAsOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(SampleAsGradient, SampleAsGradientOp<float, CPUContext>);
// Forward schema: (X, labels) -> Y, keeping only batch rows with label > 0.
OPERATOR_SCHEMA(SampleAs)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Select the batch elements from input tensor X where the corresponding input
label value is > 0.
)DOC")
    .Input(
        0,
        "X",
        "Tensor of at least 1D shape (N, ...).")
    .Input(
        1,
        "labels",
        "Tensor of type int with 1D shape (N, ).")
    .Output(
        0,
        "Y",
        "Tensor with number of dims matching X, but with the length of dim 0 "
        "equal to the number of non-zero elements in labels. The batch items "
        "from X corresponding to the non-zero elements in labels are copied "
        "into Y.");
// Gradient schema: (X, labels, dY) -> dX; X supplies the output shape and
// labels route each dY row back to its source row.
OPERATOR_SCHEMA(SampleAsGradient)
    .NumInputs(3)
    .NumOutputs(1)
    .Input(
        0,
        "X",
        "See SampleAs.")
    .Input(
        1,
        "labels",
        "See SampleAs."
    )
    .Input(
        2,
        "dY",
        "Gradient of forward output 0 (Y).")
    .Output(
        0,
        "dX",
        "Gradient of forward input 0 (X).");
// Gradient maker for SampleAs: the backward op consumes the forward data
// input (for its shape), the labels (for row routing), and the output
// gradient; it produces the gradient w.r.t. X.
class GetSampleAsGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    vector<string> grad_inputs{I(0), I(1), GO(0)};
    vector<string> grad_outputs{GI(0)};
    return SingleGradientDef(
        "SampleAsGradient", "", grad_inputs, grad_outputs);
  }
};
REGISTER_GRADIENT(SampleAs, GetSampleAsGradient);
} // namespace caffe2

View File

@ -1,116 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* SampleAs by Kaiming He for Mask R-CNN
X.dim32(0) = L.dim32(0)
Y's output samples are the samples of X for which L > 0.
*/
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/sample_as_op.h"
#include <stdio.h>
namespace caffe2 {
// CUDA forward for SampleAs: copies the labels to the host, counts how many
// are > 0, allocates Y with that many rows, then copies the matching rows of
// X device-to-device one at a time. The FinishDeviceComputation() call is
// required: the subsequent host-side loop reads `labels`.
template <>
bool SampleAsOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Input data to be sliced
  auto& L = Input(1); // Target data that provide the identity
  CAFFE_ENFORCE(
      X.dim32(0) == L.dim32(0),
      "X.dim32(0) must be equal to L.dim32(0)",
      "(",
      X.dim32(0),
      " vs. ",
      L.dim32(0),
      ")");
  // copy L to CPU:
  std::vector<int> labels(L.dim32(0));
  context_.CopyBytes<CUDAContext, CPUContext>(
      L.dim32(0) * sizeof(int), L.data<int>(), &labels[0]);
  // Make sure that the copy is finished
  context_.FinishDeviceComputation();
  int count = 0;
  for (int i = 0; i < L.dim32(0); i++) {
    if (labels[i] > 0) {
      count++;
    }
  }
  // NOTE(review): assert() is compiled out under NDEBUG, so a release build
  // silently produces an empty Y when no label is positive.
  assert(count > 0);
  // resize Y
  vector<int64_t> out_shape(X.sizes().vec());
  out_shape[0] = count;
  auto* Y = Output(0, out_shape, at::dtype<float>()); // Sliced data (Y.dim32(0) = num of (L > 0))
  // Bytes per batch row of X.
  const int len = X.size() / X.dim32(0);
  float* output = Y->mutable_data<float>();
  for (int i = 0; i < L.dim32(0); i++) {
    if (labels[i] > 0) {
      context_.CopyBytes<CUDAContext, CUDAContext>(
          len * sizeof(float), X.data<float>() + i * len, output);
      output += len;
    } // if
  } // i
  return true;
}
// CUDA backward for SampleAs: zero-fills dX, then scatters the rows of dY
// back to the positions whose label is > 0, in the same order the forward
// pass gathered them. Labels are copied to the host first because the
// scatter loop runs on the CPU.
template <>
bool SampleAsGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X  = Input(0);
  auto& L  = Input(1);
  auto& dY = Input(2);
  auto* dX = Output(0, X.sizes(), at::dtype<float>());
  // copy L to CPU:
  std::vector<int> labels(L.dim32(0));
  context_.CopyBytes<CUDAContext, CPUContext>(
      L.dim32(0) * sizeof(int), L.data<int>(), &labels[0]);
  // Make sure that the copy is finished
  context_.FinishDeviceComputation();
  // zero-out dX: rows with label <= 0 receive no gradient.
  math::Set<float, CUDAContext>(
      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
  // Elements per batch row of X.
  const int len = X.size() / X.dim32(0);
  const float* input = dY.data<float>();
  for (int i = 0; i < L.dim32(0); i++) {
    if (labels[i] > 0) {
      context_.CopyBytes<CUDAContext, CUDAContext>(
          len * sizeof(float), input, dX->mutable_data<float>() + i * len);
      input += len;
    } // if
  } // i
  return true;
}
REGISTER_CUDA_OPERATOR(SampleAs, SampleAsOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    SampleAsGradient,
    SampleAsGradientOp<float, CUDAContext>);
} // namespace caffe2

View File

@ -1,55 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SAMPLE_AS_OP_H_
#define SAMPLE_AS_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// SampleAs: keeps the batch rows of X whose corresponding label is > 0.
// There is no CPU implementation; the CUDA specialization does the copy.
template <typename T, class Context>
class SampleAsOp final : public Operator<Context> {
 public:
  SampleAsOp(const OperatorDef& def, Workspace* workspace)
      : Operator<Context>(def, workspace) {}
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  // Deliberately unimplemented on CPU.
  bool RunOnDevice() override {
    CAFFE_NOT_IMPLEMENTED;
  }
};
// Gradient of SampleAs. Only the CUDA specialization (in the .cu file) is
// implemented; this generic template aborts at runtime.
template <typename T, class Context>
class SampleAsGradientOp final : public Operator<Context> {
 public:
  SampleAsGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws) {}
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }
};
} // namespace caffe2
#endif // SAMPLE_AS_OP_H_

View File

@ -1,107 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "select_smooth_l1_loss_op.h"
namespace caffe2 {
// CPU registrations. Note: these CPU ops only raise CAFFE_NOT_IMPLEMENTED;
// the real implementation is CUDA-only (see the .cu file).
REGISTER_CPU_OPERATOR(
    SelectSmoothL1Loss,
    SelectSmoothL1LossOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SelectSmoothL1LossGradient,
    SelectSmoothL1LossGradientOp<float, CPUContext>);

// Operator schemas: declare input/output arity, argument defaults, and the
// user-facing documentation strings shown by Caffe2 tooling.
OPERATOR_SCHEMA(SelectSmoothL1Loss)
    .NumInputs(4)
    .NumOutputs(1)
    .SetDoc(R"DOC(
RetinaNet specific op for computing Smooth L1 Loss at select locations in a 4D
tensor that encodes bounding box regression predictions.
)DOC")
    .Arg(
        "beta",
        "(float) default 1.0; L2 to L1 transition point.")
    .Arg(
        "scale",
        "(float) default 1.0; multiply the loss by this scale factor.")
    .Input(
        0,
        "Y_hat",
        "4D tensor of bounding box regression predictions with shape "
        "(N, 4 * num_bbox_classes * num_anchors, H, W).")
    .Input(
        1,
        "Y",
        "2D tensor of labels shape (M, 4) for 4 contiguous channels starting "
        "at each of the M locations selected by the locations input.")
    .Input(
        2,
        "locations",
        "2D tensor of shape (M, 4) that identifies M 'select' locations "
        "encoded by the four columns: (n, c, y, x). The loss is computed on the "
        "four contiguous channel locations [c, c + 3] (inclusive).")
    .Input(
        3,
        "normalizer",
        "Scalar; the loss is divided by max(1, normalizer).")
    .Output(
        0,
        "loss",
        "Scalar loss.");

OPERATOR_SCHEMA(SelectSmoothL1LossGradient)
    .NumInputs(5)
    .NumOutputs(1)
    .Input(
        0,
        "Y_hat",
        "See SelectSmoothL1Loss.")
    .Input(
        1,
        "Y",
        "See SelectSmoothL1Loss.")
    .Input(
        2,
        "locations",
        "See SelectSmoothL1Loss.")
    .Input(
        3,
        "normalizer",
        "See SelectSmoothL1Loss.")
    .Input(
        4,
        "d_loss",
        "Gradient of forward output 0 (loss).")
    .Output(
        0,
        "d_Y_hat",
        "Gradient of forward input 0 (Y_hat).");
// Autograd hookup: the gradient op consumes all four forward inputs plus the
// upstream d(loss) and produces d(Y_hat).
class GetSelectSmoothL1LossGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "SelectSmoothL1LossGradient",
        "",
        vector<string>{I(0), I(1), I(2), I(3), GO(0)},
        vector<string>{GI(0)});
  }
};

REGISTER_GRADIENT(SelectSmoothL1Loss, GetSelectSmoothL1LossGradient);
} // namespace caffe2

View File

@ -1,189 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/select_smooth_l1_loss_op.h"
namespace caffe2 {
namespace {
// Elementwise smooth-L1 over the 4 regression channels of each of the M
// selected (n, c, y, x) locations; each value is divided by the normalizer
// max(S[0], 1) before being written to `out` at its NCHW offset.
__global__ void SelectSmoothL1Kernel(
    const int D, const int H, const int W,
    const int M, const float* Y_hat, const float* Y, const float* L, float* out,
    const float* S, const float beta) {
  // f(x) = 0.5 * x^2 / beta      if |x| < beta
  //        |x| - 0.5 * beta      otherwise
  CUDA_1D_KERNEL_LOOP(i, M) {
    // L stores (n, c, y, x) rows as floats; truncation recovers the indices
    int n = L[i * 4];
    int c = L[i * 4 + 1];
    int y = L[i * 4 + 2];
    int x = L[i * 4 + 3];

    for (int j = 0; j < 4; j++){
      // Y_hat: N x (A * CLS * 4) x H x W
      int ind = n * (D * H * W) + (c + j) * (H * W) + y * W + x;
      float y_hat = Y_hat[ind];
      // NOTE(review): this float `y` shadows the spatial index `y` above;
      // harmless here because `ind` is already computed.
      float y = Y[i * 4 + j];
      float val = y_hat - y;
      float abs_val = c10::cuda::compat::abs(val);
      if (abs_val < beta) {
        out[ind] = (0.5 * val * val / beta) / c10::cuda::compat::max(S[0], static_cast<float>(1.0));
      } else {
        out[ind] = (abs_val - 0.5 * beta) / c10::cuda::compat::max(S[0], static_cast<float>(1.0));
      }
    }
  }
}
// Gradient of the select smooth-L1 loss: writes norm * d_loss * f'(y_hat - y)
// at the same NCHW offsets, divided by the normalizer max(S[0], 1).
__global__ void SelectSmoothL1GradientKernel(
    const int D, const int H, const int W,
    const int M,
    const float* Y_hat,
    const float* Y,
    const float* L,
    float* out,
    const float* d_loss_data,
    float norm,
    const float* S,
    float beta) {
  // f'(x) = x / beta     if |x| < beta
  //       = sign(x)      otherwise
  // We also scale by norm * d_loss in this kernel for convenience
  CUDA_1D_KERNEL_LOOP(i, M) {
    int n = L[i * 4];
    int c = L[i * 4 + 1];
    int y = L[i * 4 + 2];
    int x = L[i * 4 + 3];
    float d_loss = *d_loss_data;  // upstream scalar gradient

    for (int j = 0; j < 4; j++) {
      int ind = n * (D * H * W) + (c + j) * (H * W) + y * W + x;
      float y_hat = Y_hat[ind];
      // NOTE(review): shadows the spatial index `y`; `ind` already computed.
      float y = Y[i * 4 + j];
      float val = y_hat - y;
      float abs_val = c10::cuda::compat::abs(val);
      if (abs_val < beta) {
        out[ind] = norm * d_loss * val / beta / c10::cuda::compat::max(S[0], static_cast<float>(1.0));
      } else {
        // ((0 < val) - (val < 0)) evaluates to sign(val) in {-1, 0, +1}
        out[ind] = norm * d_loss * ((float(0) < val) - (val < float(0))) / c10::cuda::compat::max(S[0], static_cast<float>(1.0));
      }
    }
  }
}
} // namespace
// CUDA forward pass: scalar SelectSmoothL1 loss over the M selected
// (n, c, y, x) locations; the loss is defined to be zero when there are no
// foreground boxes.
// Fix: removed the unused local `int N = Y_hat.dim32(0)` — the kernel only
// needs the channel/spatial dims (D, H, W) and the row count M.
template<>
bool SelectSmoothL1LossOp<float, CUDAContext>::RunOnDevice() {
  // bbox targets predictions, for example: N x (A * 4) H x W in cls-agnostic case
  auto& Y_hat = Input(0);
  // true targets: for example: M x 4 where M is the #fg boxes per fpn level
  auto& Y = Input(1);
  // locations of fg boxes: M x 4
  auto& L = Input(2);
  // total number of fg boxes across all FPN levels: scalar
  auto& S = Input(3);

  auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
  if (Y.size() == 0){
    // No selected locations at all: emit zero loss.
    math::Set<float, CUDAContext>(
        1, static_cast<float>(0), avg_loss->mutable_data<float>(), &context_);
    return true;
  }

  int D = Y_hat.dim32(1);
  int H = Y_hat.dim32(2);
  int W = Y_hat.dim32(3);
  int M = Y.dim32(0);

  // initialization: zero the output and the per-element buffer
  buff_.ResizeLike(Y_hat);
  math::Set<float, CUDAContext>(
      1, static_cast<float>(0), avg_loss->mutable_data<float>(), &context_);
  math::Set<float, CUDAContext>(
      buff_.size(), 0.0, buff_.mutable_data<float>(), &context_);

  // Element-wise smooth l1 loss
  // l := SelectSmoothL1((y_hat - y))
  SelectSmoothL1Kernel<<<CAFFE_GET_BLOCKS(buff_.size()),
                         CAFFE_CUDA_NUM_THREADS,
                         0, context_.cuda_stream()>>>(
      D, H, W,
      M, Y_hat.data<float>(), Y.data<float>(),
      L.data<float>(), buff_.mutable_data<float>(),
      S.data<float>(), beta_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Sum of all losses
  // al := sum_i l_i
  float* avg_loss_data = avg_loss->mutable_data<float>();
  math::Sum<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), avg_loss_data, &context_);
  // Average of input batch size
  math::Scale<float, float, CUDAContext>(
      1, scale_, avg_loss_data, avg_loss_data, &context_);
  return true;
}
// CUDA backward pass: writes d(loss)/d(Y_hat); positions that were not
// selected keep a zero gradient because d_Y_hat is zero-filled first.
// Fix: removed the unused local `int N = Y_hat.dim32(0)` — the kernel only
// needs the channel/spatial dims (D, H, W) and the row count M.
template<>
bool SelectSmoothL1LossGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& Y_hat = Input(0);
  auto& Y = Input(1);
  auto& L = Input(2);
  auto& S = Input(3);
  // Below is gradient of net w.r.t. avg_loss ("gradOutput"), should be all 1's
  auto& d_avg_loss = Input(4);

  auto* d_Y_hat = Output(0, Y_hat.sizes(), at::dtype<float>()); // gradient of net w.r.t. Y_hat ("gradInput")
  math::Set<float, CUDAContext>(
      d_Y_hat->size(), 0.0, d_Y_hat->mutable_data<float>(), &context_);
  if (Y.size() == 0){
    // No selected locations: the zero-filled gradient is already correct.
    return true;
  }

  int D = Y_hat.dim32(1);
  int H = Y_hat.dim32(2);
  int W = Y_hat.dim32(3);
  int M = Y.dim32(0);

  // Element-wise weighted difference (can be used to ignore or reweight
  // specific components)
  // d := (y_hat - y)
  // d_Y_hat := d_avg_loss * SelectSmoothL1'((y_hat - y))
  SelectSmoothL1GradientKernel<<<CAFFE_GET_BLOCKS(d_Y_hat->size()),
                                 CAFFE_CUDA_NUM_THREADS,
                                 0, context_.cuda_stream()>>>(
      D, H, W, M, Y_hat.data<float>(), Y.data<float>(),
      L.data<float>(), d_Y_hat->mutable_data<float>(),
      d_avg_loss.data<float>(), scale_, S.data<float>(), beta_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
// CUDA registrations; these are the only real implementations (the CPU
// templates raise CAFFE_NOT_IMPLEMENTED).
REGISTER_CUDA_OPERATOR(SelectSmoothL1Loss,
                       SelectSmoothL1LossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SelectSmoothL1LossGradient,
                       SelectSmoothL1LossGradientOp<float, CUDAContext>);

} // namespace caffe2

View File

@ -1,77 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SELECT_SMOOTH_L1_LOSS_OP_H_
#define SELECT_SMOOTH_L1_LOSS_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Smooth-L1 loss at selected locations (RetinaNet bbox regression).
// Only the CUDA specialization is implemented; this generic template aborts.
// Args: beta (> 0, L2->L1 transition point), scale (>= 0, loss multiplier).
template <typename T, class Context>
class SelectSmoothL1LossOp final : public Operator<Context> {
 public:
  SelectSmoothL1LossOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        beta_(this->template GetSingleArgument<float>("beta", 1.)),
        scale_(this->template GetSingleArgument<float>("scale", 1.)) {
    CAFFE_ENFORCE(beta_ > 0);
    CAFFE_ENFORCE(scale_ >= 0);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float beta_; // Transition point from L1 to L2 loss
  float scale_; // Scale the loss by scale_
  int dim_; // dimension for 1 anchor prediction; NOTE(review): never set or read in visible code
  Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
};
// Gradient of SelectSmoothL1Loss; mirrors the forward op's arguments.
// Only the CUDA specialization is implemented; this generic template aborts.
template <typename T, class Context>
class SelectSmoothL1LossGradientOp final : public Operator<Context> {
 public:
  SelectSmoothL1LossGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        beta_(this->template GetSingleArgument<float>("beta", 1.)),
        scale_(this->template GetSingleArgument<float>("scale", 1.)) {
    CAFFE_ENFORCE(beta_ > 0);
    CAFFE_ENFORCE(scale_ >= 0);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float beta_; // Transition point from L1 to L2 loss
  float scale_; // Scale the loss by scale_
  int dim_; // dimension for 1 anchor prediction; NOTE(review): never set or read in visible code
  Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
};
} // namespace caffe2
#endif // SELECT_SMOOTH_L1_LOSS_OP_H_

View File

@ -1,96 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sigmoid_cross_entropy_loss_op.h"
namespace caffe2 {
// CPU registrations. Note: these CPU ops only raise CAFFE_NOT_IMPLEMENTED;
// the real implementation is CUDA-only (see the .cu file).
REGISTER_CPU_OPERATOR(
    SigmoidCrossEntropyLoss,
    SigmoidCrossEntropyLossOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SigmoidCrossEntropyLossGradient,
    SigmoidCrossEntropyLossGradientOp<float, CPUContext>);

// Operator schemas: arity, argument defaults, and user-facing documentation.
OPERATOR_SCHEMA(SigmoidCrossEntropyLoss)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Compute sigmoid activations followed by averaged binary cross entropy loss. The
target values may be in {-1, 0, 1}, where -1 indicates that the corresponding
sample should be ignored and {0, 1} correspond to the binary classes 0 and 1. By
default the loss is divided by the number of targets > -1 and then multiplied by
the `scale` op argument. The divisive normalization may be disable by setting
the op argument `normalize` to 0 (the multiplication by `scale` still takes
effect).
This op fuses sigmoid and cross entropy for numerical stability in both forward
and gradient computation.
)DOC")
    .Arg(
        "scale",
        "(float) default 1.0; multiply the loss by this scale factor.")
    .Arg(
        "normalize",
        "(int) default 1; if true, divide the loss by the number of targets > "
        "-1.")
    .Input(
        0,
        "X",
        "Tensor of predicted logits (shape must be at least 1D).")
    .Input(
        1,
        "targets",
        "Tensor of targets of type int and same shape as logits X.")
    .Output(
        0,
        "loss",
        "Scalar loss.");

OPERATOR_SCHEMA(SigmoidCrossEntropyLossGradient)
    .NumInputs(3)
    .NumOutputs(1)
    .Input(
        0,
        "X",
        "See SigmoidCrossEntropyLoss.")
    .Input(
        1,
        "targets",
        "See SigmoidCrossEntropyLoss.")
    .Input(
        2,
        "d_loss",
        "Gradient of forward output 0 (loss).")
    .Output(
        0,
        "dX",
        "Gradient of forward input 0 (X).");
// Autograd hookup: the gradient op consumes X, targets, and the upstream
// d(loss), producing dX.
class GetSigmoidCrossEntropyLossGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "SigmoidCrossEntropyLossGradient",
        "",
        vector<string>{I(0), I(1), GO(0)},
        vector<string>{GI(0)});
  }
};

REGISTER_GRADIENT(SigmoidCrossEntropyLoss, GetSigmoidCrossEntropyLossGradient);
} // namespace caffe2

View File

@ -1,190 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/sigmoid_cross_entropy_loss_op.h"
namespace caffe2 {
namespace {
// In-place elementwise lower clamp: data[i] = max(data[i], a).
// (The ternary keeps the original's NaN handling: a NaN element is replaced
// by `a`, since NaN > a is false.)
__global__ void ElementwiseMaxKernel(const int n, float* data, const float a) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    const float v = data[i];
    data[i] = (v > a) ? v : a;
  }
}
// Per-element numerically-stable sigmoid cross entropy.
// targets[index] == -1 means "ignore": zero loss and zero count; otherwise
// the count is 1 so counts can later be summed into a normalizer.
__global__ void SigmoidCrossEntropyLossKernel(
    const int n,
    const float* logits,
    const int* targets,
    float* losses,
    float* counts) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    if (targets[index] == -1) {
      losses[index] = 0.;
      counts[index] = 0.;
    } else {
      // Stable rearrangement of -t*x + log(1 + e^x): the expf argument is
      // always <= 0, so it cannot overflow for large |x|.
      losses[index] =
          -1. * logits[index] * (targets[index] - (logits[index] >= 0)) +
          logf(
              1 +
              expf(logits[index] - 2 * logits[index] * (logits[index] >= 0)));
      counts[index] = 1.;
    }
  }
}
// Per-element gradient of the fused loss: d/dx = sigmoid(x) - target
// (zero for ignored targets == -1; counts mirror the forward kernel).
__global__ void SigmoidCrossEntropyLossGradientKernel(
    const int n,
    const float* logits,
    const int* targets,
    float* d_logits,
    float* counts) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    if (targets[index] == -1) {
      d_logits[index] = 0.;
      counts[index] = 0.;
    } else {
      d_logits[index] = 1. / (1. + expf(-logits[index])) - targets[index];
      counts[index] = 1.;
    }
  }
}
} // namespace
// CUDA forward pass: sums the per-element losses, optionally divides by the
// number of non-ignored targets (clamped to >= 1e-5), then scales by scale_.
template <>
bool SigmoidCrossEntropyLossOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& T = Input(1);

  CAFFE_ENFORCE(
      X.size() == T.size(),
      "Logit and target must have the same size",
      "(",
      X.size(),
      " vs. ",
      T.size(),
      ")");
  auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
  counts_.ResizeLike(X);
  losses_.ResizeLike(X);
  ReinitializeTensor(&normalizer_, vector<int64_t>(), at::dtype<float>().device(CUDA));
  SigmoidCrossEntropyLossKernel<<<
      CAFFE_GET_BLOCKS(X.size()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      X.size(),
      X.data<float>(),
      T.data<int>(),
      losses_.mutable_data<float>(),
      counts_.mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  float* avg_loss_data = avg_loss->mutable_data<float>();
  math::Sum<float, CUDAContext>(
      losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
  if (normalize_) {
    float* normalizer_data = normalizer_.mutable_data<float>();
    math::Sum<float, CUDAContext>(
        counts_.size(), counts_.data<float>(), normalizer_data, &context_);
    // Prevent division by zero if all counts are zero
    ElementwiseMaxKernel<<<
        CAFFE_GET_BLOCKS(normalizer_.size()),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5);
    C10_CUDA_KERNEL_LAUNCH_CHECK();

    math::Div<float, CUDAContext>(
        1, avg_loss_data, normalizer_data, avg_loss_data, &context_);
  }
  math::Scale<float, float, CUDAContext>(
      1, scale_, avg_loss_data, avg_loss_data, &context_);

  return true;
}
// CUDA backward pass: dX = d_avg_loss * scale_ * (sigmoid(X) - T), with the
// same optional count-based normalization as the forward pass.
template <>
bool SigmoidCrossEntropyLossGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& T = Input(1);
  auto& d_avg_loss = Input(2);

  auto* dX = Output(0, X.sizes(), at::dtype<float>());
  counts_.ResizeLike(X);
  ReinitializeTensor(&normalizer_, vector<int64_t>(), at::dtype<float>().device(CUDA));
  SigmoidCrossEntropyLossGradientKernel<<<
      CAFFE_GET_BLOCKS(X.size()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      X.size(),
      X.data<float>(),
      T.data<int>(),
      dX->mutable_data<float>(),
      counts_.mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  if (normalize_) {
    float* normalizer_data = normalizer_.mutable_data<float>();
    math::Sum<float, CUDAContext>(
        counts_.size(), counts_.data<float>(), normalizer_data, &context_);
    // Prevent division by zero if all counts are zero
    ElementwiseMaxKernel<<<
        CAFFE_GET_BLOCKS(normalizer_.size()),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5);
    C10_CUDA_KERNEL_LAUNCH_CHECK();

    // Fold d_avg_loss / normalizer * scale_ into a single scalar, then scale dX
    math::Div<float, CUDAContext>(
        1,
        d_avg_loss.data<float>(),
        normalizer_data,
        normalizer_data,
        &context_);
    math::Scale<float, float, CUDAContext>(
        1, scale_, normalizer_data, normalizer_data, &context_);
    math::Scale<float, float, CUDAContext>(
        dX->size(),
        normalizer_data,
        dX->data<float>(),
        dX->mutable_data<float>(),
        &context_);
  } else {
    // Unnormalized: scale elementwise by scale_ and then by the upstream scalar
    math::Scale<float, float, CUDAContext>(
        dX->size(),
        scale_,
        dX->data<float>(),
        dX->mutable_data<float>(),
        &context_);
    math::Scale<float, float, CUDAContext>(
        dX->size(),
        d_avg_loss.data<float>(),
        dX->data<float>(),
        dX->mutable_data<float>(),
        &context_);
  }
  return true;
}
// CUDA registrations; the generic templates in the header are stubs only.
REGISTER_CUDA_OPERATOR(
    SigmoidCrossEntropyLoss,
    SigmoidCrossEntropyLossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    SigmoidCrossEntropyLossGradient,
    SigmoidCrossEntropyLossGradientOp<float, CUDAContext>);

} // namespace caffe2

View File

@ -1,78 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SIGMOID_CROSS_ENTROPY_LOSS_OP_H_
#define SIGMOID_CROSS_ENTROPY_LOSS_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Fused sigmoid + binary cross entropy loss. Only the CUDA specialization is
// implemented; this generic template aborts.
// Args: scale (>= 0, loss multiplier), normalize (0 or 1; divide by the
// number of non-ignored targets when 1).
template <typename T, class Context>
class SigmoidCrossEntropyLossOp final : public Operator<Context> {
 public:
  SigmoidCrossEntropyLossOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        scale_(this->template GetSingleArgument<float>("scale", 1.)),
        normalize_(this->template GetSingleArgument<int>("normalize", 1)) {
    CAFFE_ENFORCE(scale_ >= 0);
    CAFFE_ENFORCE(normalize_ == 0 || normalize_ == 1);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float scale_;      // multiplies the final loss
  int normalize_;    // 1: divide by #(targets > -1)
  Tensor losses_{Context::GetDeviceType()};  // per-element loss buffer
  Tensor counts_{Context::GetDeviceType()};  // 1 per non-ignored element
  Tensor normalizer_;  // scalar scratch for the normalizer
};
// Gradient of SigmoidCrossEntropyLoss; mirrors the forward op's arguments.
// Only the CUDA specialization is implemented; this generic template aborts.
template <typename T, class Context>
class SigmoidCrossEntropyLossGradientOp final : public Operator<Context> {
 public:
  SigmoidCrossEntropyLossGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        scale_(this->template GetSingleArgument<float>("scale", 1.)),
        normalize_(this->template GetSingleArgument<int>("normalize", 1)) {
    CAFFE_ENFORCE(scale_ >= 0);
    CAFFE_ENFORCE(normalize_ == 0 || normalize_ == 1);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float scale_;      // multiplies the final gradient
  int normalize_;    // 1: divide by #(targets > -1)
  Tensor counts_{Context::GetDeviceType()};  // 1 per non-ignored element
  Tensor normalizer_;  // scalar scratch for the normalizer
};
} // namespace caffe2
#endif // SIGMOID_CROSS_ENTROPY_LOSS_OP_H_

View File

@ -1,119 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sigmoid_focal_loss_op.h"
namespace caffe2 {
// CPU registrations. Note: these CPU ops only raise CAFFE_NOT_IMPLEMENTED;
// the real implementation is CUDA-only (see the .cu file).
REGISTER_CPU_OPERATOR(SigmoidFocalLoss, SigmoidFocalLossOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SigmoidFocalLossGradient,
    SigmoidFocalLossGradientOp<float, CPUContext>);

// Operator schemas: arity, argument defaults, and user-facing documentation.
OPERATOR_SCHEMA(SigmoidFocalLoss)
    .NumInputs(3)
    .NumOutputs(1)
    .SetDoc(R"DOC(
The binary form of Focal Loss designed for use in RetinaNet-like models.
The input is assumed to be unnormalized scores (sometimes called 'logits')
arranged in a 4D tensor with shape (N, C, H, W), where N is the number of
elements in the batch, H and W are the height and width, and C = num_anchors *
num_classes defines num_anchors 'groups' of logits, each of length
num_classes. For the binary form of Focal Loss, num_classes does not include
the background category. (So, for COCO, num_classes = 80, not 81.)
The binary form of focal loss is:
FL(p_t) = -alpha * (1 - p_t)**gamma * log(p_t),
where p = sigmoid(x), p_t = p or 1 - p depending on if the label is 1 or 0,
respectively.
See: https://arxiv.org/abs/1708.02002 for details.
)DOC")
    .Arg(
        "scale",
        "(float) default 1.0; multiply the loss by this scale factor.")
    .Arg(
        "alpha",
        "(float) default 0.25; Focal Loss's alpha hyper-parameter.")
    .Arg(
        "gamma",
        "(float) default 1.0; Focal Loss's gamma hyper-parameter.")
    .Arg(
        "num_classes",
        "(int) default 80; number of classes (excluding background).")
    .Input(
        0,
        "logits",
        "4D tensor of sigmoid inputs (called 'scores' or 'logits') with shape "
        "(N, C, H, W), where C = num_anchors * num_classes.")
    .Input(
        1,
        "labels",
        "4D tensor of labels with shape (N, num_anchors, H, W). Each entry is "
        "a class label in [0, num_classes - 1] (inclusive). The label "
        "identifies the one class that should have a sigmoid target of 1.")
    .Input(
        2,
        "normalizer",
        "Scalar; the loss is normalized by 1 / max(1, normalizer)."
    )
    .Output(
        0,
        "loss",
        "Scalar loss.");

OPERATOR_SCHEMA(SigmoidFocalLossGradient)
    .NumInputs(4)
    .NumOutputs(1)
    .Input(
        0,
        "logits",
        "See SigmoidFocalLoss.")
    .Input(
        1,
        "labels",
        "See SigmoidFocalLoss.")
    .Input(
        2,
        "normalizer",
        "See SigmoidFocalLoss.")
    .Input(
        3,
        "d_loss",
        "Gradient of forward output 0 (loss)")
    .Output(
        0,
        "d_logits",
        "Gradient of forward input 0 (logits)");
// Autograd hookup: the gradient op consumes the three forward inputs plus the
// upstream d(loss), producing d(logits).
class GetSigmoidFocalLossGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    vector<string> blob_names{
        {I(0), I(1), I(2), GO(0)},
    };
    return SingleGradientDef(
        "SigmoidFocalLossGradient", "", blob_names, vector<string>{GI(0)});
  }
};

REGISTER_GRADIENT(SigmoidFocalLoss, GetSigmoidFocalLossGradient);
} // namespace caffe2

View File

@ -1,185 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/sigmoid_focal_loss_op.h"
namespace caffe2 {
namespace {
// Forward binary focal loss per logit element (see arXiv:1708.02002).
// Each of the N*D*H*W logits belongs to (anchor a, class d); `targets` holds
// the 1-based true class per anchor. Losses are pre-divided by
// Np = max(weight_pos[0], 1).
__global__ void SigmoidFocalLossKernel(
    const int N, const int D, const int H, const int W, const float* logits,
    const int* targets, const float* weight_pos,
    const float gamma, const float alpha,
    const int num_classes, float* losses) {
  CUDA_1D_KERNEL_LOOP(i, N * D * H * W) {
    int x = i % W;
    int y = (i / W) % H;
    int c = (i / (W * H)) % D; // channel, here D is channel dim in input NxDxHxW
    int n = i / (W * H * D); // n in NxDxHxW

    int A = D / num_classes; // num_anchors = A
    int a = c / num_classes; // current anchor out of A anchors in D = A * num_cls
    int d = c % num_classes; // current class
    int t = targets[n * (H * W * A) + a * (H * W) + y * W + x]; // target

    // check whether the class is true class or not.
    // The target classes are in range 1 - 81 and the d is in range 0-80
    // because we predict A*80 dim, so for comparison purpose, compare t and (d+1)
    float c1 = (t == (d + 1));
    // bitwise & on the two 0/1 comparison results == logical AND (no branch)
    float c2 = (t != -1 & t != (d + 1));
    float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
    float zn = (1.0 - alpha) / Np;  // weight for negative (wrong-class) terms
    float zp = alpha / Np;          // weight for positive (true-class) terms

    // p = 1. / 1. + expf(-x)
    float p = 1. / (1. + expf(-logits[i]));

    // (1 - p)**gamma * log(p), with p clamped away from 0 before the log
    float term1 = powf((1. - p), gamma) * logf(c10::cuda::compat::max(p, FLT_MIN));

    // p**gamma * log(1 - p), in the overflow-safe rearranged form
    float term2 =
        powf(p, gamma) *
        (-1. * logits[i] * (logits[i] >= 0) -
         logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0))));
    losses[i] = 0.0;
    losses[i] += -c1 * term1 * zp;
    losses[i] += -c2 * term2 * zn;
  }
}
// Gradient of the binary focal loss w.r.t. each logit, pre-multiplied by the
// upstream scalar gradient avg_loss[0] and divided by Np = max(weight_pos[0], 1).
__global__ void SigmoidFocalLossGradientKernel(
    const int N, const int D, const int H, const int W, const float* logits,
    const int* targets, float* dX_data, const float* weight_pos,
    const float gamma, const float alpha, const int num_classes,
    const float* avg_loss) {
  CUDA_1D_KERNEL_LOOP(i, N * D * H * W) {
    float a_loss = avg_loss[0];  // upstream scalar gradient
    int x = i % W;
    int y = (i / W) % H;
    int c = (i / (W * H)) % D;
    int n = i / (W * H * D);

    int A = D / num_classes; // num_anchors
    int a = c / num_classes; // current anchor
    int d = c % num_classes; // current class

    float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
    float zn = (1.0 - alpha) / Np;
    float zp = alpha / Np;
    int t = targets[n * (H * W * A) + a * (H * W) + y * W + x];

    float c1 = (t == (d + 1));
    // bitwise & on the two 0/1 comparison results == logical AND (no branch)
    float c2 = (t != -1 & t != (d + 1));
    float p = 1. / (1. + expf(-logits[i]));

    // (1-p)**g * (1 - p - g*p*log(p))
    float term1 =
        powf((1. - p), gamma) *
        (1. - p - (p * gamma * logf(c10::cuda::compat::max(p, FLT_MIN))));

    // (p**g) * (g*(1-p)*log(1-p) - p), log(1-p) in the overflow-safe form
    float term2 =
        powf(p, gamma) *
        ((-1. * logits[i] * (logits[i] >= 0) -
          logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) *
         (1. - p) * gamma - p);
    dX_data[i] = 0.0;
    dX_data[i] += -c1 * zp * term1;
    dX_data[i] += -c2 * zn * term2;
    dX_data[i] = dX_data[i] * a_loss;
  }
}
} // namespace
// CUDA forward pass: per-element focal losses are summed and scaled by scale_.
template<>
bool SigmoidFocalLossOp<float, CUDAContext>::RunOnDevice() {
  // Input logits, for example: N x (A * 80) x H x W in cls-agnostic
  auto& X = Input(0);
  // Target, for example: N x A x H x W
  auto& T = Input(1);
  // Number of positive examples: scalar
  auto& wp = Input(2);
  // output avg Sigmoid focal loss as mentioned in RetinaNet paper
  int N = X.dim32(0);
  int D = X.dim32(1);
  int H = X.dim32(2);
  int W = X.dim32(3);

  auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
  losses_.ResizeLike(X);
  float* avg_loss_data = avg_loss->mutable_data<float>();

  SigmoidFocalLossKernel<<<CAFFE_GET_BLOCKS(X.size()),
      CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
    N, D, H, W, X.data<float>(), T.data<int>(),
    wp.data<float>(), gamma_, alpha_, num_classes_,
    losses_.mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  math::Sum<float, CUDAContext>(
      losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
  math::Scale<float, float, CUDAContext>(
      1, scale_, avg_loss_data, avg_loss_data, &context_);

  return true;
}
// CUDA backward pass: per-logit focal-loss gradient, scaled by scale_.
template<>
bool SigmoidFocalLossGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& T = Input(1);
  auto& wp = Input(2);
  // upstream gradient is the last input (scalar d_loss)
  auto& d_avg_loss = Input(InputSize() - 1);

  // get input shape
  int N = X.dim32(0);
  int D = X.dim32(1);
  int H = X.dim32(2);
  int W = X.dim32(3);

  auto* dX = Output(0, X.sizes(), at::dtype<float>());

  SigmoidFocalLossGradientKernel<<<CAFFE_GET_BLOCKS(X.size()),
      CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
    N, D, H, W, X.data<float>(), T.data<int>(), dX->mutable_data<float>(),
    wp.data<float>(), gamma_, alpha_, num_classes_,
    d_avg_loss.data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  math::Scale<float, float, CUDAContext>(
      dX->size(),
      scale_,
      dX->data<float>(),
      dX->mutable_data<float>(),
      &context_);
  return true;
}
// CUDA registrations; the generic templates in the header are stubs only.
REGISTER_CUDA_OPERATOR(SigmoidFocalLoss,
                       SigmoidFocalLossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SigmoidFocalLossGradient,
                       SigmoidFocalLossGradientOp<float, CUDAContext>);

} // namespace caffe2

View File

@ -1,83 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SIGMOID_FOCAL_LOSS_OP_H_
#define SIGMOID_FOCAL_LOSS_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Binary (sigmoid) focal loss operator (RetinaNet). Only the CUDA
// specialization is implemented; this generic template aborts.
// Args: scale (>= 0), num_classes (default 80, excluding background),
// gamma and alpha (focal loss hyper-parameters).
template <typename T, class Context>
class SigmoidFocalLossOp final : public Operator<Context> {
 public:
  SigmoidFocalLossOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        scale_(this->template GetSingleArgument<float>("scale", 1.)),
        num_classes_(this->template GetSingleArgument<int>("num_classes", 80)),
        gamma_(this->template GetSingleArgument<float>("gamma", 1.)),
        alpha_(this->template GetSingleArgument<float>("alpha", 0.25)) {
    CAFFE_ENFORCE(scale_ >= 0);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float scale_;      // multiplies the final loss
  int num_classes_;  // classes excluding background
  float gamma_;      // focusing parameter
  float alpha_;      // class-balance parameter
  Tensor losses_{Context::GetDeviceType()};  // per-element loss buffer
  Tensor counts_{Context::GetDeviceType()};  // NOTE(review): not read in visible code
};
// Gradient of SigmoidFocalLoss; mirrors the forward op's arguments.
// Only the CUDA specialization is implemented; this generic template aborts.
template <typename T, class Context>
class SigmoidFocalLossGradientOp final : public Operator<Context> {
 public:
  SigmoidFocalLossGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        scale_(this->template GetSingleArgument<float>("scale", 1.)),
        num_classes_(this->template GetSingleArgument<int>("num_classes", 80)),
        gamma_(this->template GetSingleArgument<float>("gamma", 1.)),
        alpha_(this->template GetSingleArgument<float>("alpha", 0.25)) {
    CAFFE_ENFORCE(scale_ >= 0);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float scale_;      // multiplies the final gradient
  int num_classes_;  // classes excluding background
  float gamma_;      // focusing parameter
  float alpha_;      // class-balance parameter
  Tensor counts_{Context::GetDeviceType()};   // NOTE(review): not read in visible code
  Tensor weights_{Context::GetDeviceType()};  // unignored weights
};
} // namespace caffe2
#endif // SIGMOID_FOCAL_LOSS_OP_H_

View File

@ -1,117 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "smooth_l1_loss_op.h"
namespace caffe2 {

// CPU registrations. The generic RunOnDevice in smooth_l1_loss_op.h is
// CAFFE_NOT_IMPLEMENTED, so these registrations primarily expose the schemas.
REGISTER_CPU_OPERATOR(SmoothL1Loss, SmoothL1LossOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SmoothL1LossGradient,
    SmoothL1LossGradientOp<float, CPUContext>);

// Schema: 4 inputs (predictions, labels, inside weights, outside weights),
// 1 scalar loss output.
OPERATOR_SCHEMA(SmoothL1Loss)
    .NumInputs(4)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Smooth L1 Loss is a minor variation of Huber loss in which the point of
transition between L2 loss and L1 loss is adjustable by a hyper-parameter beta:
SmoothL1(x) = 0.5 * x^2 / beta if |x| < beta
|x| - 0.5 * beta otherwise.
SmoothL1 is used in Fast R-CNN and descendants as the loss function for bounding
box regression.
The loss computed by this op has a flexible form:
scale / N * sum_i alpha_out[i] * SmoothL1(alpha_in[i] * (y_hat[i] - y[i])).
The weights alpha_in and alpha_out are called the "inside" and "outside"
weights, respectively. The inside weights are typically set to either 0 or 1 to
implement ignoring (when 0) certain samples. The outside weights can be used
to implement a per-sample loss weight. The overall loss is scaled by scale / N,
where N is the number of batch elements in the input predictions.
)DOC")
    .Arg(
        "beta",
        "(float) default 1.0; L2 to L1 transition point.")
    .Arg(
        "scale",
        "(float) default 1.0; multiply the loss by this scale factor.")
    .Input(
        0,
        "Y_hat",
        "Tensor of predictions (at least 1D).")
    .Input(
        1,
        "Y",
        "Tensor of labels with the same shape as Y_hat.")
    .Input(
        2,
        "alpha_in",
        "Tensor of inside weights with the same shape as Y.")
    .Input(
        3,
        "alpha_out",
        "Tensor of outside weights with the same shape as Y.")
    .Output(
        0,
        "loss",
        "Scalar loss.");

// Gradient schema: the forward inputs plus d_loss; produces only d_Y_hat.
OPERATOR_SCHEMA(SmoothL1LossGradient)
    .NumInputs(5)
    .NumOutputs(1)
    .Input(
        0,
        "Y_hat",
        "See SmoothL1Loss.")
    .Input(
        1,
        "Y",
        "See SmoothL1Loss.")
    .Input(
        2,
        "alpha_in",
        "See SmoothL1Loss.")
    .Input(
        3,
        "alpha_out",
        "See SmoothL1Loss.")
    .Input(
        4,
        "d_loss",
        "Gradient of forward output 0 (loss).")
    .Output(
        0,
        "d_Y_hat",
        "Gradient of forward input 0 (Y_hat).");

// Wires SmoothL1Loss to SmoothL1LossGradient: forwards all four inputs plus
// the incoming loss gradient; only input 0 (Y_hat) receives a gradient.
class GetSmoothL1LossGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "SmoothL1LossGradient",
        "",
        vector<string>{I(0), I(1), I(2), I(3), GO(0)},
        vector<string>{GI(0)});
  }
};

REGISTER_GRADIENT(SmoothL1Loss, GetSmoothL1LossGradient);

} // namespace caffe2

View File

@ -1,185 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/smooth_l1_loss_op.h"
namespace caffe2 {
namespace {

// Element-wise smooth-L1 (Huber-style) transform of `in` into `out`.
// beta is the L2/L1 transition point.
template <typename T>
__global__ void SmoothL1Kernel(
    const int n, const T* in, T* out, T beta) {
  // f(x) = 0.5 * x^2 / beta if |x| < beta
  //        |x| - 0.5 * beta otherwise
  CUDA_1D_KERNEL_LOOP(index, n) {
    T val = in[index];
    T abs_val = c10::cuda::compat::abs(val);
    if (abs_val < beta) {
      out[index] = 0.5 * val * val / beta;
    } else {
      out[index] = abs_val - 0.5 * beta;
    }
  }
}

// Element-wise derivative of smooth-L1, pre-multiplied by norm * d_loss.
// d_loss_data points at the scalar upstream gradient (read once per element).
template <typename T>
__global__ void SmoothL1GradientKernel(
    const int n,
    const T* in,
    T* out,
    const T* d_loss_data,
    T norm,
    T beta) {
  // f'(x) = x / beta if |x| < beta
  //       = sign(x) otherwise
  // We also scale by norm * d_loss in this kernel for convenience
  CUDA_1D_KERNEL_LOOP(index, n) {
    T val = in[index];
    T abs_val = c10::cuda::compat::abs(val);
    T d_loss = *d_loss_data;
    if (abs_val < beta) {
      out[index] = norm * d_loss * val / beta;
    } else {
      // (T(0) < val) - (val < T(0)) evaluates to sign(val) in {-1, 0, +1}.
      out[index] = norm * d_loss * ((T(0) < val) - (val < T(0)));
    }
  }
}

} // namespace
// Forward pass on CUDA:
//   avg_loss = scale / N * sum_i alpha_out[i] *
//              SmoothL1(alpha_in[i] * (Y_hat[i] - Y[i]))
// where N is the batch size (dim 0 of Y).
template<>
bool SmoothL1LossOp<float, CUDAContext>::RunOnDevice() {
  auto& Y_hat = Input(0);
  auto& Y = Input(1);
  auto& alpha_in = Input(2);
  auto& alpha_out = Input(3);

  int N = Y.dim32(0);
  // Require the same number of elements along axis 0 (batch size), but
  // otherwise don't care about the shape (just the number of elements)
  CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0),
      "Y_hat and Y must have the same number of elements along axis 0");
  CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(),
      "Y_hat and Y must have the same number of elements");
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size());
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size());

  // Output 0 is a scalar (empty sizes vector).
  auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
  buff_.ResizeLike(Y);

  // Difference
  // d := y_hat - y
  math::Sub<float, CUDAContext>(
      Y.size(), Y_hat.data<float>(), Y.data<float>(),
      buff_.mutable_data<float>(), &context_);

  // Element-wise weighted difference (can be used to ignore or reweight
  // specific components)
  // d := alpha_in * (y_hat - y)
  math::Mul<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), alpha_in.data<float>(),
      buff_.mutable_data<float>(), &context_);

  // Element-wise smooth l1 loss
  // l := SmoothL1(alpha_in * (y_hat - y))
  SmoothL1Kernel<float>
      <<<CAFFE_GET_BLOCKS(buff_.size()),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
          buff_.size(), buff_.data<float>(), buff_.mutable_data<float>(),
          beta_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Element-wise weighted smooth l1 loss (can be used to specify a per-element
  // loss weight)
  // l := alpha_out * SmoothL1(alpha_in * (y_hat - y))
  math::Mul<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), alpha_out.data<float>(),
      buff_.mutable_data<float>(), &context_);

  // Sum of all losses
  // al := sum_i l_i
  float* avg_loss_data = avg_loss->mutable_data<float>();
  math::Sum<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), avg_loss_data, &context_);

  // Average of input batch size
  // al := 1/N * al
  math::Scale<float, float, CUDAContext>(
      1, scale_ / N, avg_loss_data, avg_loss_data, &context_);

  return true;
}
// Backward pass on CUDA: computes d_Y_hat only.
//   d_Y_hat = alpha_in * alpha_out * d_avg_loss * scale / N *
//             SmoothL1'(alpha_in * (Y_hat - Y))
template<>
bool SmoothL1LossGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& Y_hat = Input(0);
  auto& Y = Input(1);
  auto& alpha_in = Input(2);
  auto& alpha_out = Input(3);
  auto& d_avg_loss = Input(4); // gradient of net w.r.t. avg_loss ("gradOutput")
  // We intentially don't compute gradients for Y, alpha_{in,out} since they
  // are not needed (can change in the future if desired)

  int N = Y.dim32(0);
  // Require the same number of elements along axis 0 (batch size), but
  // otherwise don't care about the shape (just the number of elements)
  CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0),
      "Y_hat and Y must have the same number of elements along axis 0");
  CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(),
      "Y_hat and Y must have the same number of elements");
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size());
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size());
  // Upstream gradient must be a single scalar.
  CAFFE_ENFORCE_EQ(d_avg_loss.size(), 1);

  auto* d_Y_hat = Output(0, Y_hat.sizes(), at::dtype<float>()); // gradient of net w.r.t. Y_hat ("gradInput")
  buff_.ResizeLike(Y);

  // Difference
  // d := y_hat - y
  math::Sub<float, CUDAContext>(
      Y.size(), Y_hat.data<float>(), Y.data<float>(),
      buff_.mutable_data<float>(), &context_);

  // Element-wise weighted difference (can be used to ignore or reweight
  // specific components)
  // d := alpha_in * (y_hat - y)
  math::Mul<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), alpha_in.data<float>(),
      buff_.mutable_data<float>(), &context_);

  // d_Y_hat := d_avg_loss / N * SmoothL1'(alpha_in * (y_hat - y))
  SmoothL1GradientKernel<float>
      <<<CAFFE_GET_BLOCKS(buff_.size()),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
          buff_.size(), buff_.data<float>(), d_Y_hat->mutable_data<float>(),
          d_avg_loss.data<float>(), scale_ / N, beta_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Element-wise scale by alpha_in and alpha_out
  math::Mul<float, CUDAContext>(
      d_Y_hat->size(), d_Y_hat->data<float>(), alpha_in.data<float>(),
      d_Y_hat->mutable_data<float>(), &context_);
  math::Mul<float, CUDAContext>(
      d_Y_hat->size(), d_Y_hat->data<float>(), alpha_out.data<float>(),
      d_Y_hat->mutable_data<float>(), &context_);

  return true;
}
// Register the CUDA specializations defined above.
REGISTER_CUDA_OPERATOR(SmoothL1Loss,
                       SmoothL1LossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SmoothL1LossGradient,
                       SmoothL1LossGradientOp<float, CUDAContext>);
} // namespace caffe2

View File

@ -1,75 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SMOOTH_L1_LOSS_OP_H_
#define SMOOTH_L1_LOSS_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Forward op for smooth-L1 (Huber-style) loss used for bounding-box
// regression. Only a CUDA specialization exists; the generic RunOnDevice
// aborts with CAFFE_NOT_IMPLEMENTED.
template <typename T, class Context>
class SmoothL1LossOp final : public Operator<Context> {
 public:
  SmoothL1LossOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        // "beta": L2-to-L1 transition point (must be > 0).
        beta_(this->template GetSingleArgument<float>("beta", 1.)),
        // "scale": multiplies the final loss (must be >= 0).
        scale_(this->template GetSingleArgument<float>("scale", 1.)) {
    CAFFE_ENFORCE(beta_ > 0);
    CAFFE_ENFORCE(scale_ >= 0);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float beta_; // Transition point from L1 to L2 loss
  float scale_; // Scale the loss by scale_
  Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
};
// Gradient op paired with SmoothL1LossOp; reads the same beta/scale arguments
// so backward matches forward. Only a CUDA specialization exists.
template <typename T, class Context>
class SmoothL1LossGradientOp final : public Operator<Context> {
 public:
  SmoothL1LossGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        beta_(this->template GetSingleArgument<float>("beta", 1.)),
        scale_(this->template GetSingleArgument<float>("scale", 1.)) {
    CAFFE_ENFORCE(beta_ > 0);
    CAFFE_ENFORCE(scale_ >= 0);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float beta_; // Transition point from L1 to L2 loss
  float scale_; // Scale the loss by scale_
  Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
};
} // namespace caffe2
#endif // SMOOTH_L1_LOSS_OP_H_

View File

@ -1,104 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "modules/detectron/softmax_focal_loss_op.h"
#include "caffe2/operators/softmax_utils.h"
namespace caffe2 {

// CPU registrations. The generic RunOnDevice in softmax_focal_loss_op.h is
// CAFFE_NOT_IMPLEMENTED, so these registrations primarily expose the schemas.
REGISTER_CPU_OPERATOR(SoftmaxFocalLoss, SoftmaxFocalLossOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SoftmaxFocalLossGradient,
    SoftmaxFocalLossGradientOp<float, CPUContext>);

// Schema: inputs are logits, labels, and a normalizer scalar; outputs are the
// scalar loss and the softmax probabilities (reused by the gradient op).
OPERATOR_SCHEMA(SoftmaxFocalLoss)
    .NumInputs(3)
    .NumOutputs(2)
    .SetDoc(R"DOC(
A multiclass form of Focal Loss designed for use in RetinaNet-like models.
The input is assumed to be unnormalized scores (sometimes called 'logits')
arranged in a 4D tensor with shape (N, C, H, W), where N is the number of
elements in the batch, H and W are the height and width, and C = num_anchors *
num_classes. The softmax is applied num_anchors times along the C axis.
The softmax version of focal loss is:
FL(p_t) = -alpha * (1 - p_t)**gamma * log(p_t),
where p_i = exp(s_i) / sum_j exp(s_j), t is the target (ground truth) class, and
s_j is the unnormalized score for class j.
See: https://arxiv.org/abs/1708.02002 for details.
)DOC")
    .Arg(
        "scale",
        "(float) default 1.0; multiply the loss by this scale factor.")
    .Arg("alpha", "(float) default 0.25; Focal Loss's alpha hyper-parameter.")
    .Arg("gamma", "(float) default 1.0; Focal Loss's gamma hyper-parameter.")
    .Arg(
        "num_classes",
        "(int) default 81; number of classes in each softmax group.")
    .Input(
        0,
        "scores",
        "4D tensor of softmax inputs (called 'scores' or 'logits') with shape "
        "(N, C, H, W), where C = num_anchors * num_classes defines num_anchors "
        "groups of contiguous num_classes softmax inputs.")
    .Input(
        1,
        "labels",
        "4D tensor of labels with shape (N, num_anchors, H, W). Each entry is "
        "a class label in [0, num_classes - 1] (inclusive).")
    .Input(
        2,
        "normalizer",
        "Scalar; the loss is normalized by 1 / max(1, normalizer).")
    .Output(0, "loss", "Scalar loss.")
    .Output(
        1,
        "probabilities",
        "4D tensor of softmax probabilities with shape (N, C, H, W), where "
        "C = num_anchors * num_classes, and softmax was applied to each of the "
        "num_anchors groups; within a group the num_classes values sum to 1.");

// Gradient schema: reuses the forward's probabilities output (input 3) so the
// softmax does not need to be recomputed in the backward pass.
OPERATOR_SCHEMA(SoftmaxFocalLossGradient)
    .NumInputs(5)
    .NumOutputs(1)
    .Input(0, "scores", "See SoftmaxFocalLoss.")
    .Input(1, "labels", "See SoftmaxFocalLoss.")
    .Input(2, "normalizer", "See SoftmaxFocalLoss.")
    .Input(
        3,
        "probabilities",
        "Output 1 from SoftmaxFocalLoss; See SoftmaxFocalLoss.")
    .Input(4, "d_loss", "Gradient of forward output 0 (loss)")
    .Output(0, "d_scores", "Gradient of forward input 0 (scores)");

// Wires SoftmaxFocalLoss to its gradient: forwards the three inputs, the
// probabilities output O(1), and the loss gradient; only input 0 (scores)
// receives a gradient.
class GetSoftmaxFocalLossGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "SoftmaxFocalLossGradient",
        "",
        vector<string>{I(0), I(1), I(2), O(1), GO(0)},
        vector<string>{GI(0)});
  }
};

REGISTER_GRADIENT(SoftmaxFocalLoss, GetSoftmaxFocalLossGradient);

} // namespace caffe2

View File

@ -1,256 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/softmax_focal_loss_op.h"
namespace caffe2 {
namespace {

// Softmax over each group of num_classes channels, independently at every
// (n, a, y, x) location. One thread handles one location (a full group),
// using the standard max-subtraction trick for numerical stability.
__global__ void SpatialSoftmaxKernel(const int N, const int A,
    const int H, const int W, const float* Xdata, float* Pdata,
    const int num_classes) {
  CUDA_1D_KERNEL_LOOP(index, N * A * H * W) {
    int D = num_classes * A;
    // Decompose the linear index into (i, a, y, x); NCHW layout.
    int x = index % W;
    int y = (index / W) % H;
    int a = (index / (W * H)) % A;
    int i = index / W / H / A;

    // Subtract max on each cell for numerical reasons
    float max_val = -FLT_MAX;
    for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
      int idx = i * (H * W * D) + c * (H * W) + y * W + x;
      max_val = max(max_val, Xdata[idx]);
    }
    // Exponentiate
    float expsum = 0.0f;
    for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
      int idx = i * (H * W * D) + c * (H * W) + y * W + x;
      float expx = exp(Xdata[idx] - max_val);
      Pdata[idx] = expx;
      expsum += expx;
    }
    // Normalize
    for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
      int idx = i * (H * W * D) + c * (H * W) + y * W + x;
      Pdata[idx] /= expsum;
    }
  }
}

// Per-location focal loss: losses[i] = -z * (1 - p_t)^gamma * log(p_t),
// where z folds in alpha (for foreground, label >= 1) or 1 - alpha (for
// background, label == 0) divided by the normalizer Np. Locations with a
// negative label are ignored (loss 0).
__global__ void SoftmaxFocalLossKernel(
    const int N, const int A, const int H, const int W,
    const float* Pdata, const int* targets, float* losses,
    const float* weight_pos, const float gamma, const float alpha,
    const int num_classes) {
  CUDA_1D_KERNEL_LOOP(i, N * A * H * W) {
    int D = A * num_classes;
    int x = i % W;
    int y = (i / W) % H;
    int a = (i / (W * H)) % A;
    int n = i / (W * H * A);
    const int label = static_cast<int>(targets[i]);

    // Np = max(weight_pos, 1) guards against division by zero.
    float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
    float z = (label == 0) * (1 - alpha) / Np +
              (label >= 1) * alpha / Np;

    losses[i] = 0.0;
    if (label >= 0) {
      int offset = a * num_classes;
      int idx = n * (H * W * D) + (offset + label) * (H * W) + y * W + x;
      // FLT_MIN clamp avoids log(0).
      losses[i] =
          -(pow(1.0f - Pdata[idx], gamma) *
            log(c10::cuda::compat::max(Pdata[idx], FLT_MIN))) * z;
    }
  }
}

// Per-location scalar weight for the backward pass:
//   buff[i] = z * d/dp [ (1-p)^gamma * log(p) ] evaluated at p = p_t,
// i.e. the derivative of the focal term w.r.t. the target probability.
__global__ void SoftmaxFocalLossGradientWeightKernel(
    const int N, const int A, const int H, const int W,
    const float* Pdata, const int* targets, float* buff,
    const float* weight_pos, const float gamma, const float alpha,
    const int num_classes) {
  CUDA_1D_KERNEL_LOOP(i, N * A * H * W) {
    int D = A * num_classes;
    int x = i % W;
    int y = (i / W) % H;
    int a = (i / (W * H)) % A;
    int n = i / (W * H * A);
    const int label = static_cast<int>(targets[i]);
    float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
    float z = (label == 0) * (1 - alpha) / Np +
              (label >= 1) * alpha / Np;

    buff[i] = 0.0;
    if (label >= 0) {
      int offset = a * num_classes;
      int idx = n * (H * W * D) + (offset + label) * (H * W) + y * W + x;
      float onemp = 1. - Pdata[idx];
      float p = Pdata[idx];
      buff[i] =
          (-pow(onemp, gamma) +
          gamma * pow(onemp, gamma - 1) * p * log(c10::cuda::compat::max(p, FLT_MIN))) * z;
    }
  }
}

// Expands the per-location weight to a per-channel gradient using the
// softmax Jacobian: dX[n,d,y,x] = d_loss * buff * ((label == c) - P),
// zeroed where the location's label is negative (ignored).
__global__ void SoftmaxFocalLossGradientKernel(
    const int N, const int D, const int H, const int W,
    const float* Pdata, const int* targets, const float* buff,
    const float* d_loss_data, float* dX, const int num_classes) {
  CUDA_1D_KERNEL_LOOP(i, N * D * H * W) {
    int A = D / num_classes;
    int x = i % W;
    int y = (i / W) % H;
    int d = (i / (W * H)) % D;
    int a = d / num_classes;
    int c = d % num_classes;
    int n = i / (W * H * D);
    float d_loss = *d_loss_data;

    // ind indexes the (N, A, H, W) label/weight arrays for this channel.
    int ind = n * (H * W * A) + a * (H * W) + y * W + x;
    const int label = static_cast<int>(targets[ind]);

    float c1 = (label >= 0) * 1.0;
    float c2 = (label == c) * 1.0;
    dX[i] = 0.0;
    dX[i] = c1 * d_loss * buff[ind] * (c2 - Pdata[i]);
  }
}

} // namespace
// Forward pass on CUDA: grouped spatial softmax followed by the focal-loss
// reduction. Outputs the scalar loss and the softmax probabilities (the
// latter is consumed by the gradient op).
template <>
bool SoftmaxFocalLossOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Logits
  auto& T = Input(1); // Labels
  auto& wp = Input(2); // num of foreground
  // average loss as output
  // softmax probability, going to be re-used in gradient

  int N = X.dim32(0);
  int D = X.dim32(1);
  int H = X.dim32(2);
  int W = X.dim32(3);
  // A = number of anchor groups; each group holds num_classes_ channels.
  int A = D / num_classes_;

  ReinitializeTensor(&losses_, {N * A * H * W}, at::dtype<float>().device(CUDA));
  auto* P = Output(1, {N * D * H * W}, at::dtype<float>());
  auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
  math::Set<float, CUDAContext>(
      avg_loss->size(), 0.f, avg_loss->mutable_data<float>(), &context_);
  math::Set<float, CUDAContext>(
      P->size(), 0.f, P->mutable_data<float>(), &context_);
  math::Set<float, CUDAContext>(
      losses_.size(), 0.f, losses_.mutable_data<float>(), &context_);
  TORCH_DCHECK_EQ(X.ndim(), 4);

  const float* Xdata = X.data<float>();
  const float* Wdata = wp.data<float>();

  // Spatial Softmax Kernel
  SpatialSoftmaxKernel
      <<<CAFFE_GET_BLOCKS(N * A * H * W), CAFFE_CUDA_NUM_THREADS,
         0, context_.cuda_stream()>>>(
    N, A, H, W, Xdata, P->mutable_data<float>(), num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Compute loss for each x,y location
  const int* Tdata = T.data<int>();
  SoftmaxFocalLossKernel
      <<<CAFFE_GET_BLOCKS(N * A * H * W), CAFFE_CUDA_NUM_THREADS,
         0, context_.cuda_stream()>>>(
    N, A, H, W, P->data<float>(), Tdata, losses_.mutable_data<float>(),
    Wdata, gamma_, alpha_, num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // sum the losses
  float* avg_loss_data = avg_loss->mutable_data<float>();
  math::Sum<float, CUDAContext>(
      losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
  math::Scale<float, float, CUDAContext>(
      1, scale_, avg_loss_data, avg_loss_data, &context_);

  return true;
}
// Backward pass on CUDA: first computes a per-location weight (the focal-term
// derivative), then expands it to per-channel logit gradients via the softmax
// Jacobian, and finally applies the scale factor.
template<>
bool SoftmaxFocalLossGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Logits
  auto& T = Input(1); // Label
  auto& wp = Input(2); // num of foreground example
  auto& P = Input(3); // Softmax Probability
  auto& d_avg_loss = Input(4);

  int N = X.dim32(0);
  int D = X.dim32(1);
  int H = X.dim32(2);
  int W = X.dim32(3);
  int A = D / num_classes_;

  ReinitializeTensor(&buff_, {N * A * H * W}, at::dtype<float>().device(CUDA));

  auto* dX = Output(0, X.sizes(), at::dtype<float>()); // gradient wrt logits

  const float* Xdata = X.data<float>();
  const int* Tdata = T.data<int>();
  const float* Pdata = P.data<float>();
  const float* Wdata = wp.data<float>();

  // Compute the weight for gradients
  SoftmaxFocalLossGradientWeightKernel
      <<<CAFFE_GET_BLOCKS(N * A * H * W), CAFFE_CUDA_NUM_THREADS,
         0, context_.cuda_stream()>>>(
    N, A, H, W, Pdata, Tdata, buff_.mutable_data<float>(),
    Wdata, gamma_, alpha_, num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Compute the gradient with the weights
  const float* Bdata = buff_.data<float>();
  SoftmaxFocalLossGradientKernel
      <<<CAFFE_GET_BLOCKS(N * D * H * W), CAFFE_CUDA_NUM_THREADS,
         0, context_.cuda_stream()>>>(
    N, D, H, W, Pdata, Tdata, Bdata, d_avg_loss.data<float>(),
    dX->mutable_data<float>(), num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Apply the same scale factor used in the forward pass.
  math::Scale<float, float, CUDAContext>(
    dX->size(),
    scale_,
    dX->data<float>(),
    dX->mutable_data<float>(),
    &context_);
  return true;
}
// Register the CUDA specializations defined above.
REGISTER_CUDA_OPERATOR(SoftmaxFocalLoss,
                       SoftmaxFocalLossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SoftmaxFocalLossGradient,
                       SoftmaxFocalLossGradientOp<float, CUDAContext>);
} // namespace caffe2

View File

@ -1,91 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SOFTMAX_FOCAL_LOSS_OP_H_
#define SOFTMAX_FOCAL_LOSS_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Forward op for the softmax (multiclass) form of focal loss. NCHW only;
// only a CUDA specialization exists — the generic RunOnDevice aborts with
// CAFFE_NOT_IMPLEMENTED.
template <typename T, class Context>
class SoftmaxFocalLossOp final : public Operator<Context> {
 public:
  SoftmaxFocalLossOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        // "scale": multiplies the final loss (must be >= 0).
        scale_(this->template GetSingleArgument<float>("scale", 1.)),
        // "gamma"/"alpha": focal-loss hyper-parameters.
        gamma_(this->template GetSingleArgument<float>("gamma", 1.)),
        alpha_(this->template GetSingleArgument<float>("alpha", 0.25)),
        // "num_classes": classes per softmax group (default 81).
        num_classes_(this->template GetSingleArgument<int>("num_classes", 81)),
        order_(StringToStorageOrder(
            this->template GetSingleArgument<string>("order", "NCHW"))) {
    CAFFE_ENFORCE(scale_ >= 0);
    CAFFE_ENFORCE_EQ(
        order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float scale_;
  float gamma_;
  float alpha_;
  int num_classes_;
  StorageOrder order_;
  // Per-location loss scratch buffer used by the CUDA implementation.
  Tensor losses_;
};
// Gradient op paired with SoftmaxFocalLossOp; reads identical arguments so
// backward matches forward. NCHW only; CUDA specialization only.
template <typename T, class Context>
class SoftmaxFocalLossGradientOp final : public Operator<Context> {
 public:
  SoftmaxFocalLossGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        scale_(this->template GetSingleArgument<float>("scale", 1.)),
        gamma_(this->template GetSingleArgument<float>("gamma", 1.)),
        alpha_(this->template GetSingleArgument<float>("alpha", 0.25)),
        num_classes_(this->template GetSingleArgument<int>("num_classes", 81)),
        order_(StringToStorageOrder(
            this->template GetSingleArgument<string>("order", "NCHW"))) {
    CAFFE_ENFORCE(scale_ >= 0);
    CAFFE_ENFORCE_EQ(
        order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float scale_;
  float gamma_;
  float alpha_;
  int num_classes_;
  StorageOrder order_;
  // Per-location gradient-weight scratch buffer used by the CUDA implementation.
  Tensor buff_;
};
} // namespace caffe2
#endif // SOFTMAX_FOCAL_LOSS_OP_H_

View File

@ -1,79 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "spatial_narrow_as_op.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(SpatialNarrowAs, SpatialNarrowAsOp<CPUContext>);
REGISTER_CPU_OPERATOR(
SpatialNarrowAsGradient,
SpatialNarrowAsGradientOp<CPUContext>);
OPERATOR_SCHEMA(SpatialNarrowAs)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
Reduces ("narrows") the spatial extent of A to that of B by removing rows and
columns from the bottom and right.
)DOC")
.Input(
0,
"A",
"3D or 4D input of shape (N, H0, W0) or (N, C, H0, W0).")
.Input(
1,
"B",
"3D or 4D input of shape (N, H1, W1) or (N, C, H1, W1), where H1 <= H0 "
"and W1 <= W0.")
.Output(
0,
"C",
"Sub window of A containing rows [0, H1 - 1] (inclusive) and columns "
"[0, W1 - 1] (inclusive).");
OPERATOR_SCHEMA(SpatialNarrowAsGradient)
.NumInputs(3)
.NumOutputs(1)
.Input(
0,
"A",
"See SpatialNarrowAs.")
.Input(
1,
"B",
"See SpatialNarrowAs.")
.Input(
2,
"dC",
"Gradient of forward output 0 (C).")
.Output(
0,
"dA",
"Gradient of forward input 0 (A)");
class SpatialNarrowAsGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
"SpatialNarrowAsGradient", "",
vector<string>{I(0), I(1), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(SpatialNarrowAs, SpatialNarrowAsGradient);
} // namespace caffe2

View File

@ -1,165 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"
#include "modules/detectron/spatial_narrow_as_op.h"
namespace caffe2 {
namespace {

// Copies the top-left (out_H x out_W) sub-window of each (N, C) plane from
// the larger (in_H x in_W) input into the smaller output. One thread per
// output element.
template <typename T>
__global__ void CopyKernel(
    const int N,
    const int C,
    const int in_H,
    const int in_W,
    const int out_H,
    const int out_W,
    const T* in_data,
    T* out_data) {
  CUDA_1D_KERNEL_LOOP(index, N * C * out_H * out_W) {
    // Decompose the linear output index into (n, c, h, w).
    int w = index % out_W;
    int h = (index / out_W) % out_H;
    int c = (index / out_W / out_H) % C;
    int n = (index / out_W / out_H / C);
    int in_index = n * C * in_H * in_W + c * in_H * in_W + h * in_W + w;
    int out_index = n * C * out_H * out_W + c * out_H * out_W + h * out_W + w;
    out_data[out_index] = in_data[in_index];
  }
}

// Reverse of CopyKernel: scatters the small (in_H x in_W) gradient into the
// top-left corner of the larger (out_H x out_W) output. One thread per input
// element; the output is pre-zeroed by the caller, so untouched elements
// (bottom/right margin) stay zero.
template <typename T>
__global__ void CopyGradientKernel(
    const int N,
    const int C,
    const int in_H,
    const int in_W,
    const int out_H,
    const int out_W,
    const T* in_data,
    T* out_data) {
  CUDA_1D_KERNEL_LOOP(index, N * C * in_H * in_W) {
    int w = index % in_W;
    int h = (index / in_W) % in_H;
    int c = (index / in_W / in_H) % C;
    int n = (index / in_W / in_H / C);
    int in_index = n * C * in_H * in_W + c * in_H * in_W + h * in_W + w;
    int out_index = n * C * out_H * out_W + c * out_H * out_W + h * out_W + w;
    out_data[out_index] = in_data[in_index];
  }
}

} // namespace
// Type dispatch: supports float and int32 inputs, selected by Input(0)'s dtype.
template <>
bool SpatialNarrowAsOp<CUDAContext>::RunOnDevice() {
  return DispatchHelper<TensorTypes<float_t, int32_t>>::call(this, Input(0));
}
// Forward: output C is the top-left sub-window of A with B's spatial shape.
// Accepts A as 4D (N, C, H, W) with B either 4D or 3D (N, H, W).
template <>
template <typename T>
bool SpatialNarrowAsOp<CUDAContext>::DoRunWithType() {
  // Narrows input 0 (A) spatially to match input 1 (B)
  auto& A = Input(0);
  auto& B = Input(1);
  CAFFE_ENFORCE_EQ(A.dim32(0), B.dim32(0), "Input dim 0 must be equal.");
  std::vector<int64_t> sizes;
  if (A.ndim() == B.ndim()) {
    CAFFE_ENFORCE_EQ(A.dim32(1), B.dim32(1), "Input dim 1 must be equal.");
    CAFFE_ENFORCE_GE(
        A.dim32(2), B.dim32(2), "Input 0 height must be >= input 1 height.");
    CAFFE_ENFORCE_GE(
        A.dim32(3), B.dim32(3), "Input 0 width must be >= input 1 width.");
    sizes = B.sizes().vec();
  } else {
    // For (N, H, W) case: B lacks a channel dim, so take C from A and the
    // spatial extent from B's dims 1 and 2.
    CAFFE_ENFORCE_EQ(A.ndim() - 1, B.ndim(), "Dimension mismatch.");
    CAFFE_ENFORCE_GE(
        A.dim32(2), B.dim32(1), "Input 0 height must be >= input 1 height.");
    CAFFE_ENFORCE_GE(
        A.dim32(3), B.dim32(2), "Input 0 width must be >= input 1 width.");
    sizes = {A.dim32(0), A.dim32(1), B.dim32(1), B.dim32(2)};
  }
  auto* C = Output(0, sizes, at::dtype<T>());
  int out_width = C->dim32(3);
  int out_height = C->dim32(2);
  int in_width = A.dim32(3);
  int in_height = A.dim32(2);
  CopyKernel<T><<<
      CAFFE_GET_BLOCKS(C->size()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      C->dim32(0),
      C->dim32(1),
      in_height,
      in_width,
      out_height,
      out_width,
      A.template data<T>(),
      C->template mutable_data<T>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}
// Type dispatch: supports float and int32 inputs, selected by Input(0)'s dtype.
template <>
bool SpatialNarrowAsGradientOp<CUDAContext>::RunOnDevice() {
  return DispatchHelper<TensorTypes<float_t, int32_t>>::call(this, Input(0));
}
// Backward: dA has A's shape; zero it, then scatter dC into its top-left
// corner (the region CopyKernel read from in the forward pass).
template <>
template <typename T>
bool SpatialNarrowAsGradientOp<CUDAContext>::DoRunWithType() {
  auto& A = Input(0);
  auto& B = Input(1);
  auto& dC = Input(2); // Gradient of net w.r.t. output of forward op
  auto* dA = Output(0, A.sizes(), at::dtype<T>()); // Gradient of net w.r.t. input to forward op

  // Zero-fill so the cropped-away margin keeps zero gradient.
  math::Set<T, CUDAContext>(
      dA->size(), 0.f, dA->template mutable_data<T>(), &context_);
  int out_width = dA->dim32(3);
  int out_height = dA->dim32(2);
  int in_width = dC.dim32(3);
  int in_height = dC.dim32(2);
  CopyGradientKernel<T><<<
      CAFFE_GET_BLOCKS(dC.size()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      dA->dim32(0),
      dA->dim32(1),
      in_height,
      in_width,
      out_height,
      out_width,
      dC.template data<T>(),
      dA->template mutable_data<T>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}
// Register the CUDA specializations defined above.
REGISTER_CUDA_OPERATOR(SpatialNarrowAs, SpatialNarrowAsOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(
    SpatialNarrowAsGradient,
    SpatialNarrowAsGradientOp<CUDAContext>);
} // namespace caffe2

View File

@ -1,63 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SPATIAL_NARROW_AS_OP_H_
#define SPATIAL_NARROW_AS_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Forward op: narrows input A spatially to match another tensor's extent.
// Only a CUDA implementation exists (DoRunWithType is defined in the .cu
// file); running on CPU aborts via CAFFE_NOT_IMPLEMENTED.
template <class Context>
class SpatialNarrowAsOp final : public Operator<Context> {
 public:
  SpatialNarrowAsOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws) {}
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_DISPATCH_HELPER;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  // Type-dispatched implementation, provided per-device elsewhere.
  template <typename T>
  bool DoRunWithType();
};
// Gradient of SpatialNarrowAsOp. Like the forward op, only a CUDA
// implementation exists; the CPU path intentionally aborts.
template <class Context>
class SpatialNarrowAsGradientOp final : public Operator<Context> {
 public:
  SpatialNarrowAsGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws) {}
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_DISPATCH_HELPER;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  // Type-dispatched implementation, provided per-device elsewhere.
  template <typename T>
  bool DoRunWithType();
};
} // namespace caffe2
#endif // SPATIAL_NARROW_AS_OP_H_

View File

@ -1,83 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "upsample_nearest_op.h"
#ifdef USE_MKLDNN
#include "caffe2/ideep/operators/operator_fallback_ideep.h"
#include "caffe2/ideep/utils/ideep_operator.h"
#endif
namespace caffe2 {
#ifdef USE_MKLDNN
REGISTER_IDEEP_OPERATOR(
UpsampleNearest,
IDEEPFallbackOp<UpsampleNearestOp<float, CPUContext>>);
#endif
REGISTER_CPU_OPERATOR(UpsampleNearest, UpsampleNearestOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
UpsampleNearestGradient,
UpsampleNearestGradientOp<float, CPUContext>);
// Schema for the nearest-neighbor upsampling forward op.
// Fix: the Output doc string misspelled "nearest" as "neareast"; this text is
// surfaced to users in generated operator documentation.
OPERATOR_SCHEMA(UpsampleNearest)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Nearest neighbor upsampling operation. Implementation taken from THCUNN.
)DOC")
    .Arg(
        "scale",
        "(int) default 2; integer upsampling factor.")
    .Input(
        0,
        "X",
        "4D feature map input of shape (N, C, H, W).")
    .Output(
        0,
        "Y",
        "4D feature map of shape (N, C, scale * H, scale * W); Values are "
        "nearest neighbor samples from X.");
OPERATOR_SCHEMA(UpsampleNearestGradient)
.NumInputs(2)
.NumOutputs(1)
.Input(
0,
"X",
"See UpsampleNearest.")
.Input(
1,
"dY",
"Gradient of forward output 0 (Y).")
.Output(
0,
"dX",
"Gradient of forward input 0 (X).");
// Gradient maker: wires UpsampleNearestGradient to take the forward input X
// (I(0)) and the output gradient dY (GO(0)), producing dX (GI(0)).
class GetUpsampleNearestGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "UpsampleNearestGradient",
        "",
        vector<string>{I(0), GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(UpsampleNearest, GetUpsampleNearestGradient);
} // namespace caffe2

View File

@ -1,223 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Adapted from https://github.com/torch/cunn/blob/master/lib/THCUNN/SpatialUpSamplingNearest.cu
*
* Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
* Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
* Copyright (c) 2011-2013 NYU (Clement Farabet)
* Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert,
* Leon Bottou, Iain Melvin, Jason Weston)
* Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
* Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
* Samy Bengio, Johnny Mariethoz)
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the names of NEC Laboratories American and IDIAP Research
* Institute nor the names of its contributors may be used to endorse or
* promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/upsample_nearest_op.h"
namespace caffe2 {
namespace {
// Maps a flat index over the upsampled (x, d1, d2, d3) layout to the flat
// index of its nearest-neighbor source element in the input, whose last two
// extents are d2/scale_factor and d3/scale_factor.
__device__ int translate_idx(int ii, int d1, int d2, int d3, int scale_factor) {
  const int w = ii % d3;
  const int z = (ii / d3) % d2;
  const int y = (ii / d3 / d2) % d1;
  const int x = ii / d3 / d2 / d1;
  const int src_d2 = d2 / scale_factor;
  const int src_d3 = d3 / scale_factor;
  const int src_w = w / scale_factor;
  const int src_z = z / scale_factor;
  return (((x * d1 + y) * src_d2) + src_z) * src_d3 + src_w;
}
// Inverse mapping: given a flat index over the input (x, d1, d2, d3) layout,
// returns the flat index of the upsampled element at sub-pixel offset
// (off_y, off_x) within the scale_factor x scale_factor block it feeds.
__device__ int translate_idx_inv(
    int ii, int d1, int d2, int d3, int scale_factor, int off_x, int off_y) {
  const int w = ii % d3;
  const int z = (ii / d3) % d2;
  const int y = (ii / d3 / d2) % d1;
  const int x = ii / d3 / d2 / d1;
  const int dst_d2 = d2 * scale_factor;
  const int dst_d3 = d3 * scale_factor;
  const int dst_w = w * scale_factor + off_x;
  const int dst_z = z * scale_factor + off_y;
  return (((x * d1 + y) * dst_d2) + dst_z) * dst_d3 + dst_w;
}
// Forward kernel: each thread computes one output element by gathering its
// nearest-neighbor source element. The 2-D grid is flattened into a single
// linear index `ii`; threads past the end simply return.
__global__ void upscale(const float *input, float *output, long no_elements,
                        int scale_factor, int d1, int d2, int d3) {
  long ii = threadIdx.x + blockDim.x * blockIdx.x;
  ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
  if (ii >= no_elements) return;
  int ipidx = translate_idx(ii, d1, d2, d3, scale_factor);
  output[ii]=input[ipidx];
}
// Backward kernel: each thread owns one input-gradient element and
// accumulates the scale_factor x scale_factor block of output gradients that
// the corresponding forward element was replicated into. gradInput_data must
// be zeroed by the caller (uses += without initialization here).
__global__ void downscale(float *gradInput_data, const float *gradOutput_data,
                          long no_elements, int scale_factor, int d1, int d2,
                          int d3) {
  long ii = threadIdx.x + blockDim.x * blockIdx.x;
  ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
  if (ii >= no_elements) return;
  for (int i=0; i < scale_factor; i++){
    for(int j=0; j < scale_factor; j++){
      int ipidx = translate_idx_inv(ii, d1, d2, d3, scale_factor, i, j);
      gradInput_data[ii] += gradOutput_data[ipidx];
    }
  }
}
} // namespace
// CUDA forward: resizes Y to X with the last two (spatial) dims scaled by
// scale_, then launches `upscale` with one thread per output element.
template<>
bool UpsampleNearestOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto* Y = Output(0);
  vector<int64_t> out_shape;
  for (int i = 0; i < X.ndim(); ++i) {
    out_shape.push_back(X.dim32(i));
  }
  out_shape[X.ndim() - 1] *= scale_;
  out_shape[X.ndim() - 2] *= scale_;
  Y->Resize(out_shape);

  // Collapse the output into a (d1, d2, d3) view for the kernel's index math.
  int d1;
  int d2;
  int d3;
  if (X.ndim() == 3) {
    d1 = Y->dim32(0);
    d2 = Y->dim32(1);
    d3 = Y->dim32(2);
  } else {
    d1 = Y->dim32(1);
    d2 = Y->dim32(2);
    d3 = Y->dim32(3);
  }
  long no_elements = Y->size();

  const float *input_data = X.data<float>();
  float *output_data = Y->mutable_data<float>();

  // cuda blocks & threads:
  long nthreads = 256;
  // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
  // 65535 for SM 2.x, 2^32 -1 for >= 3.0
  // TODO: When we move to SM 3.5 we should update this
  // Spill overflow past 65535 x-blocks into the grid's y dimension.
  long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
  long n_yblocks = (long)ceil(
      (float)no_elements / (float)(n_xblocks * nthreads));
  CAFFE_ENFORCE(n_yblocks <= 65535);
  dim3 blocks(n_xblocks, n_yblocks);
  dim3 threads(nthreads);

  upscale<<<blocks, threads, 0, context_.cuda_stream()>>>(
      input_data, output_data, no_elements, scale_, d1, d2, d3);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}
// CUDA backward: zero-fills dX (shaped like X) and launches `downscale`,
// which sums each output-gradient block back into its source element.
template<>
bool UpsampleNearestGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);  // Original input to "forward" op
  auto& dY = Input(1);  // Gradient of net w.r.t. output of "forward" op
                        // (aka "gradOutput")
  auto* dX = Output(0);  // Gradient of net w.r.t. input to "forward" op
                         // (aka "gradInput")

  dX->ResizeLike(X);
  float *gradInput_data = dX->mutable_data<float>();
  const float *gradOutput_data = dY.data<float>();

  // Collapse dX into a (d1, d2, d3) view for the kernel's index math.
  int d1;
  int d2;
  int d3;
  if (dX->ndim() == 3) {
    d1 = dX->dim32(0);
    d2 = dX->dim32(1);
    d3 = dX->dim32(2);
  } else {
    d1 = dX->dim32(1);
    d2 = dX->dim32(2);
    d3 = dX->dim32(3);
  }
  long no_elements = dX->size();

  // cuda blocks & threads:
  long nthreads = 256;
  // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
  // 65535 for SM 2.x, 2^32 -1 for >= 3.0
  // TODO: When we move to SM 3.5 we should update this
  long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
  long n_yblocks = (long)ceil(
      (float)no_elements / (float)(n_xblocks * nthreads));
  CAFFE_ENFORCE(n_yblocks <= 65535);
  dim3 blocks(n_xblocks, n_yblocks);
  dim3 threads(nthreads);

  // downscale accumulates with +=, so dX must start at zero.
  math::Set<float, CUDAContext>(no_elements, 0.f, gradInput_data, &context_);
  downscale<<<blocks, threads, 0, context_.cuda_stream()>>>(
      gradInput_data, gradOutput_data, no_elements, scale_, d1, d2, d3);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}
REGISTER_CUDA_OPERATOR(UpsampleNearest,
UpsampleNearestOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(UpsampleNearestGradient,
UpsampleNearestGradientOp<float, CUDAContext>);
} // namespace caffe2

View File

@ -1,106 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef UPSAMPLE_NEAREST_OP_H_
#define UPSAMPLE_NEAREST_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Nearest-neighbor upsampling, CPU implementation. Scales the last two
// (spatial) dims of the input by the integer `scale` argument (default 2);
// each output pixel copies the input pixel at (row/scale, col/scale).
template <typename T, class Context>
class UpsampleNearestOp final : public Operator<Context> {
 public:
  UpsampleNearestOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        scale_(this->template GetSingleArgument<int>("scale", 2)) {
    TORCH_DCHECK_GE(scale_, 1);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    auto& X = Input(0);
    auto out_shape = X.sizes().vec();
    out_shape[X.dim() - 1] *= scale_;
    out_shape[X.dim() - 2] *= scale_;
    auto* Y = Output(0, out_shape, at::dtype<T>());

    // Collapse Y into a (d1, d2, d3) view; all leading dims fold into d1.
    int d1;
    int d2;
    int d3;
    if (X.dim() == 3) {
      d1 = Y->dim32(0);
      d2 = Y->dim32(1);
      d3 = Y->dim32(2);
    } else {
      d1 = Y->dim32(0) * Y->dim32(1);
      d2 = Y->dim32(2);
      d3 = Y->dim32(3);
    }

    const T *input_data = X.template data<T>();
    T *output_data = Y->template mutable_data<T>();
    int scaled_d2 = d2 / scale_;
    int scaled_d3 = d3 / scale_;

// Each (i, j, u) output element is written exactly once, so the outer loop
// parallelizes safely.
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (int i = 0; i < d1; ++i) {
      for (int j = 0; j < d2; ++j) {
        for (int u = 0; u < d3; ++u) {
          int ii = (i * d2 + j) * d3 + u;
          int scaled_u = u / scale_;
          int scaled_j = j / scale_;
          int ipidx = ((i * scaled_d2) + scaled_j) * scaled_d3 + scaled_u;
          output_data[ii] = input_data[ipidx];
        }
      }
    }

    return true;
  }

 protected:
  // Integer upsampling factor; validated >= 1 in the constructor.
  int scale_;
};
// Gradient of UpsampleNearestOp. Only the CUDA specialization (in the .cu
// file) is implemented; the generic CPU path intentionally aborts.
template <typename T, class Context>
class UpsampleNearestGradientOp final : public Operator<Context> {
 public:
  UpsampleNearestGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        scale_(this->template GetSingleArgument<int>("scale", 2)) {
    TORCH_DCHECK_GE(scale_, 1);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  // Integer upsampling factor; must match the forward op's value.
  int scale_;
};
} // namespace caffe2
#endif // UPSAMPLE_NEAREST_OP_H_

View File

@ -1,42 +0,0 @@
import unittest
import caffe2.python.hypothesis_test_util as hu
import hypothesis.strategies as st
import numpy as np
from caffe2.python import core, dyndep
from hypothesis import given, settings
dyndep.InitOpsLibrary("@/caffe2/modules/detectron:detectron_ops")
class TestUpsampleNearestOp(hu.HypothesisTestCase):
    @given(
        N=st.integers(1, 3),
        H=st.integers(10, 300),
        W=st.integers(10, 300),
        scale=st.integers(1, 3),
        **hu.gcs,
    )
    @settings(deadline=None, max_examples=20)
    def test_upsample_nearest_op(self, N, H, W, scale, gc, dc):
        # Check the UpsampleNearest operator against a NumPy reference that
        # gathers each output pixel from its nearest-neighbor source pixel.
        num_channels = 32
        X = np.random.randn(N, num_channels, H, W).astype(np.float32)
        op = core.CreateOperator("UpsampleNearest", ["X"], ["Y"], scale=scale)

        def ref(X):
            # Index grids over the upsampled spatial extent, mapped back to
            # input coordinates by integer division with `scale`.
            row_idxs, col_idxs = np.meshgrid(
                np.arange(H * scale), np.arange(W * scale), indexing="ij"
            )
            src_rows = (row_idxs / scale).astype(np.int32)
            src_cols = (col_idxs / scale).astype(np.int32)
            return [X[:, :, src_rows, src_cols]]

        self.assertReferenceChecks(device_option=gc, op=op, inputs=[X], reference=ref)
if __name__ == "__main__":
unittest.main()

View File

@ -1,23 +0,0 @@
# Builds caffe2_module_test_dynamic, a small shared library loaded at runtime
# by the test suite to verify dynamic module registration.
if(NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
  # If we are building the standalone module, we set the proper cmake variables.
  cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
  find_package(Caffe2 REQUIRED)
  set(BUILD_TEST ON)
  option(BUILD_SHARED_LIBS "Build shared libs." ON)
endif()

# The lite interpreter build does not carry the module-loading machinery this
# test exercises, so skip the target there.
if(BUILD_TEST AND NOT BUILD_LITE_INTERPRETER)
  add_library(
      caffe2_module_test_dynamic
      ${CMAKE_CURRENT_SOURCE_DIR}/module_test_dynamic.cc)
  if(HAVE_SOVERSION)
    set_target_properties(caffe2_module_test_dynamic PROPERTIES
        VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
  endif()
  target_link_libraries(caffe2_module_test_dynamic torch_library)
  install(TARGETS caffe2_module_test_dynamic DESTINATION lib)
  # MSVC keeps debug info in separate .pdb files; ship them when present.
  if(MSVC AND BUILD_SHARED_LIBS)
    install(FILES $<TARGET_PDB_FILE:caffe2_module_test_dynamic> DESTINATION lib OPTIONAL)
  endif()
endif()

View File

@ -1,41 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/core/module.h"
#include "caffe2/core/operator.h"
// An explicitly defined module, testing correctness when we dynamically link a
// module
CAFFE2_MODULE(caffe2_module_test_dynamic, "Dynamic module only used for testing.");
namespace caffe2 {
// No-op operator registered by the dynamically loaded test module; its only
// purpose is to prove that operator registration works across a dlopen'd
// module boundary.
class Caffe2ModuleTestDynamicDummyOp : public OperatorBase {
 public:
  using OperatorBase::OperatorBase;
  bool Run(int /* unused */ /*stream_id*/) override {
    return true;
  }
  virtual string type() {
    return "base";
  }
};
REGISTER_CPU_OPERATOR(
Caffe2ModuleTestDynamicDummy, Caffe2ModuleTestDynamicDummyOp);
OPERATOR_SCHEMA(Caffe2ModuleTestDynamicDummy);
} // namespace caffe2

View File

@ -1,32 +0,0 @@
# Builds the caffe2_observers library (performance observers / reporters).
# Supports two modes: as part of the main PyTorch build (gated on
# USE_OBSERVERS) or as a standalone project against an installed Caffe2.
if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
  if(NOT USE_OBSERVERS)
    return()
  endif()
else()
  cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
  project(caffe2_observers CXX)
  find_package(Caffe2 REQUIRED)
  option(BUILD_SHARED_LIBS "Build shared libs." ON)
endif()

add_library(caffe2_observers
    "${CMAKE_CURRENT_SOURCE_DIR}/net_observer_reporter_print.cc"
    "${CMAKE_CURRENT_SOURCE_DIR}/observer_config.cc"
    "${CMAKE_CURRENT_SOURCE_DIR}/perf_observer.cc"
)
if(HAVE_SOVERSION)
  set_target_properties(caffe2_observers PROPERTIES
      VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
endif()
target_link_libraries(caffe2_observers PUBLIC torch_library)
target_include_directories(caffe2_observers PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/..)
# Switches CAFFE2_OBSERVER_API (observers/macros.h) to export mode.
target_compile_options(caffe2_observers PRIVATE "-DCAFFE2_BUILD_OBSERVER_LIB")
install(TARGETS caffe2_observers DESTINATION lib)
caffe2_interface_library(caffe2_observers caffe2_observers_library)
if(MSVC AND BUILD_SHARED_LIBS)
  install(FILES $<TARGET_PDB_FILE:caffe2_observers> DESTINATION lib OPTIONAL)
endif()
# Expose the interface library to the parent scope so the main build links it.
if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
  set(Caffe2_MODULES ${Caffe2_MODULES} caffe2_observers_library PARENT_SCOPE)
endif()

View File

@ -1,7 +0,0 @@
#include "c10/macros/Macros.h"

// CAFFE2_OBSERVER_API marks symbols crossing the observers library boundary:
// export (dllexport/default visibility) when building the library itself
// (CAFFE2_BUILD_OBSERVER_LIB is set by its CMakeLists), import otherwise.
#ifdef CAFFE2_BUILD_OBSERVER_LIB
#define CAFFE2_OBSERVER_API C10_EXPORT
#else
#define CAFFE2_OBSERVER_API C10_IMPORT
#endif

View File

@ -1,38 +0,0 @@
#pragma once
#include <map>
#include "caffe2/core/common.h"
#include "caffe2/core/net.h"
#include "observers/macros.h"
namespace caffe2 {
// Per-net or per-operator performance record passed to a NetObserverReporter.
struct PerformanceInformation {
  // Analytic (statically derived from the op definition, not measured)
  int64_t flops = 0;
  int64_t bytes_written = 0;
  int64_t bytes_read = 0;
  std::vector<TensorShape> tensor_shapes = {};
  std::vector<Argument> args = {};
  std::string engine = ""; // the engine used
  std::string type = ""; // the type of the operator
  // Measured
  double latency = 0;         // wall-clock time, milliseconds
  double cpuMilliseconds = 0; // process CPU time, milliseconds
};
// Abstract sink for performance data collected by PerfNetObserver; concrete
// implementations decide where the metrics go (e.g. log lines).
class CAFFE2_OBSERVER_API NetObserverReporter {
 public:
  virtual ~NetObserverReporter() = default;

  /*
    Report the delay metric collected by the observer.
    The delays are saved in a map. The key is an identifier associated
    with the reported delay. The value is the delay value in float
  */
  virtual void report(
      NetBase* net,
      std::map<std::string, PerformanceInformation>&) = 0;
};
}

View File

@ -1,158 +0,0 @@
#include "observers/net_observer_reporter_print.h"
#include <algorithm>
#include <sstream>
#include "caffe2/core/init.h"
#include "observers/observer_config.h"
#include <c10/util/irange.h>
namespace caffe2 {
const std::string NetObserverReporterPrint::IDENTIFIER = "Caffe2Observer ";
static std::string get_op_args(PerformanceInformation p);
static std::string get_tensor_shapes(PerformanceInformation p);
static std::string sanatize(std::string json_s);
// Emits one LOG(INFO) line per metric in a hand-assembled JSON-like format,
// prefixed with IDENTIFIER so downstream tooling can grep for the records.
void NetObserverReporterPrint::report(
    NetBase* net,
    std::map<std::string, PerformanceInformation>& info) {
  // Not allowed to use json library
  std::vector<std::map<std::string, std::string>> caffe2_perf;

  for (auto& p : info) {
    // The "NET_DELAY"-only case means operator-level metrics were not
    // collected this run; emit net-level latency and cpu_percent.
    if ((p.first == "NET_DELAY") && (info.size() == 1)) {
      // for Net_delay perf
      caffe2_perf.push_back({{"type", "NET"},
                             {"value", c10::to_string(p.second.latency * 1000)},
                             {"unit", "us"},
                             {"metric", "latency"}});
      caffe2_perf.push_back({{"type", "NET_"},
                             {
                                 "value",
                                 c10::to_string(
                                     p.second.cpuMilliseconds /
                                     p.second.latency *
                                     100),
                             },
                             {"unit", "percent"},
                             {"metric", "cpu_percent"}});
    } else if (p.first != "NET_DELAY") {
      // for operator perf
      std::string shape_str = get_tensor_shapes(p.second);
      std::string args_str = get_op_args(p.second);

      std::string type = p.first;
      caffe2_perf.push_back({{"type", type},
                             {"value", c10::to_string(p.second.latency * 1000)},
                             {"unit", "us"},
                             {"metric", "latency"}});
      caffe2_perf.push_back({{"type", type},
                             {
                                 "value",
                                 c10::to_string(
                                     p.second.cpuMilliseconds /
                                     p.second.latency *
                                     100),
                             },
                             {"unit", "percent"},
                             {"metric", "cpu_percent"}});
      // flops / shapes / args entries are optional and only emitted when
      // populated, to keep the log volume down.
      if (p.second.flops > 0) {
        caffe2_perf.push_back({{"type", type},
                               {"value", c10::to_string(p.second.flops)},
                               {"unit", "flop"},
                               {"metric", "flops"}});
      }
      if (shape_str != "") {
        caffe2_perf.push_back({{"type", type},
                               {"info_string", shape_str},
                               {"unit", ""},
                               {"metric", "tensor_shapes"}});
      }
      if (args_str != "") {
        caffe2_perf.push_back({{"type", type},
                               {"info_string", args_str},
                               {"unit", ""},
                               {"metric", "op_args"}});
      }
    }
  }

  // Serialize each record as a single one-line pseudo-JSON object, sanitizing
  // values so embedded quotes/backslashes cannot break the format.
  // NOLINTNEXTLINE(modernize-loop-convert)
  for (auto it = caffe2_perf.begin(); it != caffe2_perf.end(); it++) {
    std::stringstream buffer;
    auto entry = *it;
    buffer << IDENTIFIER << "{";
    // NOLINTNEXTLINE(modernize-raw-string-literal)
    buffer << "\"type\": \"" << sanatize(entry["type"]) << "\","
           // NOLINTNEXTLINE(modernize-raw-string-literal)
           << "\"unit\": \"" << sanatize(entry["unit"]) << "\","
           // NOLINTNEXTLINE(modernize-raw-string-literal)
           << "\"metric\": \"" << sanatize(entry["metric"]) << "\",";
    if (entry.find("value") != entry.end()) {
      // NOLINTNEXTLINE(modernize-raw-string-literal)
      buffer << "\"value\": \"" << sanatize(entry["value"]) << "\"";
    } else if (entry.find("info_string") != entry.end()) {
      // NOLINTNEXTLINE(modernize-raw-string-literal)
      buffer << "\"info_string\": \"" << sanatize(entry["info_string"]) << "\"";
    }
    buffer << "}";
    LOG(INFO) << buffer.str();
  }
}
// Renders p.tensor_shapes as a nested bracket list, e.g. "[[2, 3, ], ], ]"
// style output with trailing ", " separators; returns "" when there are no
// shapes so the caller can skip the metric entirely.
static std::string get_tensor_shapes(PerformanceInformation p) {
  if (p.tensor_shapes.empty()) {
    return "";
  }
  std::stringstream out;
  out << "[";
  for (const auto& shape : p.tensor_shapes) {
    out << "[";
    for (int j = 0; j < shape.dims_size(); j++) {
      out << shape.dims(j) << ", ";
    }
    out << "], ";
  }
  out << "]";
  return out.str();
}
// Renders p.args as "[{name: value}, ...]" with trailing ", " separators,
// picking the first populated field of each Argument (int, string, netdef
// pointer, float) and "None" otherwise; returns "" when there are no args.
static std::string get_op_args(PerformanceInformation p) {
  if (p.args.empty()) {
    return "";
  }
  std::stringstream out;
  out << "[";
  for (const auto& arg : p.args) {
    out << "{" << arg.name() << ": ";
    if (arg.has_i()) {
      out << arg.i();
    } else if (arg.has_s()) {
      out << arg.s();
    } else if (arg.has_n()) {
      // Nested-net arguments are identified by address only.
      out << &arg.n();
    } else if (arg.has_f()) {
      out << arg.f();
    } else {
      out << "None";
    }
    out << "}, ";
  }
  out << "]";
  return out.str();
}
// Strips characters that would corrupt the hand-built JSON-like log line:
// embedded double quotes and backslashes. Takes the string by value and
// returns the cleaned copy.
static std::string sanatize(std::string json_s) {
  auto is_illegal = [](char c) { return c == '"' || c == '\\'; };
  json_s.erase(
      std::remove_if(json_s.begin(), json_s.end(), is_illegal), json_s.end());
  return json_s;
}
}

View File

@ -1,16 +0,0 @@
#pragma once
#include "observers/macros.h"
#include "observers/net_observer_reporter.h"
#include "caffe2/core/common.h"
namespace caffe2 {
// Reporter that serializes performance records to LOG(INFO), one line per
// metric, each prefixed with IDENTIFIER for easy grepping by tooling.
class CAFFE2_OBSERVER_API NetObserverReporterPrint : public NetObserverReporter {
 public:
  static const std::string IDENTIFIER;
  void report(NetBase* net, std::map<std::string, PerformanceInformation>&) override;
};
} // namespace caffe2

View File

@ -1,12 +0,0 @@
#include "observers/observer_config.h"

namespace caffe2 {

// Static storage for ObserverConfig. All sampling knobs default to 0
// (sampling disabled) and no reporter is installed until setReporter() runs;
// marker_ starts at -1 (unset).
int ObserverConfig::netInitSampleRate_ = 0;
int ObserverConfig::netFollowupSampleRate_ = 0;
int ObserverConfig::netFollowupSampleCount_ = 0;
int ObserverConfig::operatorNetSampleRatio_ = 0;
int ObserverConfig::skipIters_ = 0;
unique_ptr<NetObserverReporter> ObserverConfig::reporter_ = nullptr;
int ObserverConfig::marker_ = -1;
}

View File

@ -1,99 +0,0 @@
#pragma once
#include "observers/macros.h"
#include "observers/net_observer_reporter.h"
#include "caffe2/core/common.h"
namespace caffe2 {
/*
netInitSampleRate_ == 1 && operatorNetSampleRatio_ == 1 :
Log operator metrics in every iteration
netInitSampleRate_ == 1 && operatorNetSampleRatio_ == 0 :
Log net metrics in every iterationn
netInitSampleRate_ == n && netFollowupSampleRate_ == m &&
netFollowupSampleCount == c && operatorNetSampleRatio_ == 1 :
Log operator metrics first at odds of 1 / n. Once first logged,
the following c logs are at odds of 1 / min(n, m). Then repeat
netInitSampleRate_ == n && netFollowupSampleRate_ == m &&
netFollowupSampleCount == c && operatorNetSampleRatio_ == 0 :
Log net metrics first at odds of 1 / n. Once first logged,
the following c logs are at odds of 1 / min(n, m). Then repeat
netInitSampleRate_ == n && netFollowupSampleRate_ == m &&
netFollowupSampleCount == c && operatorNetSampleRatio_ == o :
Log net metrics first at odds of 1 / n. Once first logged,
the following c logs are at odds of 1 / min(n, m), if the random number
is multiples of o, log operator metrics instead. Then repeat
skipIters_ == n: skip the first n iterations of the net.
*/
// Process-wide (static-only) configuration for performance-observer sampling.
// See the sampling-semantics comment above for how the rates combine.
// NOTE(review): the static fields are written without synchronization;
// presumably configured once at startup before nets run -- confirm.
class CAFFE2_OBSERVER_API ObserverConfig {
 public:
  // Installs the sampling odds; followup rate must not exceed the initial
  // rate, and a followup rate >= 1 is required unless sampling is disabled.
  static void initSampleRate(
      int netInitSampleRate,
      int netFollowupSampleRate,
      int netFollowupSampleCount,
      int operatorNetSampleRatio,
      int skipIters) {
    CAFFE_ENFORCE(netFollowupSampleRate <= netInitSampleRate);
    CAFFE_ENFORCE(netFollowupSampleRate >= 1 || netInitSampleRate == 0);
    netInitSampleRate_ = netInitSampleRate;
    netFollowupSampleRate_ = netFollowupSampleRate;
    netFollowupSampleCount_ = netFollowupSampleCount;
    operatorNetSampleRatio_ = operatorNetSampleRatio;
    skipIters_ = skipIters;
  }
  static int getNetInitSampleRate() {
    return netInitSampleRate_;
  }
  static int getNetFollowupSampleRate() {
    return netFollowupSampleRate_;
  }
  static int getNetFollowupSampleCount() {
    return netFollowupSampleCount_;
  }
  // NOTE(review): "Opoerator" typo is part of the public interface; callers
  // elsewhere use this spelling, so it is left as-is.
  static int getOpoeratorNetSampleRatio() {
    return operatorNetSampleRatio_;
  }
  static int getSkipIters() {
    return skipIters_;
  }
  static void setReporter(unique_ptr<NetObserverReporter> reporter) {
    reporter_ = std::move(reporter);
  }
  // Fails hard if no reporter was installed via setReporter().
  static NetObserverReporter* getReporter() {
    CAFFE_ENFORCE(reporter_);
    return reporter_.get();
  }
  static void setMarker(int marker) {
    marker_ = marker;
  }
  static int getMarker() {
    return marker_;
  }

 private:
  /* The odds of log net metric initially or immediately after reset */
  static int netInitSampleRate_;
  /* The odds of log net metric after log once after start of reset */
  static int netFollowupSampleRate_;
  /* The number of follow up logs to be collected for odds of
     netFollowupSampleRate_ */
  static int netFollowupSampleCount_;
  /* The odds to log the operator metric instead of the net metric.
     When the operator is logged the net is not logged. */
  static int operatorNetSampleRatio_;
  /* skip the first few iterations */
  static int skipIters_;
  static unique_ptr<NetObserverReporter> reporter_;
  /* marker used in identifying the metrics in certain reporters */
  static int marker_;
};
}

View File

@ -1,330 +0,0 @@
#include "observers/perf_observer.h"
#include "observers/observer_config.h"
#ifndef C10_MOBILE
#include "caffe2/core/flags.h"
#include "observers/net_observer_reporter_print.h"
#endif
#include <random>
// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <time.h>
#include "caffe2/core/common.h"
#include "caffe2/core/init.h"
#include "caffe2/core/operator.h"
#if defined(TARGET_OS_MAC) || \
defined(TARGET_OS_IPHONE) || \
defined(TARGET_IPHONE_SIMULATOR)
#define _APPLE 1
#endif
#ifdef _WIN32
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
#endif
#ifdef _APPLE
#include <mach/mach_time.h>
#include <sys/time.h>
#include <sys/resource.h>
#endif
#ifndef C10_MOBILE
C10_DEFINE_int64(
aiBench_netInitSampleRate,
0,
"One in N sampling rate for net delay");
C10_DEFINE_int64(
aiBench_netFollowupSampleRate,
0,
"One in N sampling rate for net delay");
C10_DEFINE_int64(
aiBench_netFollowupSampleCount,
0,
"control the following c logs");
C10_DEFINE_int64(
aiBench_operatorNetSampleRatio,
0,
"One in N sampling rate for operator delay");
C10_DEFINE_int64(
aiBench_skipIters,
0,
"skip the first N iterations of the net run");
#endif
namespace caffe2 {
namespace {
// Early-init hook: attaches a PerfNetObserver to every net created in the
// process, and (off-mobile) installs the print reporter configured from the
// aiBench_* command-line flags.
bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) {
  AddGlobalNetObserverCreator([](NetBase* subject) {
    return std::make_unique<PerfNetObserver>(subject);
  });

#if !defined(C10_MOBILE)
  // for aibench usage
  caffe2::ObserverConfig::setReporter(
      std::make_unique<caffe2::NetObserverReporterPrint>());

  caffe2::ObserverConfig::initSampleRate(
      FLAGS_aiBench_netInitSampleRate,
      FLAGS_aiBench_netFollowupSampleRate,
      FLAGS_aiBench_netFollowupSampleCount,
      FLAGS_aiBench_operatorNetSampleRatio,
      FLAGS_aiBench_skipIters);
#endif

  return true;
}
} // namespace
#ifdef _WIN32
// Windows only: returns the QueryPerformanceCounter frequency expressed as
// ticks per millisecond, caching it in a function-local static. Returns 0.0
// if the frequency query fails, which callers treat as "timer unavailable".
double getTicksPerMillisecond() {
  static LARGE_INTEGER ticks_per_sec;
  // Lazily initialize on first call (QuadPart stays 0 until queried).
  if (!ticks_per_sec.QuadPart) {
    QueryPerformanceFrequency(&ticks_per_sec);
    if (!ticks_per_sec.QuadPart) {
      return 0.0;
    }
  }
  return static_cast<double>(ticks_per_sec.QuadPart) / 1000.0;
}
#elif !defined _APPLE
// Reads the given POSIX clock and converts the reading to milliseconds.
// Returns 0.0 when clock_gettime() fails, matching the Windows/macOS
// fallbacks in this file.
double getClockTimeMilliseconds(clockid_t clk_id) {
  struct timespec ts;
  if (clock_gettime(clk_id, &ts) == -1) {
    return 0.0;
  }
  return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
}
#endif
// Wall-clock time in milliseconds from a monotonic source, chosen per
// platform: QueryPerformanceCounter (Windows), mach_absolute_time (Apple),
// CLOCK_MONOTONIC (other POSIX). Returns 0.0 when the timer is unavailable.
double getWallClockTimeMilliseconds() {
#ifdef _WIN32
  double ticks_per_ms = getTicksPerMillisecond();
  if (ticks_per_ms) {
    LARGE_INTEGER ticks;
    if (QueryPerformanceCounter(&ticks)) {
      return static_cast<double>(ticks.QuadPart) / ticks_per_ms;
    }
  }
  return 0.0;
#elif defined _APPLE
  // Timebase ratio is fixed per boot; cache it on first use.
  static mach_timebase_info_data_t info;
  if (info.denom == 0) {
    mach_timebase_info(&info);
  }

  uint64_t now = mach_absolute_time();
  now = now * info.numer / info.denom; // convert to nanoseconds
  return now / 1000000.0;
#else
  return getClockTimeMilliseconds(CLOCK_MONOTONIC);
#endif
}
// Total process CPU time (user + kernel) in milliseconds, per platform:
// GetProcessTimes (Windows), getrusage (Apple), CLOCK_PROCESS_CPUTIME_ID
// (other POSIX). Returns 0.0 when the measurement is unavailable.
double getCpuTimeMilliseconds() {
#ifdef _WIN32
  FILETIME creation_time;
  FILETIME exit_time;
  FILETIME kernel_time;
  FILETIME user_time;
  if (GetProcessTimes(
          GetCurrentProcess(),
          &creation_time,
          &exit_time,
          &kernel_time,
          &user_time)) {
    ULARGE_INTEGER kernel;
    ULARGE_INTEGER user;
    kernel.HighPart = kernel_time.dwHighDateTime;
    kernel.LowPart = kernel_time.dwLowDateTime;
    user.HighPart = user_time.dwHighDateTime;
    user.LowPart = user_time.dwLowDateTime;
    // FILETIME counts 100-ns intervals; 10000 of them per millisecond.
    return (static_cast<double>(kernel.QuadPart) +
            static_cast<double>(user.QuadPart)) / 10000.0;
  }
  return 0.0;
#elif defined _APPLE
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
  struct rusage ru;
  if (getrusage(RUSAGE_SELF, &ru)) {
    return 0.0;
  }
  return ru.ru_utime.tv_sec * 1000.0
      + ru.ru_utime.tv_usec / 1000.0
      + ru.ru_stime.tv_sec * 1000.0
      + ru.ru_stime.tv_usec / 1000.0;
#else
  return getClockTimeMilliseconds(CLOCK_PROCESS_CPUTIME_ID);
#endif
}
REGISTER_CAFFE2_EARLY_INIT_FUNCTION(
registerGlobalPerfNetObserverCreator,
&registerGlobalPerfNetObserverCreator,
"Caffe2 net global observer creator");
// Observer for one net; other members are initialized at their in-class
// declarations (hence the suppressed member-init lint).
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
PerfNetObserver::PerfNetObserver(NetBase* subject_)
    : NetObserver(subject_), numRuns_(0) {}
// Out-of-line defaulted destructor (kept out-of-line so the header need not
// see the full types of members). Replaces an empty `{}` body that was only
// suppressing clang-tidy's modernize-use-equals-default.
PerfNetObserver::~PerfNetObserver() = default;
// Called before each net run: decides (by sampled odds from ObserverConfig)
// whether this run logs nothing, net-level delay, or per-operator delay, and
// attaches per-operator observers in the latter case. Timing starts at the
// end of this function.
void PerfNetObserver::Start() {
  // NOTE(review): function-local static shared by all PerfNetObserver
  // instances, mutated without synchronization -- confirm single-threaded
  // net startup is guaranteed.
  static int visitCount = 0;
  // Select whether to log the operator or the net.
  // We have one sample rate for the entire app.
  int netInitSampleRate = ObserverConfig::getNetInitSampleRate();
  int netFollowupSampleRate = ObserverConfig::getNetFollowupSampleRate();
  int netFollowupSampleCount = ObserverConfig::getNetFollowupSampleCount();
  int operatorNetSampleRatio = ObserverConfig::getOpoeratorNetSampleRatio();
  int skipIters = ObserverConfig::getSkipIters();
  int sampleRate = visitCount > 0 ? netFollowupSampleRate : netInitSampleRate;
  // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
  if (skipIters <= static_cast<int>(numRuns_) && sampleRate > 0 && rand() % sampleRate == 0) {
    visitCount++;
    if (visitCount == netFollowupSampleCount) {
      visitCount = 0;
    }
    // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
    if (operatorNetSampleRatio > 0 && rand() % operatorNetSampleRatio == 0) {
      logType_ = PerfNetObserver::OPERATOR_DELAY;
    } else {
      logType_ = PerfNetObserver::NET_DELAY;
    }
  } else {
    logType_ = PerfNetObserver::NONE;
  }
  numRuns_++;

  if (logType_ == PerfNetObserver::OPERATOR_DELAY) {
    /* Always recreate new operator observers
       whenever we measure operator delay */
    const auto& operators = subject_->GetOperators();
    for (auto* op : operators) {
      observerMap_[op] = op->AttachObserver(
          std::make_unique<PerfOperatorObserver>(op, this));
    }
  }

  // Baseline timestamps; Stop() subtracts these to get per-run deltas.
  wallMilliseconds_ = getWallClockTimeMilliseconds();
  cpuMilliseconds_ = getCpuTimeMilliseconds();
}
// Called after each net run. No-op unless Start() chose to sample this run.
// Computes net-level wall/CPU deltas, optionally collects per-operator stats
// from the PerfOperatorObservers attached in Start(), then hands the whole
// report to the reporter configured via ObserverConfig.
void PerfNetObserver::Stop() {
  if (logType_ == PerfNetObserver::NONE) {
    return;
  }
  // Report keyed by observer name: "NET_DELAY" plus one entry per operator.
  std::map<std::string, PerformanceInformation> info;
  PerformanceInformation net_perf;
  net_perf.cpuMilliseconds =
      getCpuTimeMilliseconds() - cpuMilliseconds_;
  net_perf.latency =
      getWallClockTimeMilliseconds() - wallMilliseconds_;
  if (logType_ == PerfNetObserver::OPERATOR_DELAY) {
    const auto& operators = subject_->GetOperators();
    for (unsigned idx = 0; idx < operators.size(); ++idx) {
      const auto* op = operators[idx];
      auto name = getObserverName(op, static_cast<int>(idx));
      PerformanceInformation p;
      // The observer attached in Start() holds this operator's deltas.
      const PerfOperatorObserver* opObserver =
          static_cast<const PerfOperatorObserver*>(observerMap_[op]);
      p.latency = opObserver->getWallMilliseconds();
      p.cpuMilliseconds = opObserver->getCpuMilliseconds();
      p.engine = op->engine();
      p.type = op->type();
      p.tensor_shapes =
          static_cast<const PerfOperatorObserver*>(observerMap_[op])
              ->getTensorShapes();
      if (op->has_debug_def()) {
        // Copy the operator's arguments from its debug definition.
        // NOLINTNEXTLINE(performance-for-range-copy)
        for (auto arg : op->debug_def().arg()) {
          p.args.emplace_back(arg);
        }
      }
      info.insert({name, p});
    }
    /* clear all operator delay after use so that we don't spent time
       collecting the operator delay info in later runs */
    for (auto* op : operators) {
      op->DetachObserver(observerMap_[op]);
    }
    observerMap_.clear();
  }
  info.insert({"NET_DELAY", net_perf});
  ObserverConfig::getReporter()->report(subject_, info);
}
// Builds a stable per-operator report key of the form
// "ID_<idx>_<type>_<name>", falling back to placeholder segments
// ("NO_TYPE", "NO_DEF", "NO_OUTPUT") when the operator lacks a debug
// definition, a name, or any output.
caffe2::string PerfNetObserver::getObserverName(const OperatorBase* op, int idx)
    const {
  string opType = "NO_TYPE";
  string displayName = "NO_DEF";
  if (op->has_debug_def()) {
    opType = op->debug_def().type();
    if (op->debug_def().name().size()) {
      displayName = op->debug_def().name();
    } else if (op->debug_def().output_size()) {
      displayName = op->debug_def().output(0);
    } else {
      displayName = "NO_OUTPUT";
    }
  }
  return "ID_" + c10::to_string(idx) + "_" + opType + "_" + displayName;
}
// Attaches a per-operator timing observer to `op`. A non-null owning
// PerfNetObserver is mandatory (enforced below); per the header's lifetime
// note, the net — and hence `netObserver` — must outlive this observer.
PerfOperatorObserver::PerfOperatorObserver(
    OperatorBase* op,
    PerfNetObserver* netObserver)
    : ObserverBase<OperatorBase>(op),
      netObserver_(netObserver),
      wallMilliseconds_(0),
      cpuMilliseconds_(0) {
  CAFFE_ENFORCE(netObserver_, "Observers can't operate outside of the net");
}
// Out-of-line defaulted destructor: nothing to clean up manually.
// (Previously an empty body with a NOLINT suppressing
// modernize-use-equals-default; defaulting it fixes the lint instead.)
PerfOperatorObserver::~PerfOperatorObserver() = default;
// Called just before the operator executes: stores wall/CPU timestamps as
// baselines. Stop() later converts these fields in place into elapsed deltas.
void PerfOperatorObserver::Start() {
  wallMilliseconds_ = getWallClockTimeMilliseconds();
  cpuMilliseconds_ = getCpuTimeMilliseconds();
}
// Called just after the operator finishes: converts the baselines captured in
// Start() into elapsed wall/CPU durations for this single operator run, and
// snapshots the operator's input tensor shapes for the report.
void PerfOperatorObserver::Stop() {
  /* Time from the start of the net minus the time spent on all other
     operators is the time spent on this operator */
  // Note the fields are reused: timestamps on entry, elapsed deltas on exit.
  cpuMilliseconds_ =
      getCpuTimeMilliseconds() - cpuMilliseconds_;
  wallMilliseconds_ =
      getWallClockTimeMilliseconds() - wallMilliseconds_;
  tensor_shapes_ = subject_->InputTensorShapes();
}
// Elapsed wall-clock time of the observed operator run; meaningful only after
// Stop() has converted the Start() baseline into a delta.
double PerfOperatorObserver::getWallMilliseconds() const { return wallMilliseconds_; }

// Elapsed CPU time of the observed operator run; same caveat as above.
double PerfOperatorObserver::getCpuMilliseconds() const { return cpuMilliseconds_; }

// Input tensor shapes snapshotted in Stop(); returned by value.
std::vector<TensorShape> PerfOperatorObserver::getTensorShapes() const { return tensor_shapes_; }
} // namespace caffe2

View File

@ -1,66 +0,0 @@
#pragma once
#include "caffe2/core/common.h"
#include "caffe2/core/net.h"
#include "caffe2/core/observer.h"
#include "caffe2/core/timer.h"
#include "observers/macros.h"
#include <unordered_map>
namespace caffe2 {
double getClockTimeMilliseconds();
// Net-level performance observer. On a randomly sampled subset of net runs it
// measures wall-clock and CPU time for the whole net; on a further-sampled
// subset it also attaches a PerfOperatorObserver to every operator to collect
// per-operator timings. Results are delivered through the reporter configured
// via ObserverConfig.
class CAFFE2_OBSERVER_API PerfNetObserver : public NetObserver {
 public:
  explicit PerfNetObserver(NetBase* subject_);
  virtual ~PerfNetObserver();

 private:
  void Start() override;
  void Stop() override;
  // Builds the per-operator report key "ID_<idx>_<type>_<name>".
  caffe2::string getObserverName(const OperatorBase* op, int idx) const;

 private:
  // What Start() decided to record for the current run.
  enum LogType {
    NONE,
    OPERATOR_DELAY,
    NET_DELAY,
  };
  LogType logType_;
  // Number of runs seen so far; compared against the skip-iters threshold
  // before sampling begins.
  unsigned int numRuns_;
  // Operator -> attached observer; populated only for OPERATOR_DELAY runs
  // and cleared again in Stop().
  std::unordered_map<const OperatorBase*, const ObserverBase<OperatorBase>*>
      observerMap_;
  // Baseline timestamps captured in Start(); Stop() subtracts them to obtain
  // the net's elapsed wall/CPU time.
  double wallMilliseconds_;
  double cpuMilliseconds_;
};
// Per-operator timing observer, attached by PerfNetObserver for runs where
// operator-level delay is sampled. Captures wall/CPU elapsed time and the
// input tensor shapes of a single operator execution.
class PerfOperatorObserver : public ObserverBase<OperatorBase> {
 public:
  PerfOperatorObserver(OperatorBase* op, PerfNetObserver* netObserver);
  virtual ~PerfOperatorObserver();

  // Accessors for the measurements; meaningful only after Stop() has run.
  double getWallMilliseconds() const;
  double getCpuMilliseconds() const;
  std::vector<TensorShape> getTensorShapes() const;

 private:
  void Start() override;
  void Stop() override;

 private:
  // Observer of a net that owns corresponding op. We make sure net is never
  // destructed while operator observer is still alive. First operator observer
  // gets destructed, then the op, then the net and its observer.
  // We do this trick in order to get access to net's name and other fields
  // without storing them inside the operator observer. Each field is memory
  // costly here and a raw pointer is the cheapest solution.
  PerfNetObserver* netObserver_;
  // Set to clock baselines in Start(); converted in place to elapsed deltas
  // in Stop().
  double wallMilliseconds_;
  double cpuMilliseconds_;
  std::vector<TensorShape> tensor_shapes_;
};
} // namespace caffe2

View File

@ -88,12 +88,6 @@
# disables use of system-wide nccl (we will use our submoduled
# copy in third_party/nccl)
#
# BUILD_CAFFE2_OPS=0
# disable Caffe2 operators build
#
# BUILD_CAFFE2=0
# disable Caffe2 build
#
# USE_IBVERBS
# toggle features related to distributed support
#

View File

@ -145,26 +145,6 @@ void validateBlock(
"\n\nDefined at:\n" + getNodeStackTraceString(node))
}
} else {
#ifdef BUILD_CAFFE2
// Assuming this is a Caffe2 change as it only modifies an aten op
// for operator_export_type == ONNX_ATEN_FALLBACK, which is a common
// pattern for Caffe2-specific scenarios.
if (node->kind() == aten::expand) {
if (operator_export_type ==
onnx_torch::OperatorExportTypes::ONNX_ATEN_FALLBACK) {
WithInsertPoint guard(node);
auto* new_node =
b->owningGraph()->insertNode(b->owningGraph()->create(
Symbol(::c10::aten::ATen),
node->inputs(),
node->outputs().size()));
for (size_t i = 0; i < node->outputs().size(); ++i) {
node->output(i)->replaceAllUsesWith(new_node->output(i));
}
new_node->s_(Symbol::fromQualString("attr::operator"), "expand");
}
}
#endif
if (node->kind() == prim::PackPadded || node->kind() == prim::PadPacked) {
if (operator_export_type !=
onnx_torch::OperatorExportTypes::ONNX_FALLTHROUGH) {

View File

@ -293,10 +293,6 @@ void initONNXBindings(PyObject* module) {
onnx.attr("PRODUCER_VERSION") = py::str(TORCH_VERSION);
#ifdef BUILD_CAFFE2
onnx.attr("_CAFFE2_ATEN_FALLBACK") = true;
#else
onnx.attr("_CAFFE2_ATEN_FALLBACK") = false;
#endif
}
} // namespace torch::onnx

View File

@ -350,11 +350,6 @@ def export(
%3 : Float = onnx::Mul(%2, %0)
return (%3)
If PyTorch was built with Caffe2 (i.e. with ``BUILD_CAFFE2=1``), then
Caffe2-specific behavior will be enabled, including special support
for ops that are produced by the modules described in
`Quantization <https://pytorch.org/docs/stable/quantization.html>`_.
.. warning::
Models exported this way are probably runnable only by Caffe2.
@ -1802,9 +1797,8 @@ def _add_output_to_block(block: _C.Block, value: _C.Value) -> int:
def _should_aten_fallback(
name: str, opset_version: int, operator_export_type: _C_onnx.OperatorExportTypes
):
# For BUILD_CAFFE2=0 builds, if domain=="aten" and operator_export_type==ONNX_ATEN,
# For all builds, if domain=="aten" and operator_export_type==ONNX_ATEN,
# an aten::ATen operator is created regardless of symbolics existence
# For BUILD_CAFFE2=1, the same applies only if there is no symbolic available
is_exportable_aten_op = registration.registry.is_registered_op(name, opset_version)
is_onnx_aten_export = operator_export_type == _C_onnx.OperatorExportTypes.ONNX_ATEN