Remove caffe2 from more build files (#125898)

Co-authored-by: Aaron Gokaslan <aaronGokaslan@gmail.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125898
Approved by: https://github.com/Skylion007
This commit is contained in:
Richard Barnes
2024-05-13 18:37:54 +00:00
committed by PyTorch MergeBot
parent b620231378
commit b9e7b35912
70 changed files with 36 additions and 5361 deletions

View File

@ -44,11 +44,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
fi
fi
if [[ ${BUILD_ENVIRONMENT} == *"caffe2"* ]]; then
echo "Caffe2 build is ON"
export BUILD_CAFFE2=ON
fi
if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then
export ATEN_THREADING=TBB
export USE_TBB=1

View File

@ -181,11 +181,7 @@ option(BUILD_BINARY "Build C++ binaries" OFF)
option(BUILD_DOCS "Build Caffe2 documentation" OFF)
option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON)
option(BUILD_PYTHON "Build Python binaries" ON)
option(BUILD_CAFFE2 "Master flag to build Caffe2" OFF)
option(BUILD_LITE_INTERPRETER "Master flag to build Lite Interpreter" OFF)
cmake_dependent_option(
BUILD_CAFFE2_OPS "Build Caffe2 operators" ON
"BUILD_CAFFE2" OFF)
option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON)
cmake_dependent_option(
CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." ON
@ -635,7 +631,6 @@ if(INTERN_BUILD_MOBILE)
endif()
set(BUILD_PYTHON OFF)
set(BUILD_FUNCTORCH OFF)
set(BUILD_CAFFE2_OPS OFF)
set(USE_DISTRIBUTED OFF)
set(NO_API ON)
set(USE_FBGEMM OFF)
@ -1208,13 +1203,6 @@ else()
"shared libs.")
endif()
# ---[ Modules
# If master flag for building Caffe2 is disabled, we also disable the
# build for Caffe2 related operator modules.
if(BUILD_CAFFE2)
add_subdirectory(modules)
endif()
# ---[ Binaries
# Binaries will be built after the Caffe2 main libraries and the modules
# are built. For the binaries, they will be linked to the Caffe2 main

View File

@ -667,7 +667,6 @@ only interested in a specific component.
- Working on a test binary? Run `(cd build && ninja bin/test_binary_name)` to
rebuild only that test binary (without rerunning cmake). (Replace `ninja` with
`make` if you don't have ninja installed).
- Don't need Caffe2? Pass `BUILD_CAFFE2=0` to disable Caffe2 build.
On the initial build, you can also speed things up with the environment
variables `DEBUG`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_CUDA`, `USE_FLASH_ATTENTION`, `USE_MEM_EFF_ATTENTION`, `BUILD_TEST`, `USE_FBGEMM`, `USE_NNPACK` and `USE_QNNPACK`.
@ -1196,7 +1195,7 @@ build_with_asan()
LDFLAGS="-stdlib=libstdc++" \
CFLAGS="-fsanitize=address -fno-sanitize-recover=all -shared-libasan -pthread" \
CXX_FLAGS="-pthread" \
USE_CUDA=0 USE_OPENMP=0 BUILD_CAFFE2_OPS=0 USE_DISTRIBUTED=0 DEBUG=1 \
USE_CUDA=0 USE_OPENMP=0 USE_DISTRIBUTED=0 DEBUG=1 \
python setup.py develop
}

View File

@ -379,7 +379,7 @@ You can also pass the `CMAKE_VARS="..."` environment variable to specify additio
See [setup.py](./setup.py) for the list of available variables.
```bash
CMAKE_VARS="BUILD_CAFFE2=ON BUILD_CAFFE2_OPS=ON" make -f docker.Makefile
make -f docker.Makefile
```
### Building the Documentation

View File

@ -54,7 +54,7 @@ if(NOT BUILD_LITE_INTERPRETER)
endif()
EXCLUDE(ATen_CORE_SRCS "${ATen_CORE_SRCS}" ${ATen_CORE_TEST_SRCS})
# Exclude TensorImpl_test.cpp if compiling without Caffe2
if(NOT BUILD_CAFFE2 AND NOT BUILD_LITE_INTERPRETER)
if(NOT BUILD_LITE_INTERPRETER)
file(GLOB_RECURSE ATen_CORE_EXCLUDED_TEST_SRCS "core/TensorImpl_test.cpp")
EXCLUDE(ATen_CORE_TEST_SRCS "${ATen_CORE_TEST_SRCS}" ${ATen_CORE_EXCLUDED_TEST_SRCS})
endif()

View File

@ -123,16 +123,6 @@ list(APPEND ATen_XPU_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/xpu_generator_test.cpp
)
# Caffe2 specific tests
if(BUILD_CAFFE2)
list(APPEND ATen_CPU_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/ExclusivelyOwned_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/tensor_interop_test.cpp)
list(APPEND ATen_CUDA_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/cuda_tensor_interop_test.cpp)
endif()
# ---[ Send the lists to the parent scope.
set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE)
set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE)

View File

@ -7,16 +7,6 @@ if(INTERN_BUILD_MOBILE)
return()
endif()
if(BUILD_CAFFE2)
caffe2_binary_target("at_launch_benchmark.cc")
target_include_directories(at_launch_benchmark PUBLIC
${CMAKE_BINARY_DIR}/aten/src)
caffe2_binary_target("intra_inter_benchmark.cc")
target_include_directories(intra_inter_benchmark PUBLIC
${CMAKE_BINARY_DIR}/aten/src)
endif()
caffe2_binary_target("parallel_info.cc")
target_include_directories(parallel_info PUBLIC
${CMAKE_BINARY_DIR}/aten/src) # provides "ATen/TypeExtendedInterface.h" to ATen.h

View File

@ -279,7 +279,6 @@ def get_pt_preprocessor_flags():
"-D_THP_CORE",
"-DUSE_SCALARS",
"-DNO_CUDNN_DESTROY_HANDLE",
"-DBUILD_CAFFE2",
]
if _is_build_mode_dev():

View File

@ -110,21 +110,11 @@ endif()
add_subdirectory(core)
add_subdirectory(serialize)
add_subdirectory(utils)
if(BUILD_CAFFE2 OR (NOT USE_FBGEMM))
if(NOT USE_FBGEMM)
add_subdirectory(perfkernels)
endif()
# Skip modules that are not used by libtorch mobile yet.
if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
add_subdirectory(core/nomnigraph)
if(USE_NVRTC)
add_subdirectory(cuda_rtc)
endif()
if(BUILD_CAFFE2_OPS)
endif()
add_subdirectory(proto)
endif()
if(NOT BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
if(NOT INTERN_BUILD_MOBILE)
add_subdirectory(proto)
endif()
@ -585,17 +575,10 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
)
# Disable legacy import of building without Caffe2 support
if(BUILD_CAFFE2)
list(APPEND TORCH_SRCS
${TORCH_SRC_DIR}/csrc/jit/serialization/import_legacy.cpp
)
else()
set_source_files_properties(
${TORCH_SRC_DIR}/csrc/jit/serialization/import.cpp
PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT"
)
endif()
set_source_files_properties(
${TORCH_SRC_DIR}/csrc/jit/serialization/import.cpp
PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT"
)
if(USE_DISTRIBUTED)
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
if(NOT WIN32)
@ -809,11 +792,6 @@ if(HAVE_SOVERSION)
VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
endif()
torch_compile_options(torch_cpu) # see cmake/public/utils.cmake
if(BUILD_CAFFE2 AND NOT MSVC)
# Caffe2 has too many signed-unsigned violation, but the framework is dead
# So no point in fixing those
target_compile_options(torch_cpu PRIVATE "-Wno-sign-compare")
endif()
# Ignore Wdeprecated-XXX errors from third-party libraries
if(NOT MSVC)
@ -1921,14 +1899,6 @@ if(BUILD_TEST)
endif()
endforeach()
endif()
# For special tests that explicitly uses dependencies, we add them here
if(BUILD_CAFFE2 AND USE_MPI)
target_link_libraries(mpi_test MPI::MPI_CXX)
if(USE_CUDA)
target_link_libraries(mpi_gpu_test MPI::MPI_CXX)
endif()
endif()
endif()
if(MSVC)
@ -1998,11 +1968,6 @@ if(BUILD_PYTHON)
set_source_files_properties(${TORCH_SRC_DIR}/../caffe2/operators/box_with_nms_limit_op.cc PROPERTIES COMPILE_FLAGS -Wno-attributes)
endif()
# ---[ Python.
if(BUILD_CAFFE2)
target_compile_definitions(torch PRIVATE BUILD_CAFFE2)
endif()
# generated pb files are copied from build/caffe2 to caffe2
# if we copied them back to build this would create a build cycle
# consider removing the need for globs

View File

@ -2,5 +2,4 @@ import warnings
from torch.onnx import _CAFFE2_ATEN_FALLBACK
if not _CAFFE2_ATEN_FALLBACK:
warnings.warn("Caffe2 support is not fully enabled in this PyTorch build. "
"Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.")
warnings.warn("Caffe2 support is no longer present in PyTorch.")

View File

@ -1,68 +1,4 @@
if(NOT BUILD_CAFFE2 OR INTERN_BUILD_MOBILE)
list(APPEND Caffe2_CPU_SRCS
"${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
return()
endif()
# ---[ GPU files
# ------[ cuDNN
if(USE_CUDNN)
file(GLOB tmp *_cudnn.cc)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
endif()
# ------[ general GPU
file(GLOB tmp *_gpu.cc)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
# ------[ CUDA sources
file(GLOB tmp *.cu)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
# exclude test files
file(GLOB tmp *_test.cc)
exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp})
# ---[ general HIP files
file(GLOB tmp hip/*.cc)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
# ------[ HIP sources
file(GLOB tmp hip/*.hip)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp})
# exclude test files
file(GLOB tmp hip/*_test.cc)
exclude(Caffe2_HIP_SRCS "${Caffe2_HIP_SRCS}" ${tmp})
# ---[ CPU files.
file(GLOB tmp *.cc)
# Manually remove the cudnn files since we might be using USE_CUDNN=OFF
# TODO: when we move to explicit file list, this would not be needed.
file(GLOB tmp_cudnn *_cudnn.cc)
exclude(tmp "${tmp}" ${tmp_cudnn})
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp})
# exclude test files and gpu files
file(GLOB tmp *_test.cc)
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS})
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_HIP_SRCS})
# ---[ GPU test files
file(GLOB tmp *_gpu_test.cc)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} ${tmp})
# ---[ HIP test files
file(GLOB tmp hip/*_test.cc)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} ${tmp})
# ---[ CPU test files
file(GLOB tmp *_test.cc)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp})
exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_GPU_TEST_SRCS})
exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_HIP_TEST_SRCS})
# ---[ Send the lists to the parent scope.
list(APPEND Caffe2_CPU_SRCS
"${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)

View File

@ -1,5 +1,6 @@
#include "caffe2/core/blob_serialization.h"
#include <limits>
#include <mutex>
#include <sstream>
#include <utility>
@ -83,8 +84,7 @@ Range<T*> GetMutableTensorDataRange(
size_t start,
size_t numElements) {
CAFFE_ENFORCE(
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
start + numElements <= tensor.numel(),
static_cast<int64_t>(start + numElements) <= tensor.numel(),
"Requested invalid mutable tensor range [",
start,
", ",
@ -100,8 +100,7 @@ c10::ArrayRef<T> GetTensorDataRange(
size_t start,
size_t numElements) {
CAFFE_ENFORCE(
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
start + numElements <= tensor.numel(),
static_cast<int64_t>(start + numElements) <= tensor.numel(),
"Requested invalid tensor range [",
start,
", ",
@ -390,8 +389,7 @@ void TensorSerializer::SerializeWithOptions(
// Poorman's IOBound ThreadPool
SimpleQueue<size_t> chunkQueue;
auto task = [&]() {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
size_t chunkStart;
size_t chunkStart = std::numeric_limits<size_t>::max();
while (chunkQueue.Pop(&chunkStart)) {
processChunk(chunkStart);
}
@ -409,8 +407,7 @@ void TensorSerializer::SerializeWithOptions(
VLOG(1) << "Serializing blob " << name;
// Serialize whole vector. If vector is empty, its shape still needs to be
// serialized in empty proto
for (size_t chunkBegin = 0;
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
for (int64_t chunkBegin = 0;
chunkBegin < std::max(tensor.numel(), static_cast<int64_t>(1));
chunkBegin += chunk_size) {
VLOG(2) << "Starting a chunk at " << chunkBegin;
@ -582,8 +579,7 @@ void SerializeTensorData(const SerializeParams<float>& params) {
BlobSerializationOptions_FloatFormat_FLOAT_BFLOAT16) {
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
std::unique_ptr<float[]> tmp_buffer;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
const float* src;
const float* src = nullptr;
if (params.context.device() == CPU) {
src = params.input.data();
} else {
@ -653,14 +649,12 @@ void TensorSerializer::Serialize(
size_t chunkBegin,
int32_t chunkSize) {
CAFFE_ENFORCE(
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
chunkBegin <= input.numel(),
static_cast<int64_t>(chunkBegin) <= input.numel(),
"Chunk begin is out of tensor: ",
chunkBegin,
' ',
input.numel());
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
if (chunkBegin + chunkSize > input.numel()) {
if (static_cast<int64_t>(chunkBegin + chunkSize) > input.numel()) {
chunkSize = input.numel() - chunkBegin;
}
@ -1029,8 +1023,7 @@ DESERIALIZE_IMPL(float, FMT_BFLOAT16) {
params.tensor_proto.raw_data().data());
// If we are on a big-endian machine, byte-swap the serialized data.
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
const fbgemm::bfloat16* src;
const fbgemm::bfloat16* src = nullptr;
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
std::unique_ptr<fbgemm::bfloat16[]> bswap_buffer;
if (kIsLittleEndian) {
@ -1045,8 +1038,7 @@ DESERIALIZE_IMPL(float, FMT_BFLOAT16) {
// bfloat16 to float conversion.
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
std::unique_ptr<float[]> tmp_buffer;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
float* dest;
float* dest = nullptr;
if (params.context.device() == CPU) {
dest = params.dest.data();
} else {

View File

@ -1,8 +1,4 @@
if(BUILD_CAFFE2)
file(GLOB Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.proto")
else()
set(Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/torch.proto;${CMAKE_CURRENT_SOURCE_DIR}/caffe2.proto")
endif()
set(Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/torch.proto;${CMAKE_CURRENT_SOURCE_DIR}/caffe2.proto")
caffe2_protobuf_generate_cpp_py(Caffe2_PROTO_SRCS Caffe2_PROTO_HEADERS Caffe2_PROTO_PY ${Caffe2_PROTOBUF_FILES})

View File

@ -14,8 +14,7 @@ import warnings
try:
from caffe2.proto import caffe2_pb2, metanet_pb2, torch_pb2
except ImportError:
warnings.warn('Caffe2 support is not enabled in this PyTorch build. '
'Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.')
warnings.warn('Caffe2 support is no longer present in PyTorch.')
raise
try:

View File

@ -6,8 +6,7 @@ import warnings
try:
from caffe2.proto import caffe2_pb2
except ImportError:
warnings.warn('Caffe2 support is not enabled in this PyTorch build. '
'Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.')
warnings.warn('Caffe2 support is no longer present in PyTorch.')
raise
# TODO: refactor & remove the following alias

View File

@ -1,100 +1,18 @@
if(NOT BUILD_CAFFE2 OR INTERN_BUILD_MOBILE)
list(APPEND Caffe2_CPU_SRCS
utils/string_utils.cc
utils/threadpool/ThreadPool.cc
)
if(USE_PTHREADPOOL AND NOT USE_INTERNAL_PTHREADPOOL_IMPL)
list(APPEND Caffe2_CPU_SRCS
utils/threadpool/pthreadpool-cpp.cc
utils/threadpool/thread_pool_guard.cpp
)
endif()
if(NOT BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
list(APPEND Caffe2_CPU_SRCS
utils/proto_wrap.cc
)
endif()
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
return()
endif()
list(APPEND Caffe2_CPU_SRCS
utils/bench_utils.cc
utils/cpuid.cc
utils/math/broadcast.cc
utils/math/elementwise.cc
utils/math/reduce.cc
utils/math/transpose.cc
utils/math/utils.cc
utils/math_cpu.cc
utils/murmur_hash3.cc
utils/proto_utils.cc
utils/proto_wrap.cc
utils/string_utils.cc
utils/threadpool/ThreadPool.cc
utils/signal_handler.cc
utils/smart_tensor_printer.cc
utils/string_utils.cc)
)
if(USE_PTHREADPOOL)
if(USE_PTHREADPOOL AND NOT USE_INTERNAL_PTHREADPOOL_IMPL)
list(APPEND Caffe2_CPU_SRCS
utils/threadpool/pthreadpool-cpp.cc
utils/threadpool/thread_pool_guard.cpp)
if(USE_INTERNAL_PTHREADPOOL_IMPL)
list(APPEND Caffe2_CPU_SRCS
utils/threadpool/pthreadpool.cc
utils/threadpool/pthreadpool_impl.cc)
endif()
utils/threadpool/thread_pool_guard.cpp
)
endif()
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS}
utils/math/broadcast.cu
utils/math/elementwise.cu
utils/math/reduce.cu
utils/math/transpose.cu
utils/math_gpu.cu
)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS}
utils/math/hip/broadcast.hip
utils/math/hip/elementwise.hip
utils/math/hip/reduce.hip
utils/math/hip/transpose.hip
utils/hip/math_gpu.hip
)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS}
utils/fixed_divisor_test.cc
utils/math_test.cc
utils/fatal_signal_asan_no_sig_test.cc
utils/simple_queue_test.cc
utils/proto_utils_test.cc
utils/smart_tensor_printer_test.cc
utils/cast_test.cc
)
if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS}
utils/cpuid_test.cc
)
if(NOT INTERN_BUILD_MOBILE)
list(APPEND Caffe2_CPU_SRCS
utils/proto_wrap.cc
)
endif()
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS}
utils/math_gpu_test.cc
)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS}
utils/hip/math_gpu_test.cc
utils/hip/math_blas_gpu_test.cc
)
# TODO Once all source files are defined inside the local c10_utils_xxx targets,
# it should be the job of the parent CMakeLists.txt to decide what to do with the target (i.e. link it to caffe2)
# instead of us locally adding it to Caffe2_xxx variables.
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)

View File

@ -1685,9 +1685,6 @@ if(NOT INTERN_BUILD_MOBILE)
if(MKLDNN_FOUND)
set(AT_MKLDNN_ENABLED 1)
include_directories(AFTER SYSTEM ${MKLDNN_INCLUDE_DIR})
if(BUILD_CAFFE2_OPS)
list(APPEND Caffe2_DEPENDENCY_LIBS caffe2::mkldnn)
endif(BUILD_CAFFE2_OPS)
else()
message(WARNING "MKLDNN could not be found.")
caffe2_update_option(USE_MKLDNN OFF)

View File

@ -23,8 +23,6 @@ function(caffe2_print_configuration_summary)
message(STATUS "")
message(STATUS " TORCH_VERSION : ${TORCH_VERSION}")
message(STATUS " BUILD_CAFFE2 : ${BUILD_CAFFE2}")
message(STATUS " BUILD_CAFFE2_OPS : ${BUILD_CAFFE2_OPS}")
message(STATUS " BUILD_STATIC_RUNTIME_BENCHMARK: ${BUILD_STATIC_RUNTIME_BENCHMARK}")
message(STATUS " BUILD_BINARY : ${BUILD_BINARY}")
message(STATUS " BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}")

View File

@ -80,9 +80,6 @@ else()
# shared library.
# TODO: this list might be incomplete.
append_torchlib_if_found(c10)
if(@BUILD_CAFFE2@)
append_torchlib_if_found(Caffe2_perfkernels_avx512 Caffe2_perfkernels_avx2 Caffe2_perfkernels_avx)
endif()
if(@USE_NNPACK@)
append_torchlib_if_found(nnpack)

View File

@ -1,7 +0,0 @@
project(modules CXX C)
add_subdirectory(detectron)
add_subdirectory(module_test)
add_subdirectory(observers)
# Finally, set Caffe2_MODULES to parent scope.
set(Caffe2_MODULES ${Caffe2_MODULES} PARENT_SCOPE)

View File

@ -1,57 +0,0 @@
file(GLOB Detectron_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc)
file(GLOB Detectron_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
file(GLOB_RECURSE Detectron_HIP_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.hip)
if(BUILD_CAFFE2_OPS)
# Note(ilijar): Since Detectron ops currently have no
# CPU implementation, we only build GPU ops for now.
if(USE_CUDA)
add_library(
caffe2_detectron_ops_gpu SHARED
${Detectron_CPU_SRCS}
${Detectron_GPU_SRCS})
target_link_libraries(caffe2_detectron_ops_gpu PRIVATE torch)
if(USE_OPENMP)
target_link_libraries(caffe2_detectron_ops_gpu PRIVATE caffe2::openmp)
endif()
if(USE_MKLDNN)
target_link_libraries(caffe2_detectron_ops_gpu PRIVATE caffe2::mkldnn)
endif()
install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib)
if(MSVC)
install(FILES $<TARGET_PDB_FILE:caffe2_detectron_ops_gpu> DESTINATION lib OPTIONAL)
endif()
elseif(USE_ROCM)
hip_include_directories(${Caffe2_HIP_INCLUDES})
set_source_files_properties(${Detectron_HIP_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
HIP_ADD_LIBRARY(
caffe2_detectron_ops_hip SHARED
${Detectron_CPU_SRCS}
${Detectron_HIP_SRCS})
target_compile_options(caffe2_detectron_ops_hip PRIVATE ${HIP_CXX_FLAGS})
if(USE_MKLDNN)
target_link_libraries(caffe2_detectron_ops_hip PRIVATE caffe2::mkldnn)
endif()
target_link_libraries(caffe2_detectron_ops_hip PRIVATE torch)
install(TARGETS caffe2_detectron_ops_hip DESTINATION lib)
elseif(NOT IOS_PLATFORM)
add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS})
if(HAVE_SOVERSION)
set_target_properties(caffe2_detectron_ops PROPERTIES
VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
endif()
target_link_libraries(caffe2_detectron_ops PRIVATE torch)
if(USE_OPENMP)
target_link_libraries(caffe2_detectron_ops PRIVATE caffe2::openmp)
endif()
if(USE_MKLDNN)
target_link_libraries(caffe2_detectron_ops PRIVATE caffe2::mkldnn)
endif()
install(TARGETS caffe2_detectron_ops DESTINATION lib)
if(MSVC)
install(FILES $<TARGET_PDB_FILE:caffe2_detectron_ops> DESTINATION lib OPTIONAL)
endif()
endif()
endif()

View File

@ -1,83 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "modules/detectron/group_spatial_softmax_op.h"
#include "caffe2/operators/softmax_utils.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(
GroupSpatialSoftmax,
GroupSpatialSoftmaxOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
GroupSpatialSoftmaxGradient,
GroupSpatialSoftmaxGradientOp<float, CPUContext>);
OPERATOR_SCHEMA(GroupSpatialSoftmax)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
RetinaNet specific form of spatial softmax.
The input is assumed to be unnormalized scores (sometimes called 'logits')
arranged in a 4D tensor with shape (N, C, H, W), where N is the number of
elements in the batch, H and W are the height and width, and C = num_anchors *
num_classes defines num_anchors 'groups' of softmax inputs, each of length
num_classes. The softmax is applied to each group independently.
See: https://arxiv.org/abs/1708.02002 for details.
)DOC")
.Arg(
"num_classes",
"(int) default 81; number of classes in each softmax group.")
.Input(
0,
"scores",
"4D tensor of softmax inputs (called 'scores' or 'logits') with shape "
"(N, C, H, W), where C = num_anchors * num_classes defines num_anchors "
"groups of contiguous num_classes softmax inputs.")
.Output(
0,
"probabilities",
"4D tensor of softmax probabilities with shape (N, C, H, W), where "
"C = num_anchors * num_classes, and softmax was applied to each of the "
"num_anchors groups; within a group the num_classes values sum to 1.");
OPERATOR_SCHEMA(GroupSpatialSoftmaxGradient)
.NumInputs(2)
.NumOutputs(1)
.Input(0, "scores", "See GroupSpatialSoftmax")
.Input(
1,
"d_probabilities",
"Gradient of forward output 0 (probabilities).")
.Output(0, "d_scores", "Gradient of forward input 0 (scores).");
class GetGroupSpatialSoftmaxGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
"GroupSpatialSoftmaxGradient",
"",
vector<string>{O(0), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(GroupSpatialSoftmax, GetGroupSpatialSoftmaxGradient);
} // namespace caffe2

View File

@ -1,181 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/group_spatial_softmax_op.h"
namespace caffe2 {
namespace {
__global__ void GroupSpatialSoftmaxKernel(const int num, const int A, const int W,
const int H, const float* Xdata, float* Pdata, const int num_classes) {
// Loop through labels (N x A x H x W)
CUDA_1D_KERNEL_LOOP(index, num * A * H * W) {
int D = num_classes * A;
int x = index % W;
int y = (index / W) % H;
int a = (index / (W * H)) % A;
int i = index / W / H / A;
// Subtract max on each cell for numerical reasons
float max_val = -FLT_MAX;
for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
max_val = max(max_val, Xdata[idx]);
}
// Exponentiate
float expsum = 0.0f;
for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
float expx = exp(Xdata[idx] - max_val);
Pdata[idx] = expx;
expsum += expx;
}
// Normalize
for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
Pdata[idx] /= expsum;
}
}
}
__global__ void SumProbsKernel(const int N, const int A, const int W,
const int H, const float* Ydata, const float* dYdata,
float* sum_probs_data, const int num_classes) {
CUDA_1D_KERNEL_LOOP(i, N * A * W * H) {
int D = num_classes * A;
int x = i % W;
int y = (i / W) % H;
int a = (i / (W * H)) % A;
int n = i / (W * H * A);
sum_probs_data[i] = 0.0;
for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
int idx = n * (H * W * D) + c * (H * W) + y * W + x;
sum_probs_data[i] += (Ydata[idx] * dYdata[idx]);
}
}
}
__global__ void SubSumKernel(
const int N, const int A, const int W, const int H,
const float* sum_probs_data, float* dXdata, const int num_classes) {
CUDA_1D_KERNEL_LOOP(i, N * (A * num_classes) * W * H) {
int D = num_classes * A;
int x = i % W;
int y = (i / W) % H;
int a = ((i / (W * H)) % D) / num_classes;
int n = i / W / H / D;
int idx = n * (H * W * A) + a * (H * W) + y * W + x;
dXdata[i] = (dXdata[i] - sum_probs_data[idx]);
}
}
} // namespace
template <>
bool GroupSpatialSoftmaxOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0); // Logits
int N = X.dim32(0);
int D = X.dim32(1);
int H = X.dim32(2);
int W = X.dim32(3);
int A = D / num_classes_;
auto* P = Output(0, X.sizes(), at::dtype<float>()); // Probabilities from softmax
TORCH_DCHECK_EQ(X.ndim(), 4);
const float* Xdata = X.data<float>();
float* Pdata = P->mutable_data<float>();
// Softmax for each x,y location
GroupSpatialSoftmaxKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
N, A, W, H, Xdata, Pdata, num_classes_);
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
template<>
bool GroupSpatialSoftmaxGradientOp<float, CUDAContext>::RunOnDevice() {
auto& Y = Input(0); // Probabilities from softmax
auto& dY = Input(1);
TORCH_DCHECK_EQ(Y.ndim(), 4);
int N = Y.dim32(0);
int D = Y.dim32(1);
int H = Y.dim32(2);
int W = Y.dim32(3);
int A = D / num_classes_;
auto* dX = Output(0, Y.sizes(), at::dtype<float>());
if (sum_probs_.size() != N * A * H * W) {
ReinitializeTensor(&sum_probs_, {N * A * H * W}, at::dtype<float>().device(CUDA));
}
const float* Ydata = Y.data<float>();
const float* dYdata = dY.data<float>();
float* dXdata = dX->mutable_data<float>();
float* sum_probs_data = sum_probs_.mutable_data<float>();
math::Set<float, CUDAContext>(
sum_probs_.size(), 0.0f, sum_probs_data, &context_);
// Complete math:
// J_ij = h_i (delta_ij - h_j)
// d x_i = sum_j d h_ij = sum_j J_ij * dy_j
// = sum_j h_i (delta_ij - h_j) * dy_j
// = h_i dy_i - (sum_j h_i h_j dy_j)
// = h_i dy_i - h_i sum_j h_j dy_j
// Step 0: dx = dy
context_.Copy<float, CUDAContext, CUDAContext>(Y.size(), dYdata, dXdata);
// Step 1: s = Sum(dY[j] * Y[j])
SumProbsKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS, 0,
context_.cuda_stream()>>>(
N, A, W, H, Ydata, dYdata, sum_probs_data, num_classes_);
C10_CUDA_KERNEL_LAUNCH_CHECK();
// Step 2: dX[i] = dX[i] - s
SubSumKernel<<<CAFFE_GET_BLOCKS(Y.size()), CAFFE_CUDA_NUM_THREADS, 0,
context_.cuda_stream()>>>(
N, A, W, H, sum_probs_.data<float>(), dXdata, num_classes_);
C10_CUDA_KERNEL_LAUNCH_CHECK();
// Step 3: dX[i] = Y[i] * dX[i]
math::Mul<float, CUDAContext>(Y.size(), dXdata, Ydata, dXdata, &context_);
return true;
}
REGISTER_CUDA_OPERATOR(GroupSpatialSoftmax,
GroupSpatialSoftmaxOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(GroupSpatialSoftmaxGradient,
GroupSpatialSoftmaxGradientOp<float, CUDAContext>);
} // namespace caffe2

View File

@ -1,76 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef GROUP_SPATIAL_SOFTMAX_OP_H_
#define GROUP_SPATIAL_SOFTMAX_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
template <typename T, class Context>
class GroupSpatialSoftmaxOp final : public Operator<Context> {
public:
GroupSpatialSoftmaxOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
num_classes_(this->template GetSingleArgument<int>("num_classes", 81)),
order_(StringToStorageOrder(
this->template GetSingleArgument<string>("order", "NCHW"))) {
CAFFE_ENFORCE_EQ(
order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override {
// No CPU implementation for now
CAFFE_NOT_IMPLEMENTED;
}
protected:
int num_classes_;
StorageOrder order_;
};
template <typename T, class Context>
class GroupSpatialSoftmaxGradientOp final : public Operator<Context> {
public:
GroupSpatialSoftmaxGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
num_classes_(this->template GetSingleArgument<int>("num_classes", 81)),
order_(StringToStorageOrder(
this->template GetSingleArgument<string>("order", "NCHW"))) {
CAFFE_ENFORCE_EQ(
order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override {
// No CPU implementation for now
CAFFE_NOT_IMPLEMENTED;
}
protected:
int num_classes_;
StorageOrder order_;
Tensor sum_probs_;
};
} // namespace caffe2
#endif // GROUP_SPATIAL_SOFTMAX_OP_H_

View File

@ -1,106 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "ps_roi_pool_op.h"
namespace caffe2 {
// CPU registrations attach the schemas below; note that the CPU
// RunOnDevice of both ops is CAFFE_NOT_IMPLEMENTED (see ps_roi_pool_op.h),
// so only the CUDA build actually executes.
REGISTER_CPU_OPERATOR(PSRoIPool, PSRoIPoolOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    PSRoIPoolGradient,
    PSRoIPoolGradientOp<float, CPUContext>);
// Forward schema: (X, RoIs) -> (Y, argmaxes). "argmaxes" here records the
// position-sensitive channel index per output element (see the CUDA kernel's
// mapping_channel), consumed by the gradient op.
OPERATOR_SCHEMA(PSRoIPool)
    .NumInputs(2)
    .NumOutputs(2)
    .SetDoc(R"DOC(
Position Sensitive Region of Interest Pooling as used in R-FCN.
)DOC")
    .Arg(
        "spatial_scale",
        "(float) default 1.0; Spatial scale of the input feature map X "
        "relative to the input image. E.g., 0.0625 if X has a stride of 16 "
        "w.r.t. the input image.")
    .Arg(
        "group_size",
        "(int) default 1; pooled_h = pooled_w = group_size where pooled_{h,w} "
        "is the pooled output Y's height and width, respectively.")
    .Arg(
        "output_dim",
        "(int) default 1; number of channels in the pooled output, which might "
        "be the number of classes is used for classification or 4 if used for "
        "class agnostic bounding box regression.")
    .Input(
        0,
        "X",
        "4D position sensitive feature map input of shape (N, C, H, W), where "
        "C = group_size**2 * output_dim.")
    .Input(
        1,
        "RoIs",
        "2D input of shape (R, 5) specifying R RoIs with five columns "
        "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI "
        "coordinates are in the coordinate system of the input image.")
    .Output(
        0,
        "Y",
        "4D output of shape (R, output_dim, pooled_h, pooled_w). The r-th "
        "batch element is a pooled feature map cooresponding to the r-th RoI.")
    .Output(
        1,
        "argmaxes",
        "4D output of shape (R, output_dim, pooled_h, pooled_w). Same as Y, "
        "except it records the argmax indices rather than the max pooled "
        "values.");
// Gradient schema: (X, RoIs, argmaxes, dY) -> dX. X is needed only for its
// shape; argmaxes routes each output gradient back to its source channel.
OPERATOR_SCHEMA(PSRoIPoolGradient)
    .NumInputs(4)
    .NumOutputs(1)
    .Input(
        0,
        "X",
        "See PSRoIPool.")
    .Input(
        1,
        "RoIs",
        "See PSRoIPool.")
    .Input(
        2,
        "argmaxes",
        "See PSRoIPool.")
    .Input(
        3,
        "dY",
        "Gradient of forward output 0 (Y)")
    .Output(
        0,
        "dX",
        "Gradient of forward input 0 (X)");
// Gradient maker for PSRoIPool: the backward op consumes the forward op's
// data input, the RoIs, the recorded channel mapping (forward output 1),
// and the incoming output gradient; it produces the gradient w.r.t. X.
class GetPSRoIPoolGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    vector<string> grad_inputs{I(0), I(1), O(1), GO(0)};
    vector<string> grad_outputs{GI(0)};
    return SingleGradientDef(
        "PSRoIPoolGradient", "", grad_inputs, grad_outputs);
  }
};
REGISTER_GRADIENT(PSRoIPool, GetPSRoIPoolGradient);
} // namespace caffe2

View File

@ -1,289 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Based on https://github.com/daijifeng001/caffe-rfcn/blob/r-fcn/src/caffe/layers/psroi_pooling_layer.cu
//
// ------------------------------------------------------------------
// R-FCN
// Copyright (c) 2016 Microsoft
// Licensed under The MIT License [see r-fcn/LICENSE for details]
// Written by Yi Li
// ------------------------------------------------------------------
//
// COPYRIGHT
//
// All contributions by the University of California:
// Copyright (c) 2014, 2015, The Regents of the University of California
// (Regents)
// All rights reserved.
//
// All other contributions:
// Copyright (c) 2014, 2015, the respective contributors
// All rights reserved.
//
// Caffe uses a shared copyright model: each contributor holds copyright over
// their contributions to Caffe. The project versioning records all such
// contribution and copyright details. If a contributor wants to further mark
// their specific copyright on a particular contribution, they should indicate
// their copyright solely in the commit message of the change when it is
// committed.
//
// LICENSE
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// CONTRIBUTION AGREEMENT
//
// By contributing to the BVLC/caffe repository through pull-request, comment,
// or otherwise, the contributor releases their content to the
// license and copyright terms herein.
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/ps_roi_pool_op.h"
namespace caffe2 {
namespace {
// Declaration-only primary template: only explicit specializations link.
template <typename T>
inline __device__ T gpu_atomic_add(const T val, T* address);
// float specialization: thin wrapper over CUDA's hardware atomicAdd.
template <>
inline __device__
float gpu_atomic_add(const float val, float* address) {
  return atomicAdd(address, val);
}
// Position-sensitive RoI pooling, forward pass. One thread per output
// element (n, ctop, ph, pw): each thread averages bottom_data over its
// pooling bin, reading from the position-sensitive input channel
// c = (ctop * group_size + ph) * group_size + pw, and records c in
// mapping_channel so the backward pass can route gradients.
// bottom_rois rows are (batch_index, x1, y1, x2, y2) in image coordinates.
template <typename T>
__global__ void PSRoIPoolForward(
    const int nthreads,
    const T* bottom_data,
    const T spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const T* bottom_rois,
    const int output_dim,
    const int group_size,
    T* top_data,
    int* mapping_channel) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // The output is in order (n, ctop, ph, pw)
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int ctop = (index / pooled_width / pooled_height) % output_dim;
    int n = index / pooled_width / pooled_height / output_dim;
    // [start, end) interval for spatial sampling; RoI coords are rounded to
    // integers, then scaled from image space to feature-map space.
    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    T roi_start_w = static_cast<T>(
        roundf(offset_bottom_rois[1])) * spatial_scale;
    T roi_start_h = static_cast<T>(
        roundf(offset_bottom_rois[2])) * spatial_scale;
    T roi_end_w = static_cast<T>(
        roundf(offset_bottom_rois[3]) + 1.) * spatial_scale;
    T roi_end_h = static_cast<T>(
        roundf(offset_bottom_rois[4]) + 1.) * spatial_scale;
    // Force too small ROIs to be 1x1
    T roi_width = c10::cuda::compat::max(roi_end_w - roi_start_w, static_cast<T>(0.1)); // avoid 0
    T roi_height = c10::cuda::compat::max(roi_end_h - roi_start_h, static_cast<T>(0.1));
    // Compute w and h at bottom
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);
    // Add roi offsets and clip to input boundaries
    int hstart = floor(
        static_cast<T>(ph) * bin_size_h + roi_start_h);
    int wstart = floor(
        static_cast<T>(pw)* bin_size_w + roi_start_w);
    int hend = ceil(
        static_cast<T>(ph + 1) * bin_size_h + roi_start_h);
    int wend = ceil(
        static_cast<T>(pw + 1) * bin_size_w + roi_start_w);
    hstart = min(max(hstart, 0), height);
    hend = min(max(hend, 0), height);
    wstart = min(max(wstart, 0),width);
    wend = min(max(wend, 0), width);
    bool is_empty = (hend <= hstart) || (wend <= wstart);
    // Position-sensitive channel selection: bin (gh, gw) of class ctop reads
    // its own dedicated input channel.
    int gw = pw;
    int gh = ph;
    int c = (ctop * group_size + gh) * group_size + gw;
    const T* offset_bottom_data =
        bottom_data + (roi_batch_ind * channels + c) * height * width;
    // Average-pool the bin.
    T out_sum = 0;
    for (int h = hstart; h < hend; ++h){
      for (int w = wstart; w < wend; ++w){
        int bottom_index = h*width + w;
        out_sum += offset_bottom_data[bottom_index];
      }
    }
    T bin_area = (hend - hstart) * (wend - wstart);
    // Empty bins yield 0 (bin_area would be <= 0 there).
    top_data[index] = is_empty ? 0. : out_sum / bin_area;
    mapping_channel[index] = c;
  }
}
// Position-sensitive RoI pooling, backward pass. One thread per top (output)
// element: each thread recomputes its forward pooling bin and scatters
// top_diff[index] / bin_area uniformly over the bin into bottom_diff, on the
// channel recorded by the forward pass (mapping_channel). Bins of different
// RoIs can overlap in bottom_diff, hence the atomic adds.
// NOTE(review): num_rois is accepted but unused in this kernel body.
template <typename T>
__global__ void PSRoIPoolBackward(
    const int nthreads,
    const T* top_diff,
    const int* mapping_channel,
    const int num_rois,
    const T spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int output_dim,
    T* bottom_diff,
    const T* bottom_rois) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // The output is in order (n, ctop, ph, pw)
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int n = index / pooled_width / pooled_height / output_dim;
    // [start, end) interval for spatial sampling -- must match the forward
    // kernel's bin computation exactly.
    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    T roi_start_w = static_cast<T>(
        roundf(offset_bottom_rois[1])) * spatial_scale;
    T roi_start_h = static_cast<T>(
        roundf(offset_bottom_rois[2])) * spatial_scale;
    T roi_end_w = static_cast<T>(
        roundf(offset_bottom_rois[3]) + 1.) * spatial_scale;
    T roi_end_h = static_cast<T>(
        roundf(offset_bottom_rois[4]) + 1.) * spatial_scale;
    // Force too small ROIs to be 1x1
    T roi_width = c10::cuda::compat::max(roi_end_w - roi_start_w, static_cast<T>(0.1)); //avoid 0
    T roi_height = c10::cuda::compat::max(roi_end_h - roi_start_h, static_cast<T>(0.1));
    // Compute w and h at bottom
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);
    int hstart = floor(
        static_cast<T>(ph)* bin_size_h + roi_start_h);
    int wstart = floor(
        static_cast<T>(pw)* bin_size_w + roi_start_w);
    int hend = ceil(
        static_cast<T>(ph + 1) * bin_size_h + roi_start_h);
    int wend = ceil(
        static_cast<T>(pw + 1) * bin_size_w + roi_start_w);
    // Add roi offsets and clip to input boundaries
    hstart = min(max(hstart, 0), height);
    hend = min(max(hend, 0), height);
    wstart = min(max(wstart, 0), width);
    wend = min(max(wend, 0), width);
    bool is_empty = (hend <= hstart) || (wend <= wstart);
    // Compute c at bottom: the channel the forward pass pooled from.
    int c = mapping_channel[index];
    T* offset_bottom_diff =
        bottom_diff + (roi_batch_ind * channels + c) * height * width;
    // Gradient of an average: distribute evenly over the bin.
    T bin_area = (hend - hstart) * (wend - wstart);
    T diff_val = is_empty ? 0. : top_diff[index] / bin_area;
    for (int h = hstart; h < hend; ++h){
      for (int w = wstart; w < wend; ++w){
        int bottom_index = h * width + w;
        gpu_atomic_add(diff_val, offset_bottom_diff + bottom_index);
      }
    }
  }
}
} // namespace
// CUDA forward: allocates Y of shape (R, output_dim, pooled_h, pooled_w) and
// the matching int channel map A, then launches one thread per Y element.
// NOTE(review): unlike RoIPoolF below, an empty RoIs input (R.dim32(0) == 0)
// is not special-cased here -- confirm callers never pass empty RoIs.
template<>
bool PSRoIPoolOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Input data to pool
  auto& R = Input(1); // RoIs
  auto* Y = Output(0, {R.dim32(0), output_dim_, pooled_height_, pooled_width_}, at::dtype<float>()); // PSRoI pooled data
  auto* A = Output(1, Y->sizes(), at::dtype<int>()); // mapping_channel
  int output_size = Y->numel();
  PSRoIPoolForward<float><<<CAFFE_GET_BLOCKS(output_size),
                            CAFFE_CUDA_NUM_THREADS,
                            0, context_.cuda_stream()>>>(
      output_size, X.data<float>(), spatial_scale_, X.dim32(1), X.dim32(2),
      X.dim32(3), pooled_height_, pooled_width_, R.data<float>(), output_dim_,
      group_size_, Y->mutable_data<float>(), A->mutable_data<int>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
// CUDA backward: zero-fills dX, then launches one thread per dY element to
// scatter gradients back through the recorded channel mapping (atomic adds
// accumulate overlapping RoI bins).
template<>
bool PSRoIPoolGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X  = Input(0);  // Input data to pool
  auto& R  = Input(1);  // RoIs
  auto& A  = Input(2);  // mapping channels
  auto& dY = Input(3);  // Gradient of net w.r.t. output of "forward" op
                        // (aka "gradOutput")
  auto* dX = Output(0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to "forward" op
                                                       // (aka "gradInput")
  // Must zero-out dX before accumulating gradients: the kernel only adds.
  math::Set<float, CUDAContext>(
      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
  PSRoIPoolBackward<float><<<CAFFE_GET_BLOCKS(dY.size()),
                             CAFFE_CUDA_NUM_THREADS,
                             0, context_.cuda_stream()>>>(
      dY.size(), dY.data<float>(), A.data<int>(), R.dim32(0), spatial_scale_,
      X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_,
      output_dim_, dX->mutable_data<float>(), R.data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
REGISTER_CUDA_OPERATOR(PSRoIPool,
                       PSRoIPoolOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(PSRoIPoolGradient,
                       PSRoIPoolGradientOp<float, CUDAContext>);
} // namespace caffe2

View File

@ -1,93 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef PS_ROI_POOL_OP_H_
#define PS_ROI_POOL_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Position Sensitive RoI Pooling (R-FCN). Pools a (group_size x group_size)
// grid per RoI, each bin from its own dedicated input channel. CPU execution
// is unimplemented; the CUDA specialization performs the work.
template <typename T, class Context>
class PSRoIPoolOp final : public Operator<Context> {
 public:
  PSRoIPoolOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        // Feature map stride relative to the image, e.g. 1/16 = 0.0625.
        spatial_scale_(this->template GetSingleArgument<float>(
            "spatial_scale", 1.)),
        group_size_(this->template GetSingleArgument<int>("group_size", 1)),
        output_dim_(this->template GetSingleArgument<int>("output_dim", 1)) {
    TORCH_DCHECK_GT(spatial_scale_, 0);
    TORCH_DCHECK_GT(group_size_, 0);
    // The pooled output is always square: group_size x group_size.
    pooled_height_ = group_size_;
    pooled_width_ = group_size_;
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }
 protected:
  float spatial_scale_;
  int group_size_;
  int output_dim_;
  int pooled_height_;
  int pooled_width_;
  // NOTE(review): channels_/height_/width_ are never assigned in this header;
  // presumably vestigial -- confirm against the .cu implementation.
  int channels_;
  int height_;
  int width_;
};
// Gradient of PSRoIPool; takes the same arguments as the forward op so that
// the backward kernel can recompute identical pooling bins. CPU execution is
// unimplemented; see the CUDA specialization.
template <typename T, class Context>
class PSRoIPoolGradientOp final : public Operator<Context> {
 public:
  PSRoIPoolGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        spatial_scale_(this->template GetSingleArgument<float>(
            "spatial_scale", 1.)),
        group_size_(this->template GetSingleArgument<int>("group_size", 1)),
        output_dim_(this->template GetSingleArgument<int>("output_dim", 1)) {
    TORCH_DCHECK_GT(spatial_scale_, 0);
    TORCH_DCHECK_GT(group_size_, 0);
    // Pooled output is square: group_size x group_size (matches forward op).
    pooled_height_ = group_size_;
    pooled_width_ = group_size_;
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }
 protected:
  float spatial_scale_;
  int group_size_;
  int output_dim_;
  int pooled_height_;
  int pooled_width_;
  // NOTE(review): channels_/height_/width_ never assigned here; presumably
  // vestigial -- confirm against the .cu implementation.
  int channels_;
  int height_;
  int width_;
};
} // namespace caffe2
#endif // PS_ROI_POOL_OP_H_

View File

@ -1,99 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "roi_pool_f_op.h"
namespace caffe2 {
// CPU registrations attach the schemas; the CPU RunOnDevice of both ops is
// CAFFE_NOT_IMPLEMENTED (see roi_pool_f_op.h) -- CUDA-only in practice.
REGISTER_CPU_OPERATOR(RoIPoolF, RoIPoolFOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(RoIPoolFGradient, RoIPoolFGradientOp<float, CPUContext>);
// Forward schema: (X, RoIs) -> (Y, argmaxes); argmaxes feeds the gradient op.
OPERATOR_SCHEMA(RoIPoolF)
    .NumInputs(2)
    .NumOutputs(2)
    .SetDoc(R"DOC(
Region of Interest (RoI) pooling operation as used in Fast R-CNN.
)DOC")
    .Arg(
        "spatial_scale",
        "(float) default 1.0; Spatial scale of the input feature map X "
        "relative to the input image. E.g., 0.0625 if X has a stride of 16 "
        "w.r.t. the input image.")
    .Arg(
        "pooled_h",
        "(int) default 1; Pooled output Y's height.")
    .Arg(
        "pooled_w",
        "(int) default 1; Pooled output Y's width.")
    .Input(
        0,
        "X",
        "4D feature map input of shape (N, C, H, W).")
    .Input(
        1,
        "RoIs",
        "2D input of shape (R, 5) specifying R RoIs with five columns "
        "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI "
        "coordinates are in the coordinate system of the input image.")
    .Output(
        0,
        "Y",
        "4D output of shape (R, C, pooled_h, pooled_w). The r-th batch element "
        "is a pooled feature map cooresponding to the r-th RoI.")
    .Output(
        1,
        "argmaxes",
        "4D output of shape (R, C, pooled_h, pooled_w). Same as Y, except it "
        "records the argmax indices rather than the max pooled values.")
// Gradient schema: (X, RoIs, argmaxes, dY) -> dX.
;
OPERATOR_SCHEMA(RoIPoolFGradient)
    .NumInputs(4)
    .NumOutputs(1)
    .Input(
        0,
        "X",
        "See RoIPoolF.")
    .Input(
        1,
        "RoIs",
        "See RoIPoolF.")
    .Input(
        2,
        "argmaxes",
        "See RoIPoolF.")
    .Input(
        3,
        "dY",
        "Gradient of forward output 0 (Y)")
    .Output(
        0,
        "dX",
        "Gradient of forward input 0 (X)");
// Gradient maker for RoIPoolF: the backward op needs the forward data input,
// the RoIs, the argmax map (forward output 1), and the output gradient; it
// emits the gradient w.r.t. the data input only.
class GetRoIPoolFGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    vector<string> grad_inputs{I(0), I(1), O(1), GO(0)};
    vector<string> grad_outputs{GI(0)};
    return SingleGradientDef(
        "RoIPoolFGradient", "", grad_inputs, grad_outputs);
  }
};
REGISTER_GRADIENT(RoIPoolF, GetRoIPoolFGradient);
} // namespace caffe2

View File

@ -1,187 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/roi_pool_f_op.h"
namespace caffe2 {
namespace {
// Declaration-only primary template: only explicit specializations link.
template <typename T>
inline __device__ T gpu_atomic_add(const T val, T* address);
// float specialization: thin wrapper over CUDA's hardware atomicAdd.
template <>
inline __device__
float gpu_atomic_add(const float val, float* address) {
  return atomicAdd(address, val);
}
// Fast R-CNN RoI max pooling, forward pass. One thread per output element
// (n, c, ph, pw): each thread max-pools bottom_data over its bin and records
// the flat (h * width + w) index of the max in argmax_data; -1 marks an
// empty bin so the backward pass propagates nothing for it.
// bottom_rois rows are (batch_index, x1, y1, x2, y2) in image coordinates.
template <typename T>
__global__ void RoIPoolFForward(const int nthreads, const T* bottom_data,
    const T spatial_scale, const int channels, const int height,
    const int width, const int pooled_height, const int pooled_width,
    const T* bottom_rois, T* top_data, int* argmax_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    // Scale RoI from image to feature-map coordinates (rounded to ints).
    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    int roi_start_w = roundf(offset_bottom_rois[1] * spatial_scale);
    int roi_start_h = roundf(offset_bottom_rois[2] * spatial_scale);
    int roi_end_w = roundf(offset_bottom_rois[3] * spatial_scale);
    int roi_end_h = roundf(offset_bottom_rois[4] * spatial_scale);
    // Force malformed ROIs to be 1x1
    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
    T bin_size_h = static_cast<T>(roi_height)
                   / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width)
                   / static_cast<T>(pooled_width);
    int hstart = static_cast<int>(floor(static_cast<T>(ph)
                                        * bin_size_h));
    int wstart = static_cast<int>(floor(static_cast<T>(pw)
                                        * bin_size_w));
    int hend = static_cast<int>(ceil(static_cast<T>(ph + 1)
                                     * bin_size_h));
    int wend = static_cast<int>(ceil(static_cast<T>(pw + 1)
                                     * bin_size_w));
    // Add roi offsets and clip to input boundaries
    hstart = min(max(hstart + roi_start_h, 0), height);
    hend = min(max(hend + roi_start_h, 0), height);
    wstart = min(max(wstart + roi_start_w, 0), width);
    wend = min(max(wend + roi_start_w, 0), width);
    bool is_empty = (hend <= hstart) || (wend <= wstart);
    // Define an empty pooling region to be zero
    T maxval = is_empty ? 0 : -FLT_MAX;
    // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
    int maxidx = -1;
    const T* offset_bottom_data =
        bottom_data + (roi_batch_ind * channels + c) * height * width;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        int bottom_index = h * width + w;
        if (offset_bottom_data[bottom_index] > maxval) {
          maxval = offset_bottom_data[bottom_index];
          maxidx = bottom_index;
        }
      }
    }
    top_data[index] = maxval;
    argmax_data[index] = maxidx;
  }
}
// Fast R-CNN RoI max pooling, backward pass. One thread per top (output)
// element: the gradient flows only to the input position that won the max
// in the forward pass (argmax_data); -1 means an empty bin, so nothing is
// propagated. Atomic adds are needed because overlapping RoIs can select
// the same input position.
// NOTE(review): num_rois is accepted but unused in this kernel body.
template <typename T>
__global__ void RoIPoolFBackward(const int nthreads, const T* top_diff,
    const int* argmax_data, const int num_rois, const T spatial_scale,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, T* bottom_diff,
    const T* bottom_rois) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    int bottom_offset = (roi_batch_ind * channels + c) * height * width;
    int top_offset = (n * channels + c) * pooled_height * pooled_width;
    const T* offset_top_diff = top_diff + top_offset;
    T* offset_bottom_diff = bottom_diff + bottom_offset;
    const int* offset_argmax_data = argmax_data + top_offset;
    int argmax = offset_argmax_data[ph * pooled_width + pw];
    if (argmax != -1) {
      gpu_atomic_add(
          static_cast<T>(offset_top_diff[ph * pooled_width + pw]),
          offset_bottom_diff + argmax);
    }
  }
}
} // namespace
// CUDA forward: special-cases an empty RoI set (emits 0-row outputs without
// launching), otherwise allocates Y/argmaxes of shape
// (R, C, pooled_h, pooled_w) and launches one thread per output element.
template<>
bool RoIPoolFOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Input data to pool
  auto& R = Input(1); // RoIs
  if (R.size() == 0) {
    // Handle empty rois
    std::vector<int64_t> sizes = {0, X.dim32(1), pooled_height_, pooled_width_};
    /* auto* Y = */ Output(0, sizes, at::dtype<float>());
    /* auto* A = */ Output(1, sizes, at::dtype<int>());
    return true;
  }
  auto* Y = Output(0, {R.dim32(0), X.dim32(1), pooled_height_, pooled_width_}, at::dtype<float>()); // RoI pooled data
  auto* A = Output(1, Y->sizes(), at::dtype<int>()); // argmaxes
  int output_size = Y->size();
  RoIPoolFForward<float><<<CAFFE_GET_BLOCKS(output_size),
                           CAFFE_CUDA_NUM_THREADS,
                           0, context_.cuda_stream()>>>(
      output_size, X.data<float>(), spatial_scale_, X.dim32(1), X.dim32(2),
      X.dim32(3), pooled_height_, pooled_width_, R.data<float>(),
      Y->mutable_data<float>(), A->mutable_data<int>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
// CUDA backward: zero-fills dX (the kernel only accumulates), then routes
// each output gradient to its forward argmax position. Skips the launch
// entirely when dY is empty (no RoIs in the forward pass).
template<>
bool RoIPoolFGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X  = Input(0);  // Input data to pool
  auto& R  = Input(1);  // RoIs
  auto& A  = Input(2);  // argmaxes
  auto& dY = Input(3);  // Gradient of net w.r.t. output of "forward" op
                        // (aka "gradOutput")
  auto* dX = Output(0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to "forward" op
                                                       // (aka "gradInput")
  // Must zero-out dX before accumulating gradients
  math::Set<float, CUDAContext>(
      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
  if (dY.size() > 0) { // Handle possibly empty gradient if there were no rois
    RoIPoolFBackward<float><<<CAFFE_GET_BLOCKS(dY.size()),
                              CAFFE_CUDA_NUM_THREADS,
                              0, context_.cuda_stream()>>>(
        dY.size(), dY.data<float>(), A.data<int>(), R.dim32(0), spatial_scale_,
        X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_,
        dX->mutable_data<float>(), R.data<float>());
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
  return true;
}
REGISTER_CUDA_OPERATOR(RoIPoolF,
                       RoIPoolFOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(RoIPoolFGradient,
                       RoIPoolFGradientOp<float, CUDAContext>);
} // namespace caffe2

View File

@ -1,81 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef ROI_POOL_F_OP_H_
#define ROI_POOL_F_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Fast R-CNN RoI max pooling. CPU execution is unimplemented; the CUDA
// specialization performs the work. Arguments: spatial_scale (>0),
// pooled_h (>0), pooled_w (>0).
template <typename T, class Context>
class RoIPoolFOp final : public Operator<Context> {
 public:
  RoIPoolFOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        // Feature map stride relative to the image, e.g. 1/16 = 0.0625.
        spatial_scale_(this->template GetSingleArgument<float>(
            "spatial_scale", 1.)),
        pooled_height_(this->template GetSingleArgument<int>("pooled_h", 1)),
        pooled_width_(this->template GetSingleArgument<int>("pooled_w", 1)) {
    TORCH_DCHECK_GT(spatial_scale_, 0);
    TORCH_DCHECK_GT(pooled_height_, 0);
    TORCH_DCHECK_GT(pooled_width_, 0);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }
 protected:
  float spatial_scale_;
  int pooled_height_;
  int pooled_width_;
};
// Gradient of RoIPoolF; same arguments as the forward op so the backward
// kernel can address the pooled grid identically. CPU execution is
// unimplemented; see the CUDA specialization.
template <typename T, class Context>
class RoIPoolFGradientOp final : public Operator<Context> {
 public:
  RoIPoolFGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        spatial_scale_(this->template GetSingleArgument<float>(
            "spatial_scale", 1.)),
        pooled_height_(this->template GetSingleArgument<int>("pooled_h", 1)),
        pooled_width_(this->template GetSingleArgument<int>("pooled_w", 1)) {
    TORCH_DCHECK_GT(spatial_scale_, 0);
    TORCH_DCHECK_GT(pooled_height_, 0);
    TORCH_DCHECK_GT(pooled_width_, 0);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }
 protected:
  float spatial_scale_;
  int pooled_height_;
  int pooled_width_;
};
} // namespace caffe2
#endif // ROI_POOL_F_OP_H_

View File

@ -1,81 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sample_as_op.h"
namespace caffe2 {
// CPU registrations attach the schemas; both CPU RunOnDevice bodies are
// CAFFE_NOT_IMPLEMENTED (see sample_as_op.h) -- CUDA-only in practice.
REGISTER_CPU_OPERATOR(SampleAs, SampleAsOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(SampleAsGradient, SampleAsGradientOp<float, CPUContext>);
// Forward schema: (X, labels) -> Y, keeping only batch rows with label > 0.
OPERATOR_SCHEMA(SampleAs)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Select the batch elements from input tensor X where the corresponding input
label value is > 0.
)DOC")
    .Input(
        0,
        "X",
        "Tensor of at least 1D shape (N, ...).")
    .Input(
        1,
        "labels",
        "Tensor of type int with 1D shape (N, ).")
    .Output(
        0,
        "Y",
        "Tensor with number of dims matching X, but with the length of dim 0 "
        "equal to the number of non-zero elements in labels. The batch items "
        "from X corresponding to the non-zero elements in labels are copied "
        "into Y.");
// Gradient schema: (X, labels, dY) -> dX; X supplies the output shape and
// labels route each dY row back to its source row.
OPERATOR_SCHEMA(SampleAsGradient)
    .NumInputs(3)
    .NumOutputs(1)
    .Input(
        0,
        "X",
        "See SampleAs.")
    .Input(
        1,
        "labels",
        "See SampleAs."
    )
    .Input(
        2,
        "dY",
        "Gradient of forward output 0 (Y).")
    .Output(
        0,
        "dX",
        "Gradient of forward input 0 (X).");
// Gradient maker for SampleAs: the backward op consumes the forward data
// input (for its shape), the labels (for row routing), and the output
// gradient; it produces the gradient w.r.t. X.
class GetSampleAsGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    vector<string> grad_inputs{I(0), I(1), GO(0)};
    vector<string> grad_outputs{GI(0)};
    return SingleGradientDef(
        "SampleAsGradient", "", grad_inputs, grad_outputs);
  }
};
REGISTER_GRADIENT(SampleAs, GetSampleAsGradient);
} // namespace caffe2

View File

@ -1,116 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* SampleAs by Kaiming He for Mask R-CNN
X.dim32(0) = L.dim32(0)
Y's output samples are the samples of X for which L > 0.
*/
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/sample_as_op.h"
#include <stdio.h>
namespace caffe2 {
// CUDA forward for SampleAs: copies the labels to the host, counts how many
// are > 0, allocates Y with that many rows, then copies the matching rows of
// X device-to-device one at a time. The FinishDeviceComputation() call is
// required: the subsequent host-side loop reads `labels`.
template <>
bool SampleAsOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Input data to be sliced
  auto& L = Input(1); // Target data that provide the identity
  CAFFE_ENFORCE(
      X.dim32(0) == L.dim32(0),
      "X.dim32(0) must be equal to L.dim32(0)",
      "(",
      X.dim32(0),
      " vs. ",
      L.dim32(0),
      ")");
  // copy L to CPU:
  std::vector<int> labels(L.dim32(0));
  context_.CopyBytes<CUDAContext, CPUContext>(
      L.dim32(0) * sizeof(int), L.data<int>(), &labels[0]);
  // Make sure that the copy is finished
  context_.FinishDeviceComputation();
  int count = 0;
  for (int i = 0; i < L.dim32(0); i++) {
    if (labels[i] > 0) {
      count++;
    }
  }
  // NOTE(review): assert() is compiled out under NDEBUG, so a release build
  // silently produces an empty Y when no label is positive.
  assert(count > 0);
  // resize Y
  vector<int64_t> out_shape(X.sizes().vec());
  out_shape[0] = count;
  auto* Y = Output(0, out_shape, at::dtype<float>()); // Sliced data (Y.dim32(0) = num of (L > 0))
  // Bytes per batch row of X.
  const int len = X.size() / X.dim32(0);
  float* output = Y->mutable_data<float>();
  for (int i = 0; i < L.dim32(0); i++) {
    if (labels[i] > 0) {
      context_.CopyBytes<CUDAContext, CUDAContext>(
          len * sizeof(float), X.data<float>() + i * len, output);
      output += len;
    } // if
  } // i
  return true;
}
// CUDA backward for SampleAs: zero-fills dX, then scatters the rows of dY
// back to the positions whose label is > 0, in the same order the forward
// pass gathered them. Labels are copied to the host first because the
// scatter loop runs on the CPU.
template <>
bool SampleAsGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X  = Input(0);
  auto& L  = Input(1);
  auto& dY = Input(2);
  auto* dX = Output(0, X.sizes(), at::dtype<float>());
  // copy L to CPU:
  std::vector<int> labels(L.dim32(0));
  context_.CopyBytes<CUDAContext, CPUContext>(
      L.dim32(0) * sizeof(int), L.data<int>(), &labels[0]);
  // Make sure that the copy is finished
  context_.FinishDeviceComputation();
  // zero-out dX: rows with label <= 0 receive no gradient.
  math::Set<float, CUDAContext>(
      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
  // Elements per batch row of X.
  const int len = X.size() / X.dim32(0);
  const float* input = dY.data<float>();
  for (int i = 0; i < L.dim32(0); i++) {
    if (labels[i] > 0) {
      context_.CopyBytes<CUDAContext, CUDAContext>(
          len * sizeof(float), input, dX->mutable_data<float>() + i * len);
      input += len;
    } // if
  } // i
  return true;
}
REGISTER_CUDA_OPERATOR(SampleAs, SampleAsOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    SampleAsGradient,
    SampleAsGradientOp<float, CUDAContext>);
} // namespace caffe2

View File

@ -1,55 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SAMPLE_AS_OP_H_
#define SAMPLE_AS_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// SampleAs: keeps the batch rows of X whose corresponding label is > 0.
// There is no CPU implementation; the CUDA specialization does the copy.
template <typename T, class Context>
class SampleAsOp final : public Operator<Context> {
 public:
  SampleAsOp(const OperatorDef& def, Workspace* workspace)
      : Operator<Context>(def, workspace) {}
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  // Deliberately unimplemented on CPU.
  bool RunOnDevice() override {
    CAFFE_NOT_IMPLEMENTED;
  }
};
// Gradient of SampleAs. Only the CUDA specialization (in the .cu file) is
// implemented; this generic template aborts at runtime.
template <typename T, class Context>
class SampleAsGradientOp final : public Operator<Context> {
 public:
  SampleAsGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws) {}
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }
};
} // namespace caffe2
#endif // SAMPLE_AS_OP_H_

View File

@ -1,107 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "select_smooth_l1_loss_op.h"
namespace caffe2 {
// CPU registrations. Note: these CPU ops only raise CAFFE_NOT_IMPLEMENTED;
// the real implementation is CUDA-only (see the .cu file).
REGISTER_CPU_OPERATOR(
    SelectSmoothL1Loss,
    SelectSmoothL1LossOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SelectSmoothL1LossGradient,
    SelectSmoothL1LossGradientOp<float, CPUContext>);

// Operator schemas: declare input/output arity, argument defaults, and the
// user-facing documentation strings shown by Caffe2 tooling.
OPERATOR_SCHEMA(SelectSmoothL1Loss)
    .NumInputs(4)
    .NumOutputs(1)
    .SetDoc(R"DOC(
RetinaNet specific op for computing Smooth L1 Loss at select locations in a 4D
tensor that encodes bounding box regression predictions.
)DOC")
    .Arg(
        "beta",
        "(float) default 1.0; L2 to L1 transition point.")
    .Arg(
        "scale",
        "(float) default 1.0; multiply the loss by this scale factor.")
    .Input(
        0,
        "Y_hat",
        "4D tensor of bounding box regression predictions with shape "
        "(N, 4 * num_bbox_classes * num_anchors, H, W).")
    .Input(
        1,
        "Y",
        "2D tensor of labels shape (M, 4) for 4 contiguous channels starting "
        "at each of the M locations selected by the locations input.")
    .Input(
        2,
        "locations",
        "2D tensor of shape (M, 4) that identifies M 'select' locations "
        "encoded by the four columns: (n, c, y, x). The loss is computed on the "
        "four contiguous channel locations [c, c + 3] (inclusive).")
    .Input(
        3,
        "normalizer",
        "Scalar; the loss is divided by max(1, normalizer).")
    .Output(
        0,
        "loss",
        "Scalar loss.");

OPERATOR_SCHEMA(SelectSmoothL1LossGradient)
    .NumInputs(5)
    .NumOutputs(1)
    .Input(
        0,
        "Y_hat",
        "See SelectSmoothL1Loss.")
    .Input(
        1,
        "Y",
        "See SelectSmoothL1Loss.")
    .Input(
        2,
        "locations",
        "See SelectSmoothL1Loss.")
    .Input(
        3,
        "normalizer",
        "See SelectSmoothL1Loss.")
    .Input(
        4,
        "d_loss",
        "Gradient of forward output 0 (loss).")
    .Output(
        0,
        "d_Y_hat",
        "Gradient of forward input 0 (Y_hat).");
// Autograd hookup: the gradient op consumes all four forward inputs plus the
// upstream d(loss) and produces d(Y_hat).
class GetSelectSmoothL1LossGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "SelectSmoothL1LossGradient",
        "",
        vector<string>{I(0), I(1), I(2), I(3), GO(0)},
        vector<string>{GI(0)});
  }
};

REGISTER_GRADIENT(SelectSmoothL1Loss, GetSelectSmoothL1LossGradient);
} // namespace caffe2

View File

@ -1,189 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/select_smooth_l1_loss_op.h"
namespace caffe2 {
namespace {
// Elementwise smooth-L1 over the 4 regression channels of each of the M
// selected (n, c, y, x) locations; each value is divided by the normalizer
// max(S[0], 1) before being written to `out` at its NCHW offset.
__global__ void SelectSmoothL1Kernel(
    const int D, const int H, const int W,
    const int M, const float* Y_hat, const float* Y, const float* L, float* out,
    const float* S, const float beta) {
  // f(x) = 0.5 * x^2 / beta      if |x| < beta
  //        |x| - 0.5 * beta      otherwise
  CUDA_1D_KERNEL_LOOP(i, M) {
    // L stores (n, c, y, x) rows as floats; truncation recovers the indices
    int n = L[i * 4];
    int c = L[i * 4 + 1];
    int y = L[i * 4 + 2];
    int x = L[i * 4 + 3];

    for (int j = 0; j < 4; j++){
      // Y_hat: N x (A * CLS * 4) x H x W
      int ind = n * (D * H * W) + (c + j) * (H * W) + y * W + x;
      float y_hat = Y_hat[ind];
      // NOTE(review): this float `y` shadows the spatial index `y` above;
      // harmless here because `ind` is already computed.
      float y = Y[i * 4 + j];
      float val = y_hat - y;
      float abs_val = c10::cuda::compat::abs(val);
      if (abs_val < beta) {
        out[ind] = (0.5 * val * val / beta) / c10::cuda::compat::max(S[0], static_cast<float>(1.0));
      } else {
        out[ind] = (abs_val - 0.5 * beta) / c10::cuda::compat::max(S[0], static_cast<float>(1.0));
      }
    }
  }
}
// Gradient of the select smooth-L1 loss: writes norm * d_loss * f'(y_hat - y)
// at the same NCHW offsets, divided by the normalizer max(S[0], 1).
__global__ void SelectSmoothL1GradientKernel(
    const int D, const int H, const int W,
    const int M,
    const float* Y_hat,
    const float* Y,
    const float* L,
    float* out,
    const float* d_loss_data,
    float norm,
    const float* S,
    float beta) {
  // f'(x) = x / beta     if |x| < beta
  //       = sign(x)      otherwise
  // We also scale by norm * d_loss in this kernel for convenience
  CUDA_1D_KERNEL_LOOP(i, M) {
    int n = L[i * 4];
    int c = L[i * 4 + 1];
    int y = L[i * 4 + 2];
    int x = L[i * 4 + 3];
    float d_loss = *d_loss_data;  // upstream scalar gradient

    for (int j = 0; j < 4; j++) {
      int ind = n * (D * H * W) + (c + j) * (H * W) + y * W + x;
      float y_hat = Y_hat[ind];
      // NOTE(review): shadows the spatial index `y`; `ind` already computed.
      float y = Y[i * 4 + j];
      float val = y_hat - y;
      float abs_val = c10::cuda::compat::abs(val);
      if (abs_val < beta) {
        out[ind] = norm * d_loss * val / beta / c10::cuda::compat::max(S[0], static_cast<float>(1.0));
      } else {
        // ((0 < val) - (val < 0)) evaluates to sign(val) in {-1, 0, +1}
        out[ind] = norm * d_loss * ((float(0) < val) - (val < float(0))) / c10::cuda::compat::max(S[0], static_cast<float>(1.0));
      }
    }
  }
}
} // namespace
// CUDA forward pass: scalar SelectSmoothL1 loss over the M selected
// (n, c, y, x) locations; the loss is defined to be zero when there are no
// foreground boxes.
// Fix: removed the unused local `int N = Y_hat.dim32(0)` — the kernel only
// needs the channel/spatial dims (D, H, W) and the row count M.
template<>
bool SelectSmoothL1LossOp<float, CUDAContext>::RunOnDevice() {
  // bbox targets predictions, for example: N x (A * 4) H x W in cls-agnostic case
  auto& Y_hat = Input(0);
  // true targets: for example: M x 4 where M is the #fg boxes per fpn level
  auto& Y = Input(1);
  // locations of fg boxes: M x 4
  auto& L = Input(2);
  // total number of fg boxes across all FPN levels: scalar
  auto& S = Input(3);

  auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
  if (Y.size() == 0){
    // No selected locations at all: emit zero loss.
    math::Set<float, CUDAContext>(
        1, static_cast<float>(0), avg_loss->mutable_data<float>(), &context_);
    return true;
  }

  int D = Y_hat.dim32(1);
  int H = Y_hat.dim32(2);
  int W = Y_hat.dim32(3);
  int M = Y.dim32(0);

  // initialization: zero the output and the per-element buffer
  buff_.ResizeLike(Y_hat);
  math::Set<float, CUDAContext>(
      1, static_cast<float>(0), avg_loss->mutable_data<float>(), &context_);
  math::Set<float, CUDAContext>(
      buff_.size(), 0.0, buff_.mutable_data<float>(), &context_);

  // Element-wise smooth l1 loss
  // l := SelectSmoothL1((y_hat - y))
  SelectSmoothL1Kernel<<<CAFFE_GET_BLOCKS(buff_.size()),
                         CAFFE_CUDA_NUM_THREADS,
                         0, context_.cuda_stream()>>>(
      D, H, W,
      M, Y_hat.data<float>(), Y.data<float>(),
      L.data<float>(), buff_.mutable_data<float>(),
      S.data<float>(), beta_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Sum of all losses
  // al := sum_i l_i
  float* avg_loss_data = avg_loss->mutable_data<float>();
  math::Sum<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), avg_loss_data, &context_);
  // Average of input batch size
  math::Scale<float, float, CUDAContext>(
      1, scale_, avg_loss_data, avg_loss_data, &context_);
  return true;
}
// CUDA backward pass: writes d(loss)/d(Y_hat); positions that were not
// selected keep a zero gradient because d_Y_hat is zero-filled first.
// Fix: removed the unused local `int N = Y_hat.dim32(0)` — the kernel only
// needs the channel/spatial dims (D, H, W) and the row count M.
template<>
bool SelectSmoothL1LossGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& Y_hat = Input(0);
  auto& Y = Input(1);
  auto& L = Input(2);
  auto& S = Input(3);
  // Below is gradient of net w.r.t. avg_loss ("gradOutput"), should be all 1's
  auto& d_avg_loss = Input(4);

  auto* d_Y_hat = Output(0, Y_hat.sizes(), at::dtype<float>()); // gradient of net w.r.t. Y_hat ("gradInput")
  math::Set<float, CUDAContext>(
      d_Y_hat->size(), 0.0, d_Y_hat->mutable_data<float>(), &context_);
  if (Y.size() == 0){
    // No selected locations: the zero-filled gradient is already correct.
    return true;
  }

  int D = Y_hat.dim32(1);
  int H = Y_hat.dim32(2);
  int W = Y_hat.dim32(3);
  int M = Y.dim32(0);

  // Element-wise weighted difference (can be used to ignore or reweight
  // specific components)
  // d := (y_hat - y)
  // d_Y_hat := d_avg_loss * SelectSmoothL1'((y_hat - y))
  SelectSmoothL1GradientKernel<<<CAFFE_GET_BLOCKS(d_Y_hat->size()),
                                 CAFFE_CUDA_NUM_THREADS,
                                 0, context_.cuda_stream()>>>(
      D, H, W, M, Y_hat.data<float>(), Y.data<float>(),
      L.data<float>(), d_Y_hat->mutable_data<float>(),
      d_avg_loss.data<float>(), scale_, S.data<float>(), beta_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
// CUDA registrations; these are the only real implementations (the CPU
// templates raise CAFFE_NOT_IMPLEMENTED).
REGISTER_CUDA_OPERATOR(SelectSmoothL1Loss,
                       SelectSmoothL1LossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SelectSmoothL1LossGradient,
                       SelectSmoothL1LossGradientOp<float, CUDAContext>);

} // namespace caffe2

View File

@ -1,77 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SELECT_SMOOTH_L1_LOSS_OP_H_
#define SELECT_SMOOTH_L1_LOSS_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Smooth-L1 loss at selected locations (RetinaNet bbox regression).
// Only the CUDA specialization is implemented; this generic template aborts.
// Args: beta (> 0, L2->L1 transition point), scale (>= 0, loss multiplier).
template <typename T, class Context>
class SelectSmoothL1LossOp final : public Operator<Context> {
 public:
  SelectSmoothL1LossOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        beta_(this->template GetSingleArgument<float>("beta", 1.)),
        scale_(this->template GetSingleArgument<float>("scale", 1.)) {
    CAFFE_ENFORCE(beta_ > 0);
    CAFFE_ENFORCE(scale_ >= 0);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float beta_; // Transition point from L1 to L2 loss
  float scale_; // Scale the loss by scale_
  int dim_; // dimension for 1 anchor prediction; NOTE(review): never set or read in visible code
  Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
};
// Gradient of SelectSmoothL1Loss; mirrors the forward op's arguments.
// Only the CUDA specialization is implemented; this generic template aborts.
template <typename T, class Context>
class SelectSmoothL1LossGradientOp final : public Operator<Context> {
 public:
  SelectSmoothL1LossGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        beta_(this->template GetSingleArgument<float>("beta", 1.)),
        scale_(this->template GetSingleArgument<float>("scale", 1.)) {
    CAFFE_ENFORCE(beta_ > 0);
    CAFFE_ENFORCE(scale_ >= 0);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float beta_; // Transition point from L1 to L2 loss
  float scale_; // Scale the loss by scale_
  int dim_; // dimension for 1 anchor prediction; NOTE(review): never set or read in visible code
  Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
};
} // namespace caffe2
#endif // SELECT_SMOOTH_L1_LOSS_OP_H_

View File

@ -1,96 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sigmoid_cross_entropy_loss_op.h"
namespace caffe2 {
// CPU registrations. Note: these CPU ops only raise CAFFE_NOT_IMPLEMENTED;
// the real implementation is CUDA-only (see the .cu file).
REGISTER_CPU_OPERATOR(
    SigmoidCrossEntropyLoss,
    SigmoidCrossEntropyLossOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SigmoidCrossEntropyLossGradient,
    SigmoidCrossEntropyLossGradientOp<float, CPUContext>);

// Operator schemas: arity, argument defaults, and user-facing documentation.
OPERATOR_SCHEMA(SigmoidCrossEntropyLoss)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Compute sigmoid activations followed by averaged binary cross entropy loss. The
target values may be in {-1, 0, 1}, where -1 indicates that the corresponding
sample should be ignored and {0, 1} correspond to the binary classes 0 and 1. By
default the loss is divided by the number of targets > -1 and then multiplied by
the `scale` op argument. The divisive normalization may be disable by setting
the op argument `normalize` to 0 (the multiplication by `scale` still takes
effect).
This op fuses sigmoid and cross entropy for numerical stability in both forward
and gradient computation.
)DOC")
    .Arg(
        "scale",
        "(float) default 1.0; multiply the loss by this scale factor.")
    .Arg(
        "normalize",
        "(int) default 1; if true, divide the loss by the number of targets > "
        "-1.")
    .Input(
        0,
        "X",
        "Tensor of predicted logits (shape must be at least 1D).")
    .Input(
        1,
        "targets",
        "Tensor of targets of type int and same shape as logits X.")
    .Output(
        0,
        "loss",
        "Scalar loss.");

OPERATOR_SCHEMA(SigmoidCrossEntropyLossGradient)
    .NumInputs(3)
    .NumOutputs(1)
    .Input(
        0,
        "X",
        "See SigmoidCrossEntropyLoss.")
    .Input(
        1,
        "targets",
        "See SigmoidCrossEntropyLoss.")
    .Input(
        2,
        "d_loss",
        "Gradient of forward output 0 (loss).")
    .Output(
        0,
        "dX",
        "Gradient of forward input 0 (X).");
// Autograd hookup: the gradient op consumes X, targets, and the upstream
// d(loss), producing dX.
class GetSigmoidCrossEntropyLossGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "SigmoidCrossEntropyLossGradient",
        "",
        vector<string>{I(0), I(1), GO(0)},
        vector<string>{GI(0)});
  }
};

REGISTER_GRADIENT(SigmoidCrossEntropyLoss, GetSigmoidCrossEntropyLossGradient);
} // namespace caffe2

View File

@ -1,190 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/sigmoid_cross_entropy_loss_op.h"
namespace caffe2 {
namespace {
// In-place elementwise lower clamp: data[i] = max(data[i], a).
// (The ternary keeps the original's NaN handling: a NaN element is replaced
// by `a`, since NaN > a is false.)
__global__ void ElementwiseMaxKernel(const int n, float* data, const float a) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    const float v = data[i];
    data[i] = (v > a) ? v : a;
  }
}
// Per-element numerically-stable sigmoid cross entropy.
// targets[index] == -1 means "ignore": zero loss and zero count; otherwise
// the count is 1 so counts can later be summed into a normalizer.
__global__ void SigmoidCrossEntropyLossKernel(
    const int n,
    const float* logits,
    const int* targets,
    float* losses,
    float* counts) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    if (targets[index] == -1) {
      losses[index] = 0.;
      counts[index] = 0.;
    } else {
      // Stable rearrangement of -t*x + log(1 + e^x): the expf argument is
      // always <= 0, so it cannot overflow for large |x|.
      losses[index] =
          -1. * logits[index] * (targets[index] - (logits[index] >= 0)) +
          logf(
              1 +
              expf(logits[index] - 2 * logits[index] * (logits[index] >= 0)));
      counts[index] = 1.;
    }
  }
}
// Per-element gradient of the fused loss: d/dx = sigmoid(x) - target
// (zero for ignored targets == -1; counts mirror the forward kernel).
__global__ void SigmoidCrossEntropyLossGradientKernel(
    const int n,
    const float* logits,
    const int* targets,
    float* d_logits,
    float* counts) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    if (targets[index] == -1) {
      d_logits[index] = 0.;
      counts[index] = 0.;
    } else {
      d_logits[index] = 1. / (1. + expf(-logits[index])) - targets[index];
      counts[index] = 1.;
    }
  }
}
} // namespace
// CUDA forward pass: sums the per-element losses, optionally divides by the
// number of non-ignored targets (clamped to >= 1e-5), then scales by scale_.
template <>
bool SigmoidCrossEntropyLossOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& T = Input(1);

  CAFFE_ENFORCE(
      X.size() == T.size(),
      "Logit and target must have the same size",
      "(",
      X.size(),
      " vs. ",
      T.size(),
      ")");
  auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
  counts_.ResizeLike(X);
  losses_.ResizeLike(X);
  ReinitializeTensor(&normalizer_, vector<int64_t>(), at::dtype<float>().device(CUDA));
  SigmoidCrossEntropyLossKernel<<<
      CAFFE_GET_BLOCKS(X.size()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      X.size(),
      X.data<float>(),
      T.data<int>(),
      losses_.mutable_data<float>(),
      counts_.mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  float* avg_loss_data = avg_loss->mutable_data<float>();
  math::Sum<float, CUDAContext>(
      losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
  if (normalize_) {
    float* normalizer_data = normalizer_.mutable_data<float>();
    math::Sum<float, CUDAContext>(
        counts_.size(), counts_.data<float>(), normalizer_data, &context_);
    // Prevent division by zero if all counts are zero
    ElementwiseMaxKernel<<<
        CAFFE_GET_BLOCKS(normalizer_.size()),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5);
    C10_CUDA_KERNEL_LAUNCH_CHECK();

    math::Div<float, CUDAContext>(
        1, avg_loss_data, normalizer_data, avg_loss_data, &context_);
  }
  math::Scale<float, float, CUDAContext>(
      1, scale_, avg_loss_data, avg_loss_data, &context_);

  return true;
}
// CUDA backward pass: dX = d_avg_loss * scale_ * (sigmoid(X) - T), with the
// same optional count-based normalization as the forward pass.
template <>
bool SigmoidCrossEntropyLossGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& T = Input(1);
  auto& d_avg_loss = Input(2);

  auto* dX = Output(0, X.sizes(), at::dtype<float>());
  counts_.ResizeLike(X);
  ReinitializeTensor(&normalizer_, vector<int64_t>(), at::dtype<float>().device(CUDA));
  SigmoidCrossEntropyLossGradientKernel<<<
      CAFFE_GET_BLOCKS(X.size()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      X.size(),
      X.data<float>(),
      T.data<int>(),
      dX->mutable_data<float>(),
      counts_.mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  if (normalize_) {
    float* normalizer_data = normalizer_.mutable_data<float>();
    math::Sum<float, CUDAContext>(
        counts_.size(), counts_.data<float>(), normalizer_data, &context_);
    // Prevent division by zero if all counts are zero
    ElementwiseMaxKernel<<<
        CAFFE_GET_BLOCKS(normalizer_.size()),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5);
    C10_CUDA_KERNEL_LAUNCH_CHECK();

    // Fold d_avg_loss / normalizer * scale_ into a single scalar, then scale dX
    math::Div<float, CUDAContext>(
        1,
        d_avg_loss.data<float>(),
        normalizer_data,
        normalizer_data,
        &context_);
    math::Scale<float, float, CUDAContext>(
        1, scale_, normalizer_data, normalizer_data, &context_);
    math::Scale<float, float, CUDAContext>(
        dX->size(),
        normalizer_data,
        dX->data<float>(),
        dX->mutable_data<float>(),
        &context_);
  } else {
    // Unnormalized: scale elementwise by scale_ and then by the upstream scalar
    math::Scale<float, float, CUDAContext>(
        dX->size(),
        scale_,
        dX->data<float>(),
        dX->mutable_data<float>(),
        &context_);
    math::Scale<float, float, CUDAContext>(
        dX->size(),
        d_avg_loss.data<float>(),
        dX->data<float>(),
        dX->mutable_data<float>(),
        &context_);
  }
  return true;
}
// CUDA registrations; the generic templates in the header are stubs only.
REGISTER_CUDA_OPERATOR(
    SigmoidCrossEntropyLoss,
    SigmoidCrossEntropyLossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    SigmoidCrossEntropyLossGradient,
    SigmoidCrossEntropyLossGradientOp<float, CUDAContext>);

} // namespace caffe2

View File

@ -1,78 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SIGMOID_CROSS_ENTROPY_LOSS_OP_H_
#define SIGMOID_CROSS_ENTROPY_LOSS_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Fused sigmoid + binary cross entropy loss. Only the CUDA specialization is
// implemented; this generic template aborts.
// Args: scale (>= 0, loss multiplier), normalize (0 or 1; divide by the
// number of non-ignored targets when 1).
template <typename T, class Context>
class SigmoidCrossEntropyLossOp final : public Operator<Context> {
 public:
  SigmoidCrossEntropyLossOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        scale_(this->template GetSingleArgument<float>("scale", 1.)),
        normalize_(this->template GetSingleArgument<int>("normalize", 1)) {
    CAFFE_ENFORCE(scale_ >= 0);
    CAFFE_ENFORCE(normalize_ == 0 || normalize_ == 1);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float scale_;      // multiplies the final loss
  int normalize_;    // 1: divide by #(targets > -1)
  Tensor losses_{Context::GetDeviceType()};  // per-element loss buffer
  Tensor counts_{Context::GetDeviceType()};  // 1 per non-ignored element
  Tensor normalizer_;  // scalar scratch for the normalizer
};
// Gradient of SigmoidCrossEntropyLoss; mirrors the forward op's arguments.
// Only the CUDA specialization is implemented; this generic template aborts.
template <typename T, class Context>
class SigmoidCrossEntropyLossGradientOp final : public Operator<Context> {
 public:
  SigmoidCrossEntropyLossGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        scale_(this->template GetSingleArgument<float>("scale", 1.)),
        normalize_(this->template GetSingleArgument<int>("normalize", 1)) {
    CAFFE_ENFORCE(scale_ >= 0);
    CAFFE_ENFORCE(normalize_ == 0 || normalize_ == 1);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float scale_;      // multiplies the final gradient
  int normalize_;    // 1: divide by #(targets > -1)
  Tensor counts_{Context::GetDeviceType()};  // 1 per non-ignored element
  Tensor normalizer_;  // scalar scratch for the normalizer
};
} // namespace caffe2
#endif // SIGMOID_CROSS_ENTROPY_LOSS_OP_H_

View File

@ -1,119 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sigmoid_focal_loss_op.h"
namespace caffe2 {
// CPU registrations. Note: these CPU ops only raise CAFFE_NOT_IMPLEMENTED;
// the real implementation is CUDA-only (see the .cu file).
REGISTER_CPU_OPERATOR(SigmoidFocalLoss, SigmoidFocalLossOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SigmoidFocalLossGradient,
    SigmoidFocalLossGradientOp<float, CPUContext>);

// Operator schemas: arity, argument defaults, and user-facing documentation.
OPERATOR_SCHEMA(SigmoidFocalLoss)
    .NumInputs(3)
    .NumOutputs(1)
    .SetDoc(R"DOC(
The binary form of Focal Loss designed for use in RetinaNet-like models.
The input is assumed to be unnormalized scores (sometimes called 'logits')
arranged in a 4D tensor with shape (N, C, H, W), where N is the number of
elements in the batch, H and W are the height and width, and C = num_anchors *
num_classes defines num_anchors 'groups' of logits, each of length
num_classes. For the binary form of Focal Loss, num_classes does not include
the background category. (So, for COCO, num_classes = 80, not 81.)
The binary form of focal loss is:
FL(p_t) = -alpha * (1 - p_t)**gamma * log(p_t),
where p = sigmoid(x), p_t = p or 1 - p depending on if the label is 1 or 0,
respectively.
See: https://arxiv.org/abs/1708.02002 for details.
)DOC")
    .Arg(
        "scale",
        "(float) default 1.0; multiply the loss by this scale factor.")
    .Arg(
        "alpha",
        "(float) default 0.25; Focal Loss's alpha hyper-parameter.")
    .Arg(
        "gamma",
        "(float) default 1.0; Focal Loss's gamma hyper-parameter.")
    .Arg(
        "num_classes",
        "(int) default 80; number of classes (excluding background).")
    .Input(
        0,
        "logits",
        "4D tensor of sigmoid inputs (called 'scores' or 'logits') with shape "
        "(N, C, H, W), where C = num_anchors * num_classes.")
    .Input(
        1,
        "labels",
        "4D tensor of labels with shape (N, num_anchors, H, W). Each entry is "
        "a class label in [0, num_classes - 1] (inclusive). The label "
        "identifies the one class that should have a sigmoid target of 1.")
    .Input(
        2,
        "normalizer",
        "Scalar; the loss is normalized by 1 / max(1, normalizer)."
    )
    .Output(
        0,
        "loss",
        "Scalar loss.");

OPERATOR_SCHEMA(SigmoidFocalLossGradient)
    .NumInputs(4)
    .NumOutputs(1)
    .Input(
        0,
        "logits",
        "See SigmoidFocalLoss.")
    .Input(
        1,
        "labels",
        "See SigmoidFocalLoss.")
    .Input(
        2,
        "normalizer",
        "See SigmoidFocalLoss.")
    .Input(
        3,
        "d_loss",
        "Gradient of forward output 0 (loss)")
    .Output(
        0,
        "d_logits",
        "Gradient of forward input 0 (logits)");
// Autograd hookup: the gradient op consumes the three forward inputs plus the
// upstream d(loss), producing d(logits).
class GetSigmoidFocalLossGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    vector<string> blob_names{
        {I(0), I(1), I(2), GO(0)},
    };
    return SingleGradientDef(
        "SigmoidFocalLossGradient", "", blob_names, vector<string>{GI(0)});
  }
};

REGISTER_GRADIENT(SigmoidFocalLoss, GetSigmoidFocalLossGradient);
} // namespace caffe2

View File

@ -1,185 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/sigmoid_focal_loss_op.h"
namespace caffe2 {
namespace {
// Forward binary focal loss per logit element (see arXiv:1708.02002).
// Each of the N*D*H*W logits belongs to (anchor a, class d); `targets` holds
// the 1-based true class per anchor. Losses are pre-divided by
// Np = max(weight_pos[0], 1).
__global__ void SigmoidFocalLossKernel(
    const int N, const int D, const int H, const int W, const float* logits,
    const int* targets, const float* weight_pos,
    const float gamma, const float alpha,
    const int num_classes, float* losses) {
  CUDA_1D_KERNEL_LOOP(i, N * D * H * W) {
    int x = i % W;
    int y = (i / W) % H;
    int c = (i / (W * H)) % D; // channel, here D is channel dim in input NxDxHxW
    int n = i / (W * H * D); // n in NxDxHxW

    int A = D / num_classes; // num_anchors = A
    int a = c / num_classes; // current anchor out of A anchors in D = A * num_cls
    int d = c % num_classes; // current class
    int t = targets[n * (H * W * A) + a * (H * W) + y * W + x]; // target

    // check whether the class is true class or not.
    // The target classes are in range 1 - 81 and the d is in range 0-80
    // because we predict A*80 dim, so for comparison purpose, compare t and (d+1)
    float c1 = (t == (d + 1));
    // bitwise & on the two 0/1 comparison results == logical AND (no branch)
    float c2 = (t != -1 & t != (d + 1));
    float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
    float zn = (1.0 - alpha) / Np;  // weight for negative (wrong-class) terms
    float zp = alpha / Np;          // weight for positive (true-class) terms

    // p = 1. / 1. + expf(-x)
    float p = 1. / (1. + expf(-logits[i]));

    // (1 - p)**gamma * log(p), with p clamped away from 0 before the log
    float term1 = powf((1. - p), gamma) * logf(c10::cuda::compat::max(p, FLT_MIN));

    // p**gamma * log(1 - p), in the overflow-safe rearranged form
    float term2 =
        powf(p, gamma) *
        (-1. * logits[i] * (logits[i] >= 0) -
         logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0))));
    losses[i] = 0.0;
    losses[i] += -c1 * term1 * zp;
    losses[i] += -c2 * term2 * zn;
  }
}
// Gradient of the binary focal loss w.r.t. each logit, pre-multiplied by the
// upstream scalar gradient avg_loss[0] and divided by Np = max(weight_pos[0], 1).
__global__ void SigmoidFocalLossGradientKernel(
    const int N, const int D, const int H, const int W, const float* logits,
    const int* targets, float* dX_data, const float* weight_pos,
    const float gamma, const float alpha, const int num_classes,
    const float* avg_loss) {
  CUDA_1D_KERNEL_LOOP(i, N * D * H * W) {
    float a_loss = avg_loss[0];  // upstream scalar gradient
    int x = i % W;
    int y = (i / W) % H;
    int c = (i / (W * H)) % D;
    int n = i / (W * H * D);

    int A = D / num_classes; // num_anchors
    int a = c / num_classes; // current anchor
    int d = c % num_classes; // current class

    float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
    float zn = (1.0 - alpha) / Np;
    float zp = alpha / Np;
    int t = targets[n * (H * W * A) + a * (H * W) + y * W + x];

    float c1 = (t == (d + 1));
    // bitwise & on the two 0/1 comparison results == logical AND (no branch)
    float c2 = (t != -1 & t != (d + 1));
    float p = 1. / (1. + expf(-logits[i]));

    // (1-p)**g * (1 - p - g*p*log(p))
    float term1 =
        powf((1. - p), gamma) *
        (1. - p - (p * gamma * logf(c10::cuda::compat::max(p, FLT_MIN))));

    // (p**g) * (g*(1-p)*log(1-p) - p), log(1-p) in the overflow-safe form
    float term2 =
        powf(p, gamma) *
        ((-1. * logits[i] * (logits[i] >= 0) -
          logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) *
         (1. - p) * gamma - p);
    dX_data[i] = 0.0;
    dX_data[i] += -c1 * zp * term1;
    dX_data[i] += -c2 * zn * term2;
    dX_data[i] = dX_data[i] * a_loss;
  }
}
} // namespace
// CUDA forward pass: per-element focal losses are summed and scaled by scale_.
template<>
bool SigmoidFocalLossOp<float, CUDAContext>::RunOnDevice() {
  // Input logits, for example: N x (A * 80) x H x W in cls-agnostic
  auto& X = Input(0);
  // Target, for example: N x A x H x W
  auto& T = Input(1);
  // Number of positive examples: scalar
  auto& wp = Input(2);
  // output avg Sigmoid focal loss as mentioned in RetinaNet paper
  int N = X.dim32(0);
  int D = X.dim32(1);
  int H = X.dim32(2);
  int W = X.dim32(3);

  auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
  losses_.ResizeLike(X);
  float* avg_loss_data = avg_loss->mutable_data<float>();

  SigmoidFocalLossKernel<<<CAFFE_GET_BLOCKS(X.size()),
      CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
    N, D, H, W, X.data<float>(), T.data<int>(),
    wp.data<float>(), gamma_, alpha_, num_classes_,
    losses_.mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  math::Sum<float, CUDAContext>(
      losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
  math::Scale<float, float, CUDAContext>(
      1, scale_, avg_loss_data, avg_loss_data, &context_);

  return true;
}
// CUDA backward pass: per-logit focal-loss gradient, scaled by scale_.
template<>
bool SigmoidFocalLossGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& T = Input(1);
  auto& wp = Input(2);
  // upstream gradient is the last input (scalar d_loss)
  auto& d_avg_loss = Input(InputSize() - 1);

  // get input shape
  int N = X.dim32(0);
  int D = X.dim32(1);
  int H = X.dim32(2);
  int W = X.dim32(3);

  auto* dX = Output(0, X.sizes(), at::dtype<float>());

  SigmoidFocalLossGradientKernel<<<CAFFE_GET_BLOCKS(X.size()),
      CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
    N, D, H, W, X.data<float>(), T.data<int>(), dX->mutable_data<float>(),
    wp.data<float>(), gamma_, alpha_, num_classes_,
    d_avg_loss.data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  math::Scale<float, float, CUDAContext>(
      dX->size(),
      scale_,
      dX->data<float>(),
      dX->mutable_data<float>(),
      &context_);
  return true;
}
// CUDA registrations; the generic templates in the header are stubs only.
REGISTER_CUDA_OPERATOR(SigmoidFocalLoss,
                       SigmoidFocalLossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SigmoidFocalLossGradient,
                       SigmoidFocalLossGradientOp<float, CUDAContext>);

} // namespace caffe2

View File

@ -1,83 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SIGMOID_FOCAL_LOSS_OP_H_
#define SIGMOID_FOCAL_LOSS_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Binary (sigmoid) focal loss operator (RetinaNet). Only the CUDA
// specialization is implemented; this generic template aborts.
// Args: scale (>= 0), num_classes (default 80, excluding background),
// gamma and alpha (focal loss hyper-parameters).
template <typename T, class Context>
class SigmoidFocalLossOp final : public Operator<Context> {
 public:
  SigmoidFocalLossOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        scale_(this->template GetSingleArgument<float>("scale", 1.)),
        num_classes_(this->template GetSingleArgument<int>("num_classes", 80)),
        gamma_(this->template GetSingleArgument<float>("gamma", 1.)),
        alpha_(this->template GetSingleArgument<float>("alpha", 0.25)) {
    CAFFE_ENFORCE(scale_ >= 0);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float scale_;      // multiplies the final loss
  int num_classes_;  // classes excluding background
  float gamma_;      // focusing parameter
  float alpha_;      // class-balance parameter
  Tensor losses_{Context::GetDeviceType()};  // per-element loss buffer
  Tensor counts_{Context::GetDeviceType()};  // NOTE(review): not read in visible code
};
// Gradient of SigmoidFocalLoss; mirrors the forward op's arguments.
// Only the CUDA specialization is implemented; this generic template aborts.
template <typename T, class Context>
class SigmoidFocalLossGradientOp final : public Operator<Context> {
 public:
  SigmoidFocalLossGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        scale_(this->template GetSingleArgument<float>("scale", 1.)),
        num_classes_(this->template GetSingleArgument<int>("num_classes", 80)),
        gamma_(this->template GetSingleArgument<float>("gamma", 1.)),
        alpha_(this->template GetSingleArgument<float>("alpha", 0.25)) {
    CAFFE_ENFORCE(scale_ >= 0);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float scale_;      // multiplies the final gradient
  int num_classes_;  // classes excluding background
  float gamma_;      // focusing parameter
  float alpha_;      // class-balance parameter
  Tensor counts_{Context::GetDeviceType()};   // NOTE(review): not read in visible code
  Tensor weights_{Context::GetDeviceType()};  // unignored weights
};
} // namespace caffe2
#endif // SIGMOID_FOCAL_LOSS_OP_H_

View File

@ -1,117 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "smooth_l1_loss_op.h"
namespace caffe2 {

// CPU registrations. The generic RunOnDevice in smooth_l1_loss_op.h is
// CAFFE_NOT_IMPLEMENTED, so these registrations primarily expose the schemas.
REGISTER_CPU_OPERATOR(SmoothL1Loss, SmoothL1LossOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SmoothL1LossGradient,
    SmoothL1LossGradientOp<float, CPUContext>);

// Schema: 4 inputs (predictions, labels, inside weights, outside weights),
// 1 scalar loss output.
OPERATOR_SCHEMA(SmoothL1Loss)
    .NumInputs(4)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Smooth L1 Loss is a minor variation of Huber loss in which the point of
transition between L2 loss and L1 loss is adjustable by a hyper-parameter beta:
SmoothL1(x) = 0.5 * x^2 / beta if |x| < beta
|x| - 0.5 * beta otherwise.
SmoothL1 is used in Fast R-CNN and descendants as the loss function for bounding
box regression.
The loss computed by this op has a flexible form:
scale / N * sum_i alpha_out[i] * SmoothL1(alpha_in[i] * (y_hat[i] - y[i])).
The weights alpha_in and alpha_out are called the "inside" and "outside"
weights, respectively. The inside weights are typically set to either 0 or 1 to
implement ignoring (when 0) certain samples. The outside weights can be used
to implement a per-sample loss weight. The overall loss is scaled by scale / N,
where N is the number of batch elements in the input predictions.
)DOC")
    .Arg(
        "beta",
        "(float) default 1.0; L2 to L1 transition point.")
    .Arg(
        "scale",
        "(float) default 1.0; multiply the loss by this scale factor.")
    .Input(
        0,
        "Y_hat",
        "Tensor of predictions (at least 1D).")
    .Input(
        1,
        "Y",
        "Tensor of labels with the same shape as Y_hat.")
    .Input(
        2,
        "alpha_in",
        "Tensor of inside weights with the same shape as Y.")
    .Input(
        3,
        "alpha_out",
        "Tensor of outside weights with the same shape as Y.")
    .Output(
        0,
        "loss",
        "Scalar loss.");

// Gradient schema: the forward inputs plus d_loss; produces only d_Y_hat.
OPERATOR_SCHEMA(SmoothL1LossGradient)
    .NumInputs(5)
    .NumOutputs(1)
    .Input(
        0,
        "Y_hat",
        "See SmoothL1Loss.")
    .Input(
        1,
        "Y",
        "See SmoothL1Loss.")
    .Input(
        2,
        "alpha_in",
        "See SmoothL1Loss.")
    .Input(
        3,
        "alpha_out",
        "See SmoothL1Loss.")
    .Input(
        4,
        "d_loss",
        "Gradient of forward output 0 (loss).")
    .Output(
        0,
        "d_Y_hat",
        "Gradient of forward input 0 (Y_hat).");

// Wires SmoothL1Loss to SmoothL1LossGradient: forwards all four inputs plus
// the incoming loss gradient; only input 0 (Y_hat) receives a gradient.
class GetSmoothL1LossGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "SmoothL1LossGradient",
        "",
        vector<string>{I(0), I(1), I(2), I(3), GO(0)},
        vector<string>{GI(0)});
  }
};

REGISTER_GRADIENT(SmoothL1Loss, GetSmoothL1LossGradient);

} // namespace caffe2

View File

@ -1,185 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/smooth_l1_loss_op.h"
namespace caffe2 {
namespace {

// Element-wise smooth-L1 (Huber-style) transform of `in` into `out`.
// beta is the L2/L1 transition point.
template <typename T>
__global__ void SmoothL1Kernel(
    const int n, const T* in, T* out, T beta) {
  // f(x) = 0.5 * x^2 / beta if |x| < beta
  //        |x| - 0.5 * beta otherwise
  CUDA_1D_KERNEL_LOOP(index, n) {
    T val = in[index];
    T abs_val = c10::cuda::compat::abs(val);
    if (abs_val < beta) {
      out[index] = 0.5 * val * val / beta;
    } else {
      out[index] = abs_val - 0.5 * beta;
    }
  }
}

// Element-wise derivative of smooth-L1, pre-multiplied by norm * d_loss.
// d_loss_data points at the scalar upstream gradient (read once per element).
template <typename T>
__global__ void SmoothL1GradientKernel(
    const int n,
    const T* in,
    T* out,
    const T* d_loss_data,
    T norm,
    T beta) {
  // f'(x) = x / beta if |x| < beta
  //       = sign(x) otherwise
  // We also scale by norm * d_loss in this kernel for convenience
  CUDA_1D_KERNEL_LOOP(index, n) {
    T val = in[index];
    T abs_val = c10::cuda::compat::abs(val);
    T d_loss = *d_loss_data;
    if (abs_val < beta) {
      out[index] = norm * d_loss * val / beta;
    } else {
      // (T(0) < val) - (val < T(0)) evaluates to sign(val) in {-1, 0, +1}.
      out[index] = norm * d_loss * ((T(0) < val) - (val < T(0)));
    }
  }
}

} // namespace
// Forward pass on CUDA:
//   avg_loss = scale / N * sum_i alpha_out[i] *
//              SmoothL1(alpha_in[i] * (Y_hat[i] - Y[i]))
// where N is the batch size (dim 0 of Y).
template<>
bool SmoothL1LossOp<float, CUDAContext>::RunOnDevice() {
  auto& Y_hat = Input(0);
  auto& Y = Input(1);
  auto& alpha_in = Input(2);
  auto& alpha_out = Input(3);

  int N = Y.dim32(0);
  // Require the same number of elements along axis 0 (batch size), but
  // otherwise don't care about the shape (just the number of elements)
  CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0),
      "Y_hat and Y must have the same number of elements along axis 0");
  CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(),
      "Y_hat and Y must have the same number of elements");
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size());
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size());

  // Output 0 is a scalar (empty sizes vector).
  auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
  buff_.ResizeLike(Y);

  // Difference
  // d := y_hat - y
  math::Sub<float, CUDAContext>(
      Y.size(), Y_hat.data<float>(), Y.data<float>(),
      buff_.mutable_data<float>(), &context_);

  // Element-wise weighted difference (can be used to ignore or reweight
  // specific components)
  // d := alpha_in * (y_hat - y)
  math::Mul<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), alpha_in.data<float>(),
      buff_.mutable_data<float>(), &context_);

  // Element-wise smooth l1 loss
  // l := SmoothL1(alpha_in * (y_hat - y))
  SmoothL1Kernel<float>
      <<<CAFFE_GET_BLOCKS(buff_.size()),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
          buff_.size(), buff_.data<float>(), buff_.mutable_data<float>(),
          beta_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Element-wise weighted smooth l1 loss (can be used to specify a per-element
  // loss weight)
  // l := alpha_out * SmoothL1(alpha_in * (y_hat - y))
  math::Mul<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), alpha_out.data<float>(),
      buff_.mutable_data<float>(), &context_);

  // Sum of all losses
  // al := sum_i l_i
  float* avg_loss_data = avg_loss->mutable_data<float>();
  math::Sum<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), avg_loss_data, &context_);

  // Average of input batch size
  // al := 1/N * al
  math::Scale<float, float, CUDAContext>(
      1, scale_ / N, avg_loss_data, avg_loss_data, &context_);

  return true;
}
// Backward pass on CUDA: computes d_Y_hat only.
//   d_Y_hat = alpha_in * alpha_out * d_avg_loss * scale / N *
//             SmoothL1'(alpha_in * (Y_hat - Y))
template<>
bool SmoothL1LossGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& Y_hat = Input(0);
  auto& Y = Input(1);
  auto& alpha_in = Input(2);
  auto& alpha_out = Input(3);
  auto& d_avg_loss = Input(4); // gradient of net w.r.t. avg_loss ("gradOutput")
  // We intentially don't compute gradients for Y, alpha_{in,out} since they
  // are not needed (can change in the future if desired)

  int N = Y.dim32(0);
  // Require the same number of elements along axis 0 (batch size), but
  // otherwise don't care about the shape (just the number of elements)
  CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0),
      "Y_hat and Y must have the same number of elements along axis 0");
  CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(),
      "Y_hat and Y must have the same number of elements");
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size());
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size());
  // Upstream gradient must be a single scalar.
  CAFFE_ENFORCE_EQ(d_avg_loss.size(), 1);

  auto* d_Y_hat = Output(0, Y_hat.sizes(), at::dtype<float>()); // gradient of net w.r.t. Y_hat ("gradInput")
  buff_.ResizeLike(Y);

  // Difference
  // d := y_hat - y
  math::Sub<float, CUDAContext>(
      Y.size(), Y_hat.data<float>(), Y.data<float>(),
      buff_.mutable_data<float>(), &context_);

  // Element-wise weighted difference (can be used to ignore or reweight
  // specific components)
  // d := alpha_in * (y_hat - y)
  math::Mul<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), alpha_in.data<float>(),
      buff_.mutable_data<float>(), &context_);

  // d_Y_hat := d_avg_loss / N * SmoothL1'(alpha_in * (y_hat - y))
  SmoothL1GradientKernel<float>
      <<<CAFFE_GET_BLOCKS(buff_.size()),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
          buff_.size(), buff_.data<float>(), d_Y_hat->mutable_data<float>(),
          d_avg_loss.data<float>(), scale_ / N, beta_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Element-wise scale by alpha_in and alpha_out
  math::Mul<float, CUDAContext>(
      d_Y_hat->size(), d_Y_hat->data<float>(), alpha_in.data<float>(),
      d_Y_hat->mutable_data<float>(), &context_);
  math::Mul<float, CUDAContext>(
      d_Y_hat->size(), d_Y_hat->data<float>(), alpha_out.data<float>(),
      d_Y_hat->mutable_data<float>(), &context_);

  return true;
}
// Register the CUDA specializations defined above.
REGISTER_CUDA_OPERATOR(SmoothL1Loss,
                       SmoothL1LossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SmoothL1LossGradient,
                       SmoothL1LossGradientOp<float, CUDAContext>);
} // namespace caffe2

View File

@ -1,75 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SMOOTH_L1_LOSS_OP_H_
#define SMOOTH_L1_LOSS_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Forward op for smooth-L1 (Huber-style) loss used for bounding-box
// regression. Only a CUDA specialization exists; the generic RunOnDevice
// aborts with CAFFE_NOT_IMPLEMENTED.
template <typename T, class Context>
class SmoothL1LossOp final : public Operator<Context> {
 public:
  SmoothL1LossOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        // "beta": L2-to-L1 transition point (must be > 0).
        beta_(this->template GetSingleArgument<float>("beta", 1.)),
        // "scale": multiplies the final loss (must be >= 0).
        scale_(this->template GetSingleArgument<float>("scale", 1.)) {
    CAFFE_ENFORCE(beta_ > 0);
    CAFFE_ENFORCE(scale_ >= 0);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float beta_; // Transition point from L1 to L2 loss
  float scale_; // Scale the loss by scale_
  Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
};
// Gradient op paired with SmoothL1LossOp; reads the same beta/scale arguments
// so backward matches forward. Only a CUDA specialization exists.
template <typename T, class Context>
class SmoothL1LossGradientOp final : public Operator<Context> {
 public:
  SmoothL1LossGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        beta_(this->template GetSingleArgument<float>("beta", 1.)),
        scale_(this->template GetSingleArgument<float>("scale", 1.)) {
    CAFFE_ENFORCE(beta_ > 0);
    CAFFE_ENFORCE(scale_ >= 0);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float beta_; // Transition point from L1 to L2 loss
  float scale_; // Scale the loss by scale_
  Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences
};
} // namespace caffe2
#endif // SMOOTH_L1_LOSS_OP_H_

View File

@ -1,104 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "modules/detectron/softmax_focal_loss_op.h"
#include "caffe2/operators/softmax_utils.h"
namespace caffe2 {

// CPU registrations. The generic RunOnDevice in softmax_focal_loss_op.h is
// CAFFE_NOT_IMPLEMENTED, so these registrations primarily expose the schemas.
REGISTER_CPU_OPERATOR(SoftmaxFocalLoss, SoftmaxFocalLossOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SoftmaxFocalLossGradient,
    SoftmaxFocalLossGradientOp<float, CPUContext>);

// Schema: inputs are logits, labels, and a normalizer scalar; outputs are the
// scalar loss and the softmax probabilities (reused by the gradient op).
OPERATOR_SCHEMA(SoftmaxFocalLoss)
    .NumInputs(3)
    .NumOutputs(2)
    .SetDoc(R"DOC(
A multiclass form of Focal Loss designed for use in RetinaNet-like models.
The input is assumed to be unnormalized scores (sometimes called 'logits')
arranged in a 4D tensor with shape (N, C, H, W), where N is the number of
elements in the batch, H and W are the height and width, and C = num_anchors *
num_classes. The softmax is applied num_anchors times along the C axis.
The softmax version of focal loss is:
FL(p_t) = -alpha * (1 - p_t)**gamma * log(p_t),
where p_i = exp(s_i) / sum_j exp(s_j), t is the target (ground truth) class, and
s_j is the unnormalized score for class j.
See: https://arxiv.org/abs/1708.02002 for details.
)DOC")
    .Arg(
        "scale",
        "(float) default 1.0; multiply the loss by this scale factor.")
    .Arg("alpha", "(float) default 0.25; Focal Loss's alpha hyper-parameter.")
    .Arg("gamma", "(float) default 1.0; Focal Loss's gamma hyper-parameter.")
    .Arg(
        "num_classes",
        "(int) default 81; number of classes in each softmax group.")
    .Input(
        0,
        "scores",
        "4D tensor of softmax inputs (called 'scores' or 'logits') with shape "
        "(N, C, H, W), where C = num_anchors * num_classes defines num_anchors "
        "groups of contiguous num_classes softmax inputs.")
    .Input(
        1,
        "labels",
        "4D tensor of labels with shape (N, num_anchors, H, W). Each entry is "
        "a class label in [0, num_classes - 1] (inclusive).")
    .Input(
        2,
        "normalizer",
        "Scalar; the loss is normalized by 1 / max(1, normalizer).")
    .Output(0, "loss", "Scalar loss.")
    .Output(
        1,
        "probabilities",
        "4D tensor of softmax probabilities with shape (N, C, H, W), where "
        "C = num_anchors * num_classes, and softmax was applied to each of the "
        "num_anchors groups; within a group the num_classes values sum to 1.");

// Gradient schema: reuses the forward's probabilities output (input 3) so the
// softmax does not need to be recomputed in the backward pass.
OPERATOR_SCHEMA(SoftmaxFocalLossGradient)
    .NumInputs(5)
    .NumOutputs(1)
    .Input(0, "scores", "See SoftmaxFocalLoss.")
    .Input(1, "labels", "See SoftmaxFocalLoss.")
    .Input(2, "normalizer", "See SoftmaxFocalLoss.")
    .Input(
        3,
        "probabilities",
        "Output 1 from SoftmaxFocalLoss; See SoftmaxFocalLoss.")
    .Input(4, "d_loss", "Gradient of forward output 0 (loss)")
    .Output(0, "d_scores", "Gradient of forward input 0 (scores)");

// Wires SoftmaxFocalLoss to its gradient: forwards the three inputs, the
// probabilities output O(1), and the loss gradient; only input 0 (scores)
// receives a gradient.
class GetSoftmaxFocalLossGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "SoftmaxFocalLossGradient",
        "",
        vector<string>{I(0), I(1), I(2), O(1), GO(0)},
        vector<string>{GI(0)});
  }
};

REGISTER_GRADIENT(SoftmaxFocalLoss, GetSoftmaxFocalLossGradient);

} // namespace caffe2

View File

@ -1,256 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/softmax_focal_loss_op.h"
namespace caffe2 {
namespace {

// Softmax over each group of num_classes channels, independently at every
// (n, a, y, x) location. One thread handles one location (a full group),
// using the standard max-subtraction trick for numerical stability.
__global__ void SpatialSoftmaxKernel(const int N, const int A,
    const int H, const int W, const float* Xdata, float* Pdata,
    const int num_classes) {
  CUDA_1D_KERNEL_LOOP(index, N * A * H * W) {
    int D = num_classes * A;
    // Decompose the linear index into (i, a, y, x); NCHW layout.
    int x = index % W;
    int y = (index / W) % H;
    int a = (index / (W * H)) % A;
    int i = index / W / H / A;

    // Subtract max on each cell for numerical reasons
    float max_val = -FLT_MAX;
    for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
      int idx = i * (H * W * D) + c * (H * W) + y * W + x;
      max_val = max(max_val, Xdata[idx]);
    }
    // Exponentiate
    float expsum = 0.0f;
    for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
      int idx = i * (H * W * D) + c * (H * W) + y * W + x;
      float expx = exp(Xdata[idx] - max_val);
      Pdata[idx] = expx;
      expsum += expx;
    }
    // Normalize
    for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
      int idx = i * (H * W * D) + c * (H * W) + y * W + x;
      Pdata[idx] /= expsum;
    }
  }
}

// Per-location focal loss: losses[i] = -z * (1 - p_t)^gamma * log(p_t),
// where z folds in alpha (for foreground, label >= 1) or 1 - alpha (for
// background, label == 0) divided by the normalizer Np. Locations with a
// negative label are ignored (loss 0).
__global__ void SoftmaxFocalLossKernel(
    const int N, const int A, const int H, const int W,
    const float* Pdata, const int* targets, float* losses,
    const float* weight_pos, const float gamma, const float alpha,
    const int num_classes) {
  CUDA_1D_KERNEL_LOOP(i, N * A * H * W) {
    int D = A * num_classes;
    int x = i % W;
    int y = (i / W) % H;
    int a = (i / (W * H)) % A;
    int n = i / (W * H * A);
    const int label = static_cast<int>(targets[i]);

    // Np = max(weight_pos, 1) guards against division by zero.
    float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
    float z = (label == 0) * (1 - alpha) / Np +
              (label >= 1) * alpha / Np;

    losses[i] = 0.0;
    if (label >= 0) {
      int offset = a * num_classes;
      int idx = n * (H * W * D) + (offset + label) * (H * W) + y * W + x;
      // FLT_MIN clamp avoids log(0).
      losses[i] =
          -(pow(1.0f - Pdata[idx], gamma) *
            log(c10::cuda::compat::max(Pdata[idx], FLT_MIN))) * z;
    }
  }
}

// Per-location scalar weight for the backward pass:
//   buff[i] = z * d/dp [ (1-p)^gamma * log(p) ] evaluated at p = p_t,
// i.e. the derivative of the focal term w.r.t. the target probability.
__global__ void SoftmaxFocalLossGradientWeightKernel(
    const int N, const int A, const int H, const int W,
    const float* Pdata, const int* targets, float* buff,
    const float* weight_pos, const float gamma, const float alpha,
    const int num_classes) {
  CUDA_1D_KERNEL_LOOP(i, N * A * H * W) {
    int D = A * num_classes;
    int x = i % W;
    int y = (i / W) % H;
    int a = (i / (W * H)) % A;
    int n = i / (W * H * A);
    const int label = static_cast<int>(targets[i]);
    float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
    float z = (label == 0) * (1 - alpha) / Np +
              (label >= 1) * alpha / Np;

    buff[i] = 0.0;
    if (label >= 0) {
      int offset = a * num_classes;
      int idx = n * (H * W * D) + (offset + label) * (H * W) + y * W + x;
      float onemp = 1. - Pdata[idx];
      float p = Pdata[idx];
      buff[i] =
          (-pow(onemp, gamma) +
          gamma * pow(onemp, gamma - 1) * p * log(c10::cuda::compat::max(p, FLT_MIN))) * z;
    }
  }
}

// Expands the per-location weight to a per-channel gradient using the
// softmax Jacobian: dX[n,d,y,x] = d_loss * buff * ((label == c) - P),
// zeroed where the location's label is negative (ignored).
__global__ void SoftmaxFocalLossGradientKernel(
    const int N, const int D, const int H, const int W,
    const float* Pdata, const int* targets, const float* buff,
    const float* d_loss_data, float* dX, const int num_classes) {
  CUDA_1D_KERNEL_LOOP(i, N * D * H * W) {
    int A = D / num_classes;
    int x = i % W;
    int y = (i / W) % H;
    int d = (i / (W * H)) % D;
    int a = d / num_classes;
    int c = d % num_classes;
    int n = i / (W * H * D);
    float d_loss = *d_loss_data;

    // ind indexes the (N, A, H, W) label/weight arrays for this channel.
    int ind = n * (H * W * A) + a * (H * W) + y * W + x;
    const int label = static_cast<int>(targets[ind]);

    float c1 = (label >= 0) * 1.0;
    float c2 = (label == c) * 1.0;
    dX[i] = 0.0;
    dX[i] = c1 * d_loss * buff[ind] * (c2 - Pdata[i]);
  }
}

} // namespace
// Forward pass on CUDA: grouped spatial softmax followed by the focal-loss
// reduction. Outputs the scalar loss and the softmax probabilities (the
// latter is consumed by the gradient op).
template <>
bool SoftmaxFocalLossOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Logits
  auto& T = Input(1); // Labels
  auto& wp = Input(2); // num of foreground
  // average loss as output
  // softmax probability, going to be re-used in gradient

  int N = X.dim32(0);
  int D = X.dim32(1);
  int H = X.dim32(2);
  int W = X.dim32(3);
  // A = number of anchor groups; each group holds num_classes_ channels.
  int A = D / num_classes_;

  ReinitializeTensor(&losses_, {N * A * H * W}, at::dtype<float>().device(CUDA));
  auto* P = Output(1, {N * D * H * W}, at::dtype<float>());
  auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
  math::Set<float, CUDAContext>(
      avg_loss->size(), 0.f, avg_loss->mutable_data<float>(), &context_);
  math::Set<float, CUDAContext>(
      P->size(), 0.f, P->mutable_data<float>(), &context_);
  math::Set<float, CUDAContext>(
      losses_.size(), 0.f, losses_.mutable_data<float>(), &context_);
  TORCH_DCHECK_EQ(X.ndim(), 4);

  const float* Xdata = X.data<float>();
  const float* Wdata = wp.data<float>();

  // Spatial Softmax Kernel
  SpatialSoftmaxKernel
      <<<CAFFE_GET_BLOCKS(N * A * H * W), CAFFE_CUDA_NUM_THREADS,
         0, context_.cuda_stream()>>>(
    N, A, H, W, Xdata, P->mutable_data<float>(), num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Compute loss for each x,y location
  const int* Tdata = T.data<int>();
  SoftmaxFocalLossKernel
      <<<CAFFE_GET_BLOCKS(N * A * H * W), CAFFE_CUDA_NUM_THREADS,
         0, context_.cuda_stream()>>>(
    N, A, H, W, P->data<float>(), Tdata, losses_.mutable_data<float>(),
    Wdata, gamma_, alpha_, num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // sum the losses
  float* avg_loss_data = avg_loss->mutable_data<float>();
  math::Sum<float, CUDAContext>(
      losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
  math::Scale<float, float, CUDAContext>(
      1, scale_, avg_loss_data, avg_loss_data, &context_);

  return true;
}
// Backward pass on CUDA: first computes a per-location weight (the focal-term
// derivative), then expands it to per-channel logit gradients via the softmax
// Jacobian, and finally applies the scale factor.
template<>
bool SoftmaxFocalLossGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Logits
  auto& T = Input(1); // Label
  auto& wp = Input(2); // num of foreground example
  auto& P = Input(3); // Softmax Probability
  auto& d_avg_loss = Input(4);

  int N = X.dim32(0);
  int D = X.dim32(1);
  int H = X.dim32(2);
  int W = X.dim32(3);
  int A = D / num_classes_;

  ReinitializeTensor(&buff_, {N * A * H * W}, at::dtype<float>().device(CUDA));

  auto* dX = Output(0, X.sizes(), at::dtype<float>()); // gradient wrt logits

  const float* Xdata = X.data<float>();
  const int* Tdata = T.data<int>();
  const float* Pdata = P.data<float>();
  const float* Wdata = wp.data<float>();

  // Compute the weight for gradients
  SoftmaxFocalLossGradientWeightKernel
      <<<CAFFE_GET_BLOCKS(N * A * H * W), CAFFE_CUDA_NUM_THREADS,
         0, context_.cuda_stream()>>>(
    N, A, H, W, Pdata, Tdata, buff_.mutable_data<float>(),
    Wdata, gamma_, alpha_, num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Compute the gradient with the weights
  const float* Bdata = buff_.data<float>();
  SoftmaxFocalLossGradientKernel
      <<<CAFFE_GET_BLOCKS(N * D * H * W), CAFFE_CUDA_NUM_THREADS,
         0, context_.cuda_stream()>>>(
    N, D, H, W, Pdata, Tdata, Bdata, d_avg_loss.data<float>(),
    dX->mutable_data<float>(), num_classes_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Apply the same scale factor used in the forward pass.
  math::Scale<float, float, CUDAContext>(
    dX->size(),
    scale_,
    dX->data<float>(),
    dX->mutable_data<float>(),
    &context_);
  return true;
}
// Register the CUDA specializations defined above.
REGISTER_CUDA_OPERATOR(SoftmaxFocalLoss,
                       SoftmaxFocalLossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SoftmaxFocalLossGradient,
                       SoftmaxFocalLossGradientOp<float, CUDAContext>);
} // namespace caffe2

View File

@ -1,91 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SOFTMAX_FOCAL_LOSS_OP_H_
#define SOFTMAX_FOCAL_LOSS_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Forward op for the softmax (multiclass) form of focal loss. NCHW only;
// only a CUDA specialization exists — the generic RunOnDevice aborts with
// CAFFE_NOT_IMPLEMENTED.
template <typename T, class Context>
class SoftmaxFocalLossOp final : public Operator<Context> {
 public:
  SoftmaxFocalLossOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        // "scale": multiplies the final loss (must be >= 0).
        scale_(this->template GetSingleArgument<float>("scale", 1.)),
        // "gamma"/"alpha": focal-loss hyper-parameters.
        gamma_(this->template GetSingleArgument<float>("gamma", 1.)),
        alpha_(this->template GetSingleArgument<float>("alpha", 0.25)),
        // "num_classes": classes per softmax group (default 81).
        num_classes_(this->template GetSingleArgument<int>("num_classes", 81)),
        order_(StringToStorageOrder(
            this->template GetSingleArgument<string>("order", "NCHW"))) {
    CAFFE_ENFORCE(scale_ >= 0);
    CAFFE_ENFORCE_EQ(
        order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float scale_;
  float gamma_;
  float alpha_;
  int num_classes_;
  StorageOrder order_;
  // Per-location loss scratch buffer used by the CUDA implementation.
  Tensor losses_;
};
// Gradient op paired with SoftmaxFocalLossOp; reads identical arguments so
// backward matches forward. NCHW only; CUDA specialization only.
template <typename T, class Context>
class SoftmaxFocalLossGradientOp final : public Operator<Context> {
 public:
  SoftmaxFocalLossGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        scale_(this->template GetSingleArgument<float>("scale", 1.)),
        gamma_(this->template GetSingleArgument<float>("gamma", 1.)),
        alpha_(this->template GetSingleArgument<float>("alpha", 0.25)),
        num_classes_(this->template GetSingleArgument<int>("num_classes", 81)),
        order_(StringToStorageOrder(
            this->template GetSingleArgument<string>("order", "NCHW"))) {
    CAFFE_ENFORCE(scale_ >= 0);
    CAFFE_ENFORCE_EQ(
        order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  float scale_;
  float gamma_;
  float alpha_;
  int num_classes_;
  StorageOrder order_;
  // Per-location gradient-weight scratch buffer used by the CUDA implementation.
  Tensor buff_;
};
} // namespace caffe2
#endif // SOFTMAX_FOCAL_LOSS_OP_H_

View File

@ -1,79 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "spatial_narrow_as_op.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(SpatialNarrowAs, SpatialNarrowAsOp<CPUContext>);
REGISTER_CPU_OPERATOR(
SpatialNarrowAsGradient,
SpatialNarrowAsGradientOp<CPUContext>);
OPERATOR_SCHEMA(SpatialNarrowAs)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
Reduces ("narrows") the spatial extent of A to that of B by removing rows and
columns from the bottom and right.
)DOC")
.Input(
0,
"A",
"3D or 4D input of shape (N, H0, W0) or (N, C, H0, W0).")
.Input(
1,
"B",
"3D or 4D input of shape (N, H1, W1) or (N, C, H1, W1), where H1 <= H0 "
"and W1 <= W0.")
.Output(
0,
"C",
"Sub window of A containing rows [0, H1 - 1] (inclusive) and columns "
"[0, W1 - 1] (inclusive).");
OPERATOR_SCHEMA(SpatialNarrowAsGradient)
.NumInputs(3)
.NumOutputs(1)
.Input(
0,
"A",
"See SpatialNarrowAs.")
.Input(
1,
"B",
"See SpatialNarrowAs.")
.Input(
2,
"dC",
"Gradient of forward output 0 (C).")
.Output(
0,
"dA",
"Gradient of forward input 0 (A)");
class SpatialNarrowAsGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
"SpatialNarrowAsGradient", "",
vector<string>{I(0), I(1), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(SpatialNarrowAs, SpatialNarrowAsGradient);
} // namespace caffe2

View File

@ -1,165 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"
#include "modules/detectron/spatial_narrow_as_op.h"
namespace caffe2 {
namespace {

// Copies the top-left (out_H x out_W) sub-window of each (N, C) plane from
// the larger (in_H x in_W) input into the smaller output. One thread per
// output element.
template <typename T>
__global__ void CopyKernel(
    const int N,
    const int C,
    const int in_H,
    const int in_W,
    const int out_H,
    const int out_W,
    const T* in_data,
    T* out_data) {
  CUDA_1D_KERNEL_LOOP(index, N * C * out_H * out_W) {
    // Decompose the linear output index into (n, c, h, w).
    int w = index % out_W;
    int h = (index / out_W) % out_H;
    int c = (index / out_W / out_H) % C;
    int n = (index / out_W / out_H / C);
    int in_index = n * C * in_H * in_W + c * in_H * in_W + h * in_W + w;
    int out_index = n * C * out_H * out_W + c * out_H * out_W + h * out_W + w;
    out_data[out_index] = in_data[in_index];
  }
}

// Reverse of CopyKernel: scatters the small (in_H x in_W) gradient into the
// top-left corner of the larger (out_H x out_W) output. One thread per input
// element; the output is pre-zeroed by the caller, so untouched elements
// (bottom/right margin) stay zero.
template <typename T>
__global__ void CopyGradientKernel(
    const int N,
    const int C,
    const int in_H,
    const int in_W,
    const int out_H,
    const int out_W,
    const T* in_data,
    T* out_data) {
  CUDA_1D_KERNEL_LOOP(index, N * C * in_H * in_W) {
    int w = index % in_W;
    int h = (index / in_W) % in_H;
    int c = (index / in_W / in_H) % C;
    int n = (index / in_W / in_H / C);
    int in_index = n * C * in_H * in_W + c * in_H * in_W + h * in_W + w;
    int out_index = n * C * out_H * out_W + c * out_H * out_W + h * out_W + w;
    out_data[out_index] = in_data[in_index];
  }
}

} // namespace
// Type dispatch: supports float and int32 inputs, selected by Input(0)'s dtype.
template <>
bool SpatialNarrowAsOp<CUDAContext>::RunOnDevice() {
  return DispatchHelper<TensorTypes<float_t, int32_t>>::call(this, Input(0));
}
// Forward: output C is the top-left sub-window of A with B's spatial shape.
// Accepts A as 4D (N, C, H, W) with B either 4D or 3D (N, H, W).
template <>
template <typename T>
bool SpatialNarrowAsOp<CUDAContext>::DoRunWithType() {
  // Narrows input 0 (A) spatially to match input 1 (B)
  auto& A = Input(0);
  auto& B = Input(1);
  CAFFE_ENFORCE_EQ(A.dim32(0), B.dim32(0), "Input dim 0 must be equal.");
  std::vector<int64_t> sizes;
  if (A.ndim() == B.ndim()) {
    CAFFE_ENFORCE_EQ(A.dim32(1), B.dim32(1), "Input dim 1 must be equal.");
    CAFFE_ENFORCE_GE(
        A.dim32(2), B.dim32(2), "Input 0 height must be >= input 1 height.");
    CAFFE_ENFORCE_GE(
        A.dim32(3), B.dim32(3), "Input 0 width must be >= input 1 width.");
    sizes = B.sizes().vec();
  } else {
    // For (N, H, W) case: B lacks a channel dim, so take C from A and the
    // spatial extent from B's dims 1 and 2.
    CAFFE_ENFORCE_EQ(A.ndim() - 1, B.ndim(), "Dimension mismatch.");
    CAFFE_ENFORCE_GE(
        A.dim32(2), B.dim32(1), "Input 0 height must be >= input 1 height.");
    CAFFE_ENFORCE_GE(
        A.dim32(3), B.dim32(2), "Input 0 width must be >= input 1 width.");
    sizes = {A.dim32(0), A.dim32(1), B.dim32(1), B.dim32(2)};
  }
  auto* C = Output(0, sizes, at::dtype<T>());
  int out_width = C->dim32(3);
  int out_height = C->dim32(2);
  int in_width = A.dim32(3);
  int in_height = A.dim32(2);
  CopyKernel<T><<<
      CAFFE_GET_BLOCKS(C->size()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      C->dim32(0),
      C->dim32(1),
      in_height,
      in_width,
      out_height,
      out_width,
      A.template data<T>(),
      C->template mutable_data<T>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}
// Type dispatch: supports float and int32 inputs, selected by Input(0)'s dtype.
template <>
bool SpatialNarrowAsGradientOp<CUDAContext>::RunOnDevice() {
  return DispatchHelper<TensorTypes<float_t, int32_t>>::call(this, Input(0));
}
// Backward: dA has A's shape; zero it, then scatter dC into its top-left
// corner (the region CopyKernel read from in the forward pass).
template <>
template <typename T>
bool SpatialNarrowAsGradientOp<CUDAContext>::DoRunWithType() {
  auto& A = Input(0);
  auto& B = Input(1);
  auto& dC = Input(2); // Gradient of net w.r.t. output of forward op
  auto* dA = Output(0, A.sizes(), at::dtype<T>()); // Gradient of net w.r.t. input to forward op

  // Zero-fill so the cropped-away margin keeps zero gradient.
  math::Set<T, CUDAContext>(
      dA->size(), 0.f, dA->template mutable_data<T>(), &context_);
  int out_width = dA->dim32(3);
  int out_height = dA->dim32(2);
  int in_width = dC.dim32(3);
  int in_height = dC.dim32(2);
  CopyGradientKernel<T><<<
      CAFFE_GET_BLOCKS(dC.size()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      dA->dim32(0),
      dA->dim32(1),
      in_height,
      in_width,
      out_height,
      out_width,
      dC.template data<T>(),
      dA->template mutable_data<T>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}
// Register the CUDA specializations defined above.
REGISTER_CUDA_OPERATOR(SpatialNarrowAs, SpatialNarrowAsOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(
    SpatialNarrowAsGradient,
    SpatialNarrowAsGradientOp<CUDAContext>);
} // namespace caffe2

View File

@ -1,63 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SPATIAL_NARROW_AS_OP_H_
#define SPATIAL_NARROW_AS_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Forward op: narrows input A spatially to match another tensor's extent.
// Only a CUDA implementation exists (DoRunWithType is defined in the .cu
// file); running on CPU aborts via CAFFE_NOT_IMPLEMENTED.
template <class Context>
class SpatialNarrowAsOp final : public Operator<Context> {
 public:
  SpatialNarrowAsOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws) {}
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_DISPATCH_HELPER;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  // Type-dispatched implementation, provided per-device elsewhere.
  template <typename T>
  bool DoRunWithType();
};
// Gradient of SpatialNarrowAsOp. Like the forward op, only a CUDA
// implementation exists; the CPU path intentionally aborts.
template <class Context>
class SpatialNarrowAsGradientOp final : public Operator<Context> {
 public:
  SpatialNarrowAsGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws) {}
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_DISPATCH_HELPER;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  // Type-dispatched implementation, provided per-device elsewhere.
  template <typename T>
  bool DoRunWithType();
};
} // namespace caffe2
#endif // SPATIAL_NARROW_AS_OP_H_

View File

@ -1,83 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "upsample_nearest_op.h"
#ifdef USE_MKLDNN
#include "caffe2/ideep/operators/operator_fallback_ideep.h"
#include "caffe2/ideep/utils/ideep_operator.h"
#endif
namespace caffe2 {
#ifdef USE_MKLDNN
REGISTER_IDEEP_OPERATOR(
UpsampleNearest,
IDEEPFallbackOp<UpsampleNearestOp<float, CPUContext>>);
#endif
REGISTER_CPU_OPERATOR(UpsampleNearest, UpsampleNearestOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
UpsampleNearestGradient,
UpsampleNearestGradientOp<float, CPUContext>);
// Schema for the nearest-neighbor upsampling forward op.
// Fix: the Output doc string misspelled "nearest" as "neareast"; this text is
// surfaced to users in generated operator documentation.
OPERATOR_SCHEMA(UpsampleNearest)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Nearest neighbor upsampling operation. Implementation taken from THCUNN.
)DOC")
    .Arg(
        "scale",
        "(int) default 2; integer upsampling factor.")
    .Input(
        0,
        "X",
        "4D feature map input of shape (N, C, H, W).")
    .Output(
        0,
        "Y",
        "4D feature map of shape (N, C, scale * H, scale * W); Values are "
        "nearest neighbor samples from X.");
OPERATOR_SCHEMA(UpsampleNearestGradient)
.NumInputs(2)
.NumOutputs(1)
.Input(
0,
"X",
"See UpsampleNearest.")
.Input(
1,
"dY",
"Gradient of forward output 0 (Y).")
.Output(
0,
"dX",
"Gradient of forward input 0 (X).");
// Gradient maker: wires UpsampleNearestGradient to take the forward input X
// (I(0)) and the output gradient dY (GO(0)), producing dX (GI(0)).
class GetUpsampleNearestGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "UpsampleNearestGradient",
        "",
        vector<string>{I(0), GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(UpsampleNearest, GetUpsampleNearestGradient);
} // namespace caffe2

View File

@ -1,223 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Adapted from https://github.com/torch/cunn/blob/master/lib/THCUNN/SpatialUpSamplingNearest.cu
*
* Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
* Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
* Copyright (c) 2011-2013 NYU (Clement Farabet)
* Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert,
* Leon Bottou, Iain Melvin, Jason Weston)
* Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
* Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
* Samy Bengio, Johnny Mariethoz)
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the names of NEC Laboratories American and IDIAP Research
* Institute nor the names of its contributors may be used to endorse or
* promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/upsample_nearest_op.h"
namespace caffe2 {
namespace {
// Maps a flat index over the upsampled (x, d1, d2, d3) layout to the flat
// index of its nearest-neighbor source element in the input, whose last two
// extents are d2/scale_factor and d3/scale_factor.
__device__ int translate_idx(int ii, int d1, int d2, int d3, int scale_factor) {
  const int w = ii % d3;
  const int z = (ii / d3) % d2;
  const int y = (ii / d3 / d2) % d1;
  const int x = ii / d3 / d2 / d1;
  const int src_d2 = d2 / scale_factor;
  const int src_d3 = d3 / scale_factor;
  const int src_w = w / scale_factor;
  const int src_z = z / scale_factor;
  return (((x * d1 + y) * src_d2) + src_z) * src_d3 + src_w;
}
// Inverse mapping: given a flat index over the input (x, d1, d2, d3) layout,
// returns the flat index of the upsampled element at sub-pixel offset
// (off_y, off_x) within the scale_factor x scale_factor block it feeds.
__device__ int translate_idx_inv(
    int ii, int d1, int d2, int d3, int scale_factor, int off_x, int off_y) {
  const int w = ii % d3;
  const int z = (ii / d3) % d2;
  const int y = (ii / d3 / d2) % d1;
  const int x = ii / d3 / d2 / d1;
  const int dst_d2 = d2 * scale_factor;
  const int dst_d3 = d3 * scale_factor;
  const int dst_w = w * scale_factor + off_x;
  const int dst_z = z * scale_factor + off_y;
  return (((x * d1 + y) * dst_d2) + dst_z) * dst_d3 + dst_w;
}
// Forward kernel: each thread computes one output element by gathering its
// nearest-neighbor source element. The 2-D grid is flattened into a single
// linear index `ii`; threads past the end simply return.
__global__ void upscale(const float *input, float *output, long no_elements,
                        int scale_factor, int d1, int d2, int d3) {
  long ii = threadIdx.x + blockDim.x * blockIdx.x;
  ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
  if (ii >= no_elements) return;
  int ipidx = translate_idx(ii, d1, d2, d3, scale_factor);
  output[ii]=input[ipidx];
}
// Backward kernel: each thread owns one input-gradient element and
// accumulates the scale_factor x scale_factor block of output gradients that
// the corresponding forward element was replicated into. gradInput_data must
// be zeroed by the caller (uses += without initialization here).
__global__ void downscale(float *gradInput_data, const float *gradOutput_data,
                          long no_elements, int scale_factor, int d1, int d2,
                          int d3) {
  long ii = threadIdx.x + blockDim.x * blockIdx.x;
  ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
  if (ii >= no_elements) return;
  for (int i=0; i < scale_factor; i++){
    for(int j=0; j < scale_factor; j++){
      int ipidx = translate_idx_inv(ii, d1, d2, d3, scale_factor, i, j);
      gradInput_data[ii] += gradOutput_data[ipidx];
    }
  }
}
} // namespace
// CUDA forward: resizes Y to X with the last two (spatial) dims scaled by
// scale_, then launches `upscale` with one thread per output element.
template<>
bool UpsampleNearestOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto* Y = Output(0);
  vector<int64_t> out_shape;
  for (int i = 0; i < X.ndim(); ++i) {
    out_shape.push_back(X.dim32(i));
  }
  out_shape[X.ndim() - 1] *= scale_;
  out_shape[X.ndim() - 2] *= scale_;
  Y->Resize(out_shape);

  // Collapse the output into a (d1, d2, d3) view for the kernel's index math.
  int d1;
  int d2;
  int d3;
  if (X.ndim() == 3) {
    d1 = Y->dim32(0);
    d2 = Y->dim32(1);
    d3 = Y->dim32(2);
  } else {
    d1 = Y->dim32(1);
    d2 = Y->dim32(2);
    d3 = Y->dim32(3);
  }
  long no_elements = Y->size();

  const float *input_data = X.data<float>();
  float *output_data = Y->mutable_data<float>();

  // cuda blocks & threads:
  long nthreads = 256;
  // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
  // 65535 for SM 2.x, 2^32 -1 for >= 3.0
  // TODO: When we move to SM 3.5 we should update this
  // Spill overflow past 65535 x-blocks into the grid's y dimension.
  long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
  long n_yblocks = (long)ceil(
      (float)no_elements / (float)(n_xblocks * nthreads));
  CAFFE_ENFORCE(n_yblocks <= 65535);
  dim3 blocks(n_xblocks, n_yblocks);
  dim3 threads(nthreads);

  upscale<<<blocks, threads, 0, context_.cuda_stream()>>>(
      input_data, output_data, no_elements, scale_, d1, d2, d3);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}
// CUDA backward: zero-fills dX (shaped like X) and launches `downscale`,
// which sums each output-gradient block back into its source element.
template<>
bool UpsampleNearestGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);  // Original input to "forward" op
  auto& dY = Input(1);  // Gradient of net w.r.t. output of "forward" op
                        // (aka "gradOutput")
  auto* dX = Output(0);  // Gradient of net w.r.t. input to "forward" op
                         // (aka "gradInput")

  dX->ResizeLike(X);
  float *gradInput_data = dX->mutable_data<float>();
  const float *gradOutput_data = dY.data<float>();

  // Collapse dX into a (d1, d2, d3) view for the kernel's index math.
  int d1;
  int d2;
  int d3;
  if (dX->ndim() == 3) {
    d1 = dX->dim32(0);
    d2 = dX->dim32(1);
    d3 = dX->dim32(2);
  } else {
    d1 = dX->dim32(1);
    d2 = dX->dim32(2);
    d3 = dX->dim32(3);
  }
  long no_elements = dX->size();

  // cuda blocks & threads:
  long nthreads = 256;
  // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
  // 65535 for SM 2.x, 2^32 -1 for >= 3.0
  // TODO: When we move to SM 3.5 we should update this
  long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
  long n_yblocks = (long)ceil(
      (float)no_elements / (float)(n_xblocks * nthreads));
  CAFFE_ENFORCE(n_yblocks <= 65535);
  dim3 blocks(n_xblocks, n_yblocks);
  dim3 threads(nthreads);

  // downscale accumulates with +=, so dX must start at zero.
  math::Set<float, CUDAContext>(no_elements, 0.f, gradInput_data, &context_);
  downscale<<<blocks, threads, 0, context_.cuda_stream()>>>(
      gradInput_data, gradOutput_data, no_elements, scale_, d1, d2, d3);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}
REGISTER_CUDA_OPERATOR(UpsampleNearest,
UpsampleNearestOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(UpsampleNearestGradient,
UpsampleNearestGradientOp<float, CUDAContext>);
} // namespace caffe2

View File

@ -1,106 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef UPSAMPLE_NEAREST_OP_H_
#define UPSAMPLE_NEAREST_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Nearest-neighbor upsampling, CPU implementation. Scales the last two
// (spatial) dims of the input by the integer `scale` argument (default 2);
// each output pixel copies the input pixel at (row/scale, col/scale).
template <typename T, class Context>
class UpsampleNearestOp final : public Operator<Context> {
 public:
  UpsampleNearestOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        scale_(this->template GetSingleArgument<int>("scale", 2)) {
    TORCH_DCHECK_GE(scale_, 1);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    auto& X = Input(0);
    auto out_shape = X.sizes().vec();
    out_shape[X.dim() - 1] *= scale_;
    out_shape[X.dim() - 2] *= scale_;
    auto* Y = Output(0, out_shape, at::dtype<T>());

    // Collapse Y into a (d1, d2, d3) view; all leading dims fold into d1.
    int d1;
    int d2;
    int d3;
    if (X.dim() == 3) {
      d1 = Y->dim32(0);
      d2 = Y->dim32(1);
      d3 = Y->dim32(2);
    } else {
      d1 = Y->dim32(0) * Y->dim32(1);
      d2 = Y->dim32(2);
      d3 = Y->dim32(3);
    }

    const T *input_data = X.template data<T>();
    T *output_data = Y->template mutable_data<T>();
    int scaled_d2 = d2 / scale_;
    int scaled_d3 = d3 / scale_;

// Each (i, j, u) output element is written exactly once, so the outer loop
// parallelizes safely.
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (int i = 0; i < d1; ++i) {
      for (int j = 0; j < d2; ++j) {
        for (int u = 0; u < d3; ++u) {
          int ii = (i * d2 + j) * d3 + u;
          int scaled_u = u / scale_;
          int scaled_j = j / scale_;
          int ipidx = ((i * scaled_d2) + scaled_j) * scaled_d3 + scaled_u;
          output_data[ii] = input_data[ipidx];
        }
      }
    }

    return true;
  }

 protected:
  // Integer upsampling factor; validated >= 1 in the constructor.
  int scale_;
};
// Gradient of UpsampleNearestOp. Only the CUDA specialization (in the .cu
// file) is implemented; the generic CPU path intentionally aborts.
template <typename T, class Context>
class UpsampleNearestGradientOp final : public Operator<Context> {
 public:
  UpsampleNearestGradientOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        scale_(this->template GetSingleArgument<int>("scale", 2)) {
    TORCH_DCHECK_GE(scale_, 1);
  }
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    // No CPU implementation for now
    CAFFE_NOT_IMPLEMENTED;
  }

 protected:
  // Integer upsampling factor; must match the forward op's value.
  int scale_;
};
} // namespace caffe2
#endif // UPSAMPLE_NEAREST_OP_H_

View File

@ -1,42 +0,0 @@
import unittest
import caffe2.python.hypothesis_test_util as hu
import hypothesis.strategies as st
import numpy as np
from caffe2.python import core, dyndep
from hypothesis import given, settings
dyndep.InitOpsLibrary("@/caffe2/modules/detectron:detectron_ops")
class TestUpsampleNearestOp(hu.HypothesisTestCase):
    @given(
        N=st.integers(1, 3),
        H=st.integers(10, 300),
        W=st.integers(10, 300),
        scale=st.integers(1, 3),
        **hu.gcs,
    )
    @settings(deadline=None, max_examples=20)
    def test_upsample_nearest_op(self, N, H, W, scale, gc, dc):
        # Check the UpsampleNearest operator against a NumPy reference that
        # gathers each output pixel from its nearest-neighbor source pixel.
        num_channels = 32
        X = np.random.randn(N, num_channels, H, W).astype(np.float32)
        op = core.CreateOperator("UpsampleNearest", ["X"], ["Y"], scale=scale)

        def ref(X):
            # Index grids over the upsampled spatial extent, mapped back to
            # input coordinates by integer division with `scale`.
            row_idxs, col_idxs = np.meshgrid(
                np.arange(H * scale), np.arange(W * scale), indexing="ij"
            )
            src_rows = (row_idxs / scale).astype(np.int32)
            src_cols = (col_idxs / scale).astype(np.int32)
            return [X[:, :, src_rows, src_cols]]

        self.assertReferenceChecks(device_option=gc, op=op, inputs=[X], reference=ref)
if __name__ == "__main__":
unittest.main()

View File

@ -1,23 +0,0 @@
# Builds caffe2_module_test_dynamic, a small shared library loaded at runtime
# by the test suite to verify dynamic module registration.
if(NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
  # If we are building the standalone module, we set the proper cmake variables.
  cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
  find_package(Caffe2 REQUIRED)
  set(BUILD_TEST ON)
  option(BUILD_SHARED_LIBS "Build shared libs." ON)
endif()

# The lite interpreter build does not carry the module-loading machinery this
# test exercises, so skip the target there.
if(BUILD_TEST AND NOT BUILD_LITE_INTERPRETER)
  add_library(
      caffe2_module_test_dynamic
      ${CMAKE_CURRENT_SOURCE_DIR}/module_test_dynamic.cc)
  if(HAVE_SOVERSION)
    set_target_properties(caffe2_module_test_dynamic PROPERTIES
        VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
  endif()
  target_link_libraries(caffe2_module_test_dynamic torch_library)
  install(TARGETS caffe2_module_test_dynamic DESTINATION lib)
  # MSVC keeps debug info in separate .pdb files; ship them when present.
  if(MSVC AND BUILD_SHARED_LIBS)
    install(FILES $<TARGET_PDB_FILE:caffe2_module_test_dynamic> DESTINATION lib OPTIONAL)
  endif()
endif()

View File

@ -1,41 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "caffe2/core/module.h"
#include "caffe2/core/operator.h"
// An explicitly defined module, testing correctness when we dynamically link a
// module
CAFFE2_MODULE(caffe2_module_test_dynamic, "Dynamic module only used for testing.");
namespace caffe2 {
// No-op operator registered by the dynamically loaded test module; its only
// purpose is to prove that operator registration works across a dlopen'd
// module boundary.
class Caffe2ModuleTestDynamicDummyOp : public OperatorBase {
 public:
  using OperatorBase::OperatorBase;
  bool Run(int /* unused */ /*stream_id*/) override {
    return true;
  }
  virtual string type() {
    return "base";
  }
};
REGISTER_CPU_OPERATOR(
Caffe2ModuleTestDynamicDummy, Caffe2ModuleTestDynamicDummyOp);
OPERATOR_SCHEMA(Caffe2ModuleTestDynamicDummy);
} // namespace caffe2

View File

@ -1,32 +0,0 @@
# Builds the caffe2_observers library (performance observers / reporters).
# Supports two modes: as part of the main PyTorch build (gated on
# USE_OBSERVERS) or as a standalone project against an installed Caffe2.
if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
  if(NOT USE_OBSERVERS)
    return()
  endif()
else()
  cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
  project(caffe2_observers CXX)
  find_package(Caffe2 REQUIRED)
  option(BUILD_SHARED_LIBS "Build shared libs." ON)
endif()

add_library(caffe2_observers
    "${CMAKE_CURRENT_SOURCE_DIR}/net_observer_reporter_print.cc"
    "${CMAKE_CURRENT_SOURCE_DIR}/observer_config.cc"
    "${CMAKE_CURRENT_SOURCE_DIR}/perf_observer.cc"
)
if(HAVE_SOVERSION)
  set_target_properties(caffe2_observers PROPERTIES
      VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
endif()
target_link_libraries(caffe2_observers PUBLIC torch_library)
target_include_directories(caffe2_observers PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/..)
# Switches CAFFE2_OBSERVER_API (observers/macros.h) to export mode.
target_compile_options(caffe2_observers PRIVATE "-DCAFFE2_BUILD_OBSERVER_LIB")
install(TARGETS caffe2_observers DESTINATION lib)
caffe2_interface_library(caffe2_observers caffe2_observers_library)
if(MSVC AND BUILD_SHARED_LIBS)
  install(FILES $<TARGET_PDB_FILE:caffe2_observers> DESTINATION lib OPTIONAL)
endif()
# Expose the interface library to the parent scope so the main build links it.
if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
  set(Caffe2_MODULES ${Caffe2_MODULES} caffe2_observers_library PARENT_SCOPE)
endif()

View File

@ -1,7 +0,0 @@
#include "c10/macros/Macros.h"

// CAFFE2_OBSERVER_API marks symbols crossing the observers library boundary:
// export (dllexport/default visibility) when building the library itself
// (CAFFE2_BUILD_OBSERVER_LIB is set by its CMakeLists), import otherwise.
#ifdef CAFFE2_BUILD_OBSERVER_LIB
#define CAFFE2_OBSERVER_API C10_EXPORT
#else
#define CAFFE2_OBSERVER_API C10_IMPORT
#endif

View File

@ -1,38 +0,0 @@
#pragma once
#include <map>
#include "caffe2/core/common.h"
#include "caffe2/core/net.h"
#include "observers/macros.h"
namespace caffe2 {
// Per-net or per-operator performance record passed to a NetObserverReporter.
struct PerformanceInformation {
  // Analytic (statically derived from the op definition, not measured)
  int64_t flops = 0;
  int64_t bytes_written = 0;
  int64_t bytes_read = 0;
  std::vector<TensorShape> tensor_shapes = {};
  std::vector<Argument> args = {};
  std::string engine = ""; // the engine used
  std::string type = ""; // the type of the operator
  // Measured
  double latency = 0;         // wall-clock time, milliseconds
  double cpuMilliseconds = 0; // process CPU time, milliseconds
};
// Abstract sink for performance data collected by PerfNetObserver; concrete
// implementations decide where the metrics go (e.g. log lines).
class CAFFE2_OBSERVER_API NetObserverReporter {
 public:
  virtual ~NetObserverReporter() = default;

  /*
    Report the delay metric collected by the observer.
    The delays are saved in a map. The key is an identifier associated
    with the reported delay. The value is the delay value in float
  */
  virtual void report(
      NetBase* net,
      std::map<std::string, PerformanceInformation>&) = 0;
};
}

View File

@ -1,158 +0,0 @@
#include "observers/net_observer_reporter_print.h"
#include <algorithm>
#include <sstream>
#include "caffe2/core/init.h"
#include "observers/observer_config.h"
#include <c10/util/irange.h>
namespace caffe2 {
const std::string NetObserverReporterPrint::IDENTIFIER = "Caffe2Observer ";
static std::string get_op_args(PerformanceInformation p);
static std::string get_tensor_shapes(PerformanceInformation p);
static std::string sanatize(std::string json_s);
// Emits one LOG(INFO) line per metric in a hand-assembled JSON-like format,
// prefixed with IDENTIFIER so downstream tooling can grep for the records.
void NetObserverReporterPrint::report(
    NetBase* net,
    std::map<std::string, PerformanceInformation>& info) {
  // Not allowed to use json library
  std::vector<std::map<std::string, std::string>> caffe2_perf;

  for (auto& p : info) {
    // The "NET_DELAY"-only case means operator-level metrics were not
    // collected this run; emit net-level latency and cpu_percent.
    if ((p.first == "NET_DELAY") && (info.size() == 1)) {
      // for Net_delay perf
      caffe2_perf.push_back({{"type", "NET"},
                             {"value", c10::to_string(p.second.latency * 1000)},
                             {"unit", "us"},
                             {"metric", "latency"}});
      caffe2_perf.push_back({{"type", "NET_"},
                             {
                                 "value",
                                 c10::to_string(
                                     p.second.cpuMilliseconds /
                                     p.second.latency *
                                     100),
                             },
                             {"unit", "percent"},
                             {"metric", "cpu_percent"}});
    } else if (p.first != "NET_DELAY") {
      // for operator perf
      std::string shape_str = get_tensor_shapes(p.second);
      std::string args_str = get_op_args(p.second);

      std::string type = p.first;
      caffe2_perf.push_back({{"type", type},
                             {"value", c10::to_string(p.second.latency * 1000)},
                             {"unit", "us"},
                             {"metric", "latency"}});
      caffe2_perf.push_back({{"type", type},
                             {
                                 "value",
                                 c10::to_string(
                                     p.second.cpuMilliseconds /
                                     p.second.latency *
                                     100),
                             },
                             {"unit", "percent"},
                             {"metric", "cpu_percent"}});
      // flops / shapes / args entries are optional and only emitted when
      // populated, to keep the log volume down.
      if (p.second.flops > 0) {
        caffe2_perf.push_back({{"type", type},
                               {"value", c10::to_string(p.second.flops)},
                               {"unit", "flop"},
                               {"metric", "flops"}});
      }
      if (shape_str != "") {
        caffe2_perf.push_back({{"type", type},
                               {"info_string", shape_str},
                               {"unit", ""},
                               {"metric", "tensor_shapes"}});
      }
      if (args_str != "") {
        caffe2_perf.push_back({{"type", type},
                               {"info_string", args_str},
                               {"unit", ""},
                               {"metric", "op_args"}});
      }
    }
  }

  // Serialize each record as a single one-line pseudo-JSON object, sanitizing
  // values so embedded quotes/backslashes cannot break the format.
  // NOLINTNEXTLINE(modernize-loop-convert)
  for (auto it = caffe2_perf.begin(); it != caffe2_perf.end(); it++) {
    std::stringstream buffer;
    auto entry = *it;
    buffer << IDENTIFIER << "{";
    // NOLINTNEXTLINE(modernize-raw-string-literal)
    buffer << "\"type\": \"" << sanatize(entry["type"]) << "\","
           // NOLINTNEXTLINE(modernize-raw-string-literal)
           << "\"unit\": \"" << sanatize(entry["unit"]) << "\","
           // NOLINTNEXTLINE(modernize-raw-string-literal)
           << "\"metric\": \"" << sanatize(entry["metric"]) << "\",";
    if (entry.find("value") != entry.end()) {
      // NOLINTNEXTLINE(modernize-raw-string-literal)
      buffer << "\"value\": \"" << sanatize(entry["value"]) << "\"";
    } else if (entry.find("info_string") != entry.end()) {
      // NOLINTNEXTLINE(modernize-raw-string-literal)
      buffer << "\"info_string\": \"" << sanatize(entry["info_string"]) << "\"";
    }
    buffer << "}";
    LOG(INFO) << buffer.str();
  }
}
// Renders p.tensor_shapes as a nested bracket list, e.g. "[[2, 3, ], ], ]"
// style output with trailing ", " separators; returns "" when there are no
// shapes so the caller can skip the metric entirely.
static std::string get_tensor_shapes(PerformanceInformation p) {
  if (p.tensor_shapes.empty()) {
    return "";
  }
  std::stringstream out;
  out << "[";
  for (const auto& shape : p.tensor_shapes) {
    out << "[";
    for (int j = 0; j < shape.dims_size(); j++) {
      out << shape.dims(j) << ", ";
    }
    out << "], ";
  }
  out << "]";
  return out.str();
}
// Renders p.args as "[{name: value}, ...]" with trailing ", " separators,
// picking the first populated field of each Argument (int, string, netdef
// pointer, float) and "None" otherwise; returns "" when there are no args.
static std::string get_op_args(PerformanceInformation p) {
  if (p.args.empty()) {
    return "";
  }
  std::stringstream out;
  out << "[";
  for (const auto& arg : p.args) {
    out << "{" << arg.name() << ": ";
    if (arg.has_i()) {
      out << arg.i();
    } else if (arg.has_s()) {
      out << arg.s();
    } else if (arg.has_n()) {
      // Nested-net arguments are identified by address only.
      out << &arg.n();
    } else if (arg.has_f()) {
      out << arg.f();
    } else {
      out << "None";
    }
    out << "}, ";
  }
  out << "]";
  return out.str();
}
// Strips characters that would corrupt the hand-built JSON-like log line:
// embedded double quotes and backslashes. Takes the string by value and
// returns the cleaned copy.
static std::string sanatize(std::string json_s) {
  auto is_illegal = [](char c) { return c == '"' || c == '\\'; };
  json_s.erase(
      std::remove_if(json_s.begin(), json_s.end(), is_illegal), json_s.end());
  return json_s;
}
}

View File

@ -1,16 +0,0 @@
#pragma once
#include "observers/macros.h"
#include "observers/net_observer_reporter.h"
#include "caffe2/core/common.h"
namespace caffe2 {
// Reporter that serializes performance records to LOG(INFO), one line per
// metric, each prefixed with IDENTIFIER for easy grepping by tooling.
class CAFFE2_OBSERVER_API NetObserverReporterPrint : public NetObserverReporter {
 public:
  static const std::string IDENTIFIER;
  void report(NetBase* net, std::map<std::string, PerformanceInformation>&) override;
};
} // namespace caffe2

View File

@ -1,12 +0,0 @@
#include "observers/observer_config.h"

namespace caffe2 {

// Static storage for ObserverConfig. All sampling knobs default to 0
// (sampling disabled) and no reporter is installed until setReporter() runs;
// marker_ starts at -1 (unset).
int ObserverConfig::netInitSampleRate_ = 0;
int ObserverConfig::netFollowupSampleRate_ = 0;
int ObserverConfig::netFollowupSampleCount_ = 0;
int ObserverConfig::operatorNetSampleRatio_ = 0;
int ObserverConfig::skipIters_ = 0;
unique_ptr<NetObserverReporter> ObserverConfig::reporter_ = nullptr;
int ObserverConfig::marker_ = -1;
}

View File

@ -1,99 +0,0 @@
#pragma once
#include "observers/macros.h"
#include "observers/net_observer_reporter.h"
#include "caffe2/core/common.h"
namespace caffe2 {
/*
netInitSampleRate_ == 1 && operatorNetSampleRatio_ == 1 :
Log operator metrics in every iteration
netInitSampleRate_ == 1 && operatorNetSampleRatio_ == 0 :
Log net metrics in every iterationn
netInitSampleRate_ == n && netFollowupSampleRate_ == m &&
netFollowupSampleCount == c && operatorNetSampleRatio_ == 1 :
Log operator metrics first at odds of 1 / n. Once first logged,
the following c logs are at odds of 1 / min(n, m). Then repeat
netInitSampleRate_ == n && netFollowupSampleRate_ == m &&
netFollowupSampleCount == c && operatorNetSampleRatio_ == 0 :
Log net metrics first at odds of 1 / n. Once first logged,
the following c logs are at odds of 1 / min(n, m). Then repeat
netInitSampleRate_ == n && netFollowupSampleRate_ == m &&
netFollowupSampleCount == c && operatorNetSampleRatio_ == o :
Log net metrics first at odds of 1 / n. Once first logged,
the following c logs are at odds of 1 / min(n, m), if the random number
is multiples of o, log operator metrics instead. Then repeat
skipIters_ == n: skip the first n iterations of the net.
*/
// Process-wide (static-only) configuration for performance-observer sampling.
// See the sampling-semantics comment above for how the rates combine.
// NOTE(review): the static fields are written without synchronization;
// presumably configured once at startup before nets run -- confirm.
class CAFFE2_OBSERVER_API ObserverConfig {
 public:
  // Installs the sampling odds; followup rate must not exceed the initial
  // rate, and a followup rate >= 1 is required unless sampling is disabled.
  static void initSampleRate(
      int netInitSampleRate,
      int netFollowupSampleRate,
      int netFollowupSampleCount,
      int operatorNetSampleRatio,
      int skipIters) {
    CAFFE_ENFORCE(netFollowupSampleRate <= netInitSampleRate);
    CAFFE_ENFORCE(netFollowupSampleRate >= 1 || netInitSampleRate == 0);
    netInitSampleRate_ = netInitSampleRate;
    netFollowupSampleRate_ = netFollowupSampleRate;
    netFollowupSampleCount_ = netFollowupSampleCount;
    operatorNetSampleRatio_ = operatorNetSampleRatio;
    skipIters_ = skipIters;
  }
  static int getNetInitSampleRate() {
    return netInitSampleRate_;
  }
  static int getNetFollowupSampleRate() {
    return netFollowupSampleRate_;
  }
  static int getNetFollowupSampleCount() {
    return netFollowupSampleCount_;
  }
  // NOTE(review): "Opoerator" typo is part of the public interface; callers
  // elsewhere use this spelling, so it is left as-is.
  static int getOpoeratorNetSampleRatio() {
    return operatorNetSampleRatio_;
  }
  static int getSkipIters() {
    return skipIters_;
  }
  static void setReporter(unique_ptr<NetObserverReporter> reporter) {
    reporter_ = std::move(reporter);
  }
  // Fails hard if no reporter was installed via setReporter().
  static NetObserverReporter* getReporter() {
    CAFFE_ENFORCE(reporter_);
    return reporter_.get();
  }
  static void setMarker(int marker) {
    marker_ = marker;
  }
  static int getMarker() {
    return marker_;
  }

 private:
  /* The odds of log net metric initially or immediately after reset */
  static int netInitSampleRate_;
  /* The odds of log net metric after log once after start of reset */
  static int netFollowupSampleRate_;
  /* The number of follow up logs to be collected for odds of
     netFollowupSampleRate_ */
  static int netFollowupSampleCount_;
  /* The odds to log the operator metric instead of the net metric.
     When the operator is logged the net is not logged. */
  static int operatorNetSampleRatio_;
  /* skip the first few iterations */
  static int skipIters_;
  static unique_ptr<NetObserverReporter> reporter_;
  /* marker used in identifying the metrics in certain reporters */
  static int marker_;
};
}

View File

@ -1,330 +0,0 @@
#include "observers/perf_observer.h"
#include "observers/observer_config.h"
#ifndef C10_MOBILE
#include "caffe2/core/flags.h"
#include "observers/net_observer_reporter_print.h"
#endif
#include <random>
// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <time.h>
#include "caffe2/core/common.h"
#include "caffe2/core/init.h"
#include "caffe2/core/operator.h"
#if defined(TARGET_OS_MAC) || \
defined(TARGET_OS_IPHONE) || \
defined(TARGET_IPHONE_SIMULATOR)
#define _APPLE 1
#endif
#ifdef _WIN32
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
#endif
#ifdef _APPLE
#include <mach/mach_time.h>
#include <sys/time.h>
#include <sys/resource.h>
#endif
#ifndef C10_MOBILE
C10_DEFINE_int64(
aiBench_netInitSampleRate,
0,
"One in N sampling rate for net delay");
C10_DEFINE_int64(
aiBench_netFollowupSampleRate,
0,
"One in N sampling rate for net delay");
C10_DEFINE_int64(
aiBench_netFollowupSampleCount,
0,
"control the following c logs");
C10_DEFINE_int64(
aiBench_operatorNetSampleRatio,
0,
"One in N sampling rate for operator delay");
C10_DEFINE_int64(
aiBench_skipIters,
0,
"skip the first N iterations of the net run");
#endif
namespace caffe2 {
namespace {
// Early-init hook: attaches a PerfNetObserver to every net created in the
// process, and (off-mobile) installs the print reporter configured from the
// aiBench_* command-line flags.
bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) {
  AddGlobalNetObserverCreator([](NetBase* subject) {
    return std::make_unique<PerfNetObserver>(subject);
  });

#if !defined(C10_MOBILE)
  // for aibench usage
  caffe2::ObserverConfig::setReporter(
      std::make_unique<caffe2::NetObserverReporterPrint>());

  caffe2::ObserverConfig::initSampleRate(
      FLAGS_aiBench_netInitSampleRate,
      FLAGS_aiBench_netFollowupSampleRate,
      FLAGS_aiBench_netFollowupSampleCount,
      FLAGS_aiBench_operatorNetSampleRatio,
      FLAGS_aiBench_skipIters);
#endif

  return true;
}
} // namespace
#ifdef _WIN32
// Windows only: returns the QueryPerformanceCounter frequency expressed as
// ticks per millisecond, caching it in a function-local static. Returns 0.0
// if the frequency query fails, which callers treat as "timer unavailable".
double getTicksPerMillisecond() {
  static LARGE_INTEGER ticks_per_sec;
  // Lazily initialize on first call (QuadPart stays 0 until queried).
  if (!ticks_per_sec.QuadPart) {
    QueryPerformanceFrequency(&ticks_per_sec);
    if (!ticks_per_sec.QuadPart) {
      return 0.0;
    }
  }
  return static_cast<double>(ticks_per_sec.QuadPart) / 1000.0;
}
#elif !defined _APPLE
// Reads the given POSIX clock and converts the reading to milliseconds.
// Returns 0.0 when clock_gettime() fails, matching the Windows/macOS
// fallbacks in this file.
double getClockTimeMilliseconds(clockid_t clk_id) {
  struct timespec ts;
  if (clock_gettime(clk_id, &ts) == -1) {
    return 0.0;
  }
  return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
}
#endif
// Wall-clock time in milliseconds from a monotonic source, chosen per
// platform: QueryPerformanceCounter (Windows), mach_absolute_time (Apple),
// CLOCK_MONOTONIC (other POSIX). Returns 0.0 when the timer is unavailable.
double getWallClockTimeMilliseconds() {
#ifdef _WIN32
  double ticks_per_ms = getTicksPerMillisecond();
  if (ticks_per_ms) {
    LARGE_INTEGER ticks;
    if (QueryPerformanceCounter(&ticks)) {
      return static_cast<double>(ticks.QuadPart) / ticks_per_ms;
    }
  }
  return 0.0;
#elif defined _APPLE
  // Timebase ratio is fixed per boot; cache it on first use.
  static mach_timebase_info_data_t info;
  if (info.denom == 0) {
    mach_timebase_info(&info);
  }

  uint64_t now = mach_absolute_time();
  now = now * info.numer / info.denom; // convert to nanoseconds
  return now / 1000000.0;
#else
  return getClockTimeMilliseconds(CLOCK_MONOTONIC);
#endif
}
// Total process CPU time (user + kernel) in milliseconds, per platform:
// GetProcessTimes (Windows), getrusage (Apple), CLOCK_PROCESS_CPUTIME_ID
// (other POSIX). Returns 0.0 when the measurement is unavailable.
double getCpuTimeMilliseconds() {
#ifdef _WIN32
  FILETIME creation_time;
  FILETIME exit_time;
  FILETIME kernel_time;
  FILETIME user_time;
  if (GetProcessTimes(
          GetCurrentProcess(),
          &creation_time,
          &exit_time,
          &kernel_time,
          &user_time)) {
    ULARGE_INTEGER kernel;
    ULARGE_INTEGER user;
    kernel.HighPart = kernel_time.dwHighDateTime;
    kernel.LowPart = kernel_time.dwLowDateTime;
    user.HighPart = user_time.dwHighDateTime;
    user.LowPart = user_time.dwLowDateTime;
    // FILETIME counts 100-ns intervals; 10000 of them per millisecond.
    return (static_cast<double>(kernel.QuadPart) +
            static_cast<double>(user.QuadPart)) / 10000.0;
  }
  return 0.0;
#elif defined _APPLE
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
  struct rusage ru;
  if (getrusage(RUSAGE_SELF, &ru)) {
    return 0.0;
  }
  return ru.ru_utime.tv_sec * 1000.0
      + ru.ru_utime.tv_usec / 1000.0
      + ru.ru_stime.tv_sec * 1000.0
      + ru.ru_stime.tv_usec / 1000.0;
#else
  return getClockTimeMilliseconds(CLOCK_PROCESS_CPUTIME_ID);
#endif
}
REGISTER_CAFFE2_EARLY_INIT_FUNCTION(
registerGlobalPerfNetObserverCreator,
&registerGlobalPerfNetObserverCreator,
"Caffe2 net global observer creator");
// Observer for one net; other members are initialized at their in-class
// declarations (hence the suppressed member-init lint).
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
PerfNetObserver::PerfNetObserver(NetBase* subject_)
    : NetObserver(subject_), numRuns_(0) {}
// Out-of-line defaulted destructor (kept out-of-line so the header need not
// see the full types of members). Replaces an empty `{}` body that was only
// suppressing clang-tidy's modernize-use-equals-default.
PerfNetObserver::~PerfNetObserver() = default;
// Called before each net run: decides (by sampled odds from ObserverConfig)
// whether this run logs nothing, net-level delay, or per-operator delay, and
// attaches per-operator observers in the latter case. Timing starts at the
// end of this function.
void PerfNetObserver::Start() {
  // NOTE(review): function-local static shared by all PerfNetObserver
  // instances, mutated without synchronization -- confirm single-threaded
  // net startup is guaranteed.
  static int visitCount = 0;
  // Select whether to log the operator or the net.
  // We have one sample rate for the entire app.
  int netInitSampleRate = ObserverConfig::getNetInitSampleRate();
  int netFollowupSampleRate = ObserverConfig::getNetFollowupSampleRate();
  int netFollowupSampleCount = ObserverConfig::getNetFollowupSampleCount();
  int operatorNetSampleRatio = ObserverConfig::getOpoeratorNetSampleRatio();
  int skipIters = ObserverConfig::getSkipIters();
  int sampleRate = visitCount > 0 ? netFollowupSampleRate : netInitSampleRate;
  // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
  if (skipIters <= static_cast<int>(numRuns_) && sampleRate > 0 && rand() % sampleRate == 0) {
    visitCount++;
    if (visitCount == netFollowupSampleCount) {
      visitCount = 0;
    }
    // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
    if (operatorNetSampleRatio > 0 && rand() % operatorNetSampleRatio == 0) {
      logType_ = PerfNetObserver::OPERATOR_DELAY;
    } else {
      logType_ = PerfNetObserver::NET_DELAY;
    }
  } else {
    logType_ = PerfNetObserver::NONE;
  }
  numRuns_++;

  if (logType_ == PerfNetObserver::OPERATOR_DELAY) {
    /* Always recreate new operator observers
       whenever we measure operator delay */
    const auto& operators = subject_->GetOperators();
    for (auto* op : operators) {
      observerMap_[op] = op->AttachObserver(
          std::make_unique<PerfOperatorObserver>(op, this));
    }
  }

  // Baseline timestamps; Stop() subtracts these to get per-run deltas.
  wallMilliseconds_ = getWallClockTimeMilliseconds();
  cpuMilliseconds_ = getCpuTimeMilliseconds();
}
// Called after each net run. No-op unless Start() chose to sample this run.
// Computes net-level wall/CPU deltas, optionally collects per-operator stats
// from the PerfOperatorObservers attached in Start(), then hands the whole
// report to the reporter configured via ObserverConfig.
void PerfNetObserver::Stop() {
  if (logType_ == PerfNetObserver::NONE) {
    return;
  }
  // Report keyed by observer name: "NET_DELAY" plus one entry per operator.
  std::map<std::string, PerformanceInformation> info;
  PerformanceInformation net_perf;
  net_perf.cpuMilliseconds =
      getCpuTimeMilliseconds() - cpuMilliseconds_;
  net_perf.latency =
      getWallClockTimeMilliseconds() - wallMilliseconds_;
  if (logType_ == PerfNetObserver::OPERATOR_DELAY) {
    const auto& operators = subject_->GetOperators();
    for (unsigned idx = 0; idx < operators.size(); ++idx) {
      const auto* op = operators[idx];
      auto name = getObserverName(op, static_cast<int>(idx));
      PerformanceInformation p;
      // The observer attached in Start() holds this operator's deltas.
      const PerfOperatorObserver* opObserver =
          static_cast<const PerfOperatorObserver*>(observerMap_[op]);
      p.latency = opObserver->getWallMilliseconds();
      p.cpuMilliseconds = opObserver->getCpuMilliseconds();
      p.engine = op->engine();
      p.type = op->type();
      p.tensor_shapes =
          static_cast<const PerfOperatorObserver*>(observerMap_[op])
              ->getTensorShapes();
      if (op->has_debug_def()) {
        // Copy the operator's arguments from its debug definition.
        // NOLINTNEXTLINE(performance-for-range-copy)
        for (auto arg : op->debug_def().arg()) {
          p.args.emplace_back(arg);
        }
      }
      info.insert({name, p});
    }
    /* clear all operator delay after use so that we don't spent time
       collecting the operator delay info in later runs */
    for (auto* op : operators) {
      op->DetachObserver(observerMap_[op]);
    }
    observerMap_.clear();
  }
  info.insert({"NET_DELAY", net_perf});
  ObserverConfig::getReporter()->report(subject_, info);
}
// Builds a stable per-operator report key of the form
// "ID_<idx>_<type>_<name>", falling back to placeholder segments
// ("NO_TYPE", "NO_DEF", "NO_OUTPUT") when the operator lacks a debug
// definition, a name, or any output.
caffe2::string PerfNetObserver::getObserverName(const OperatorBase* op, int idx)
    const {
  string opType = "NO_TYPE";
  string displayName = "NO_DEF";
  if (op->has_debug_def()) {
    opType = op->debug_def().type();
    if (op->debug_def().name().size()) {
      displayName = op->debug_def().name();
    } else if (op->debug_def().output_size()) {
      displayName = op->debug_def().output(0);
    } else {
      displayName = "NO_OUTPUT";
    }
  }
  return "ID_" + c10::to_string(idx) + "_" + opType + "_" + displayName;
}
// Attaches a per-operator timing observer to `op`. A non-null owning
// PerfNetObserver is mandatory (enforced below); per the header's lifetime
// note, the net — and hence `netObserver` — must outlive this observer.
PerfOperatorObserver::PerfOperatorObserver(
    OperatorBase* op,
    PerfNetObserver* netObserver)
    : ObserverBase<OperatorBase>(op),
      netObserver_(netObserver),
      wallMilliseconds_(0),
      cpuMilliseconds_(0) {
  CAFFE_ENFORCE(netObserver_, "Observers can't operate outside of the net");
}
// Out-of-line defaulted destructor: nothing to clean up manually.
// (Previously an empty body with a NOLINT suppressing
// modernize-use-equals-default; defaulting it fixes the lint instead.)
PerfOperatorObserver::~PerfOperatorObserver() = default;
// Called just before the operator executes: stores wall/CPU timestamps as
// baselines. Stop() later converts these fields in place into elapsed deltas.
void PerfOperatorObserver::Start() {
  wallMilliseconds_ = getWallClockTimeMilliseconds();
  cpuMilliseconds_ = getCpuTimeMilliseconds();
}
// Called just after the operator finishes: converts the baselines captured in
// Start() into elapsed wall/CPU durations for this single operator run, and
// snapshots the operator's input tensor shapes for the report.
void PerfOperatorObserver::Stop() {
  /* Time from the start of the net minus the time spent on all other
     operators is the time spent on this operator */
  // Note the fields are reused: timestamps on entry, elapsed deltas on exit.
  cpuMilliseconds_ =
      getCpuTimeMilliseconds() - cpuMilliseconds_;
  wallMilliseconds_ =
      getWallClockTimeMilliseconds() - wallMilliseconds_;
  tensor_shapes_ = subject_->InputTensorShapes();
}
// Elapsed wall-clock time of the observed operator run; meaningful only after
// Stop() has converted the Start() baseline into a delta.
double PerfOperatorObserver::getWallMilliseconds() const { return wallMilliseconds_; }

// Elapsed CPU time of the observed operator run; same caveat as above.
double PerfOperatorObserver::getCpuMilliseconds() const { return cpuMilliseconds_; }

// Input tensor shapes snapshotted in Stop(); returned by value.
std::vector<TensorShape> PerfOperatorObserver::getTensorShapes() const { return tensor_shapes_; }
} // namespace caffe2

View File

@ -1,66 +0,0 @@
#pragma once
#include "caffe2/core/common.h"
#include "caffe2/core/net.h"
#include "caffe2/core/observer.h"
#include "caffe2/core/timer.h"
#include "observers/macros.h"
#include <unordered_map>
namespace caffe2 {
double getClockTimeMilliseconds();
// Net-level performance observer. On a randomly sampled subset of net runs it
// measures wall-clock and CPU time for the whole net; on a further-sampled
// subset it also attaches a PerfOperatorObserver to every operator to collect
// per-operator timings. Results are delivered through the reporter configured
// via ObserverConfig.
class CAFFE2_OBSERVER_API PerfNetObserver : public NetObserver {
 public:
  explicit PerfNetObserver(NetBase* subject_);
  virtual ~PerfNetObserver();

 private:
  void Start() override;
  void Stop() override;
  // Builds the per-operator report key "ID_<idx>_<type>_<name>".
  caffe2::string getObserverName(const OperatorBase* op, int idx) const;

 private:
  // What Start() decided to record for the current run.
  enum LogType {
    NONE,
    OPERATOR_DELAY,
    NET_DELAY,
  };
  LogType logType_;
  // Number of runs seen so far; compared against the skip-iters threshold
  // before sampling begins.
  unsigned int numRuns_;
  // Operator -> attached observer; populated only for OPERATOR_DELAY runs
  // and cleared again in Stop().
  std::unordered_map<const OperatorBase*, const ObserverBase<OperatorBase>*>
      observerMap_;
  // Baseline timestamps captured in Start(); Stop() subtracts them to obtain
  // the net's elapsed wall/CPU time.
  double wallMilliseconds_;
  double cpuMilliseconds_;
};
// Per-operator timing observer, attached by PerfNetObserver for runs where
// operator-level delay is sampled. Captures wall/CPU elapsed time and the
// input tensor shapes of a single operator execution.
class PerfOperatorObserver : public ObserverBase<OperatorBase> {
 public:
  PerfOperatorObserver(OperatorBase* op, PerfNetObserver* netObserver);
  virtual ~PerfOperatorObserver();

  // Accessors for the measurements; meaningful only after Stop() has run.
  double getWallMilliseconds() const;
  double getCpuMilliseconds() const;
  std::vector<TensorShape> getTensorShapes() const;

 private:
  void Start() override;
  void Stop() override;

 private:
  // Observer of a net that owns corresponding op. We make sure net is never
  // destructed while operator observer is still alive. First operator observer
  // gets destructed, then the op, then the net and its observer.
  // We do this trick in order to get access to net's name and other fields
  // without storing them inside the operator observer. Each field is memory
  // costly here and a raw pointer is the cheapest solution.
  PerfNetObserver* netObserver_;
  // Set to clock baselines in Start(); converted in place to elapsed deltas
  // in Stop().
  double wallMilliseconds_;
  double cpuMilliseconds_;
  std::vector<TensorShape> tensor_shapes_;
};
} // namespace caffe2

View File

@ -88,12 +88,6 @@
# disables use of system-wide nccl (we will use our submoduled
# copy in third_party/nccl)
#
# BUILD_CAFFE2_OPS=0
# disable Caffe2 operators build
#
# BUILD_CAFFE2=0
# disable Caffe2 build
#
# USE_IBVERBS
# toggle features related to distributed support
#

View File

@ -145,26 +145,6 @@ void validateBlock(
"\n\nDefined at:\n" + getNodeStackTraceString(node))
}
} else {
#ifdef BUILD_CAFFE2
// Assuming this is a Caffe2 change as it only modifies an aten op
// for operator_export_type == ONNX_ATEN_FALLBACK, which is a common
// pattern for Caffe2-specific scenarios.
if (node->kind() == aten::expand) {
if (operator_export_type ==
onnx_torch::OperatorExportTypes::ONNX_ATEN_FALLBACK) {
WithInsertPoint guard(node);
auto* new_node =
b->owningGraph()->insertNode(b->owningGraph()->create(
Symbol(::c10::aten::ATen),
node->inputs(),
node->outputs().size()));
for (size_t i = 0; i < node->outputs().size(); ++i) {
node->output(i)->replaceAllUsesWith(new_node->output(i));
}
new_node->s_(Symbol::fromQualString("attr::operator"), "expand");
}
}
#endif
if (node->kind() == prim::PackPadded || node->kind() == prim::PadPacked) {
if (operator_export_type !=
onnx_torch::OperatorExportTypes::ONNX_FALLTHROUGH) {

View File

@ -293,10 +293,6 @@ void initONNXBindings(PyObject* module) {
onnx.attr("PRODUCER_VERSION") = py::str(TORCH_VERSION);
#ifdef BUILD_CAFFE2
onnx.attr("_CAFFE2_ATEN_FALLBACK") = true;
#else
onnx.attr("_CAFFE2_ATEN_FALLBACK") = false;
#endif
}
} // namespace torch::onnx

View File

@ -350,11 +350,6 @@ def export(
%3 : Float = onnx::Mul(%2, %0)
return (%3)
If PyTorch was built with Caffe2 (i.e. with ``BUILD_CAFFE2=1``), then
Caffe2-specific behavior will be enabled, including special support
for ops that are produced by the modules described in
`Quantization <https://pytorch.org/docs/stable/quantization.html>`_.
.. warning::
Models exported this way are probably runnable only by Caffe2.
@ -1802,9 +1797,8 @@ def _add_output_to_block(block: _C.Block, value: _C.Value) -> int:
def _should_aten_fallback(
name: str, opset_version: int, operator_export_type: _C_onnx.OperatorExportTypes
):
# For BUILD_CAFFE2=0 builds, if domain=="aten" and operator_export_type==ONNX_ATEN,
# For all builds, if domain=="aten" and operator_export_type==ONNX_ATEN,
# an aten::ATen operator is created regardless of symbolics existence
# For BUILD_CAFFE2=1, the same applies only if there is no symbolic available
is_exportable_aten_op = registration.registry.is_registered_op(name, opset_version)
is_onnx_aten_export = operator_export_type == _C_onnx.OperatorExportTypes.ONNX_ATEN