Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 12:54:11 +08:00)
[Caffe2] Remove remaining unused perfkernels (#128477)
Fixes #ISSUE_NUMBER
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128477
Approved by: https://github.com/ezyang, https://github.com/r-barnes
@@ -461,7 +461,6 @@ filegroup(
filegroup(
    name = "caffe2_perfkernels_srcs",
    srcs = [
        "caffe2/perfkernels/embedding_lookup.cc",
        "caffe2/perfkernels/embedding_lookup_idx.cc",
    ],
)
@@ -499,7 +498,6 @@ cc_library(
    hdrs = [
        "caffe2/core/common.h",
        "caffe2/perfkernels/common.h",
        "caffe2/perfkernels/embedding_lookup.h",
        "caffe2/perfkernels/embedding_lookup_idx.h",
        "caffe2/utils/fixed_divisor.h",
    ] + glob([
@@ -1,33 +1,7 @@
#include <atomic>

#include "caffe2/core/common.h"

namespace caffe2 {

// A global variable to mark if Caffe2 has cuda linked to the current runtime.
// Do not directly use this variable, but instead use the HasCudaRuntime()
// function below.
std::atomic<bool> g_caffe2_has_cuda_linked{false};
std::atomic<bool> g_caffe2_has_hip_linked{false};

bool HasCudaRuntime() {
  return g_caffe2_has_cuda_linked.load();
}

bool HasHipRuntime() {
  return g_caffe2_has_hip_linked.load();
}

namespace internal {
void SetCudaRuntimeFlag() {
  g_caffe2_has_cuda_linked.store(true);
}

void SetHipRuntimeFlag() {
  g_caffe2_has_hip_linked.store(true);
}
} // namespace internal

const std::map<string, string>& GetBuildOptions() {
#ifndef CAFFE2_BUILD_STRINGS
#define CAFFE2_BUILD_STRINGS {}
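For context, the runtime-flag pattern kept in common.cc is a process-wide std::atomic<bool> that the GPU library flips when it is loaded and that any code can query afterwards. A minimal standalone sketch of the same pattern (the demo names below are illustrative, not the caffe2 symbols):

#include <atomic>
#include <iostream>

namespace demo {
// Set once when the (hypothetical) GPU module is loaded; read from anywhere.
std::atomic<bool> g_has_gpu_runtime{false};

bool HasGpuRuntime() {
  return g_has_gpu_runtime.load();
}

namespace internal {
void SetGpuRuntimeFlag() {
  g_has_gpu_runtime.store(true);
}
} // namespace internal
} // namespace demo

int main() {
  // Before the GPU module runs its initializer, the flag is false.
  std::cout << demo::HasGpuRuntime() << "\n"; // 0
  demo::internal::SetGpuRuntimeFlag();        // done by the GPU library in practice
  std::cout << demo::HasGpuRuntime() << "\n"; // 1
}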
@@ -32,15 +32,6 @@
namespace caffe2 {

// Note(Yangqing): NVCC does not play well with unordered_map on some platforms,
// forcing us to use std::map instead of unordered_map. This may affect speed
// in some cases, but in most of the computation code we do not access map very
// often, so it should be fine for us. I am putting a CaffeMap alias so we can
// change it more easily if things work out for unordered_map down the road.
template <typename Key, typename Value>
using CaffeMap = std::map<Key, Value>;
// using CaffeMap = std::unordered_map;

// Using statements for common classes that we refer to in caffe2 very often.
// Note that we only place it inside caffe2 so the global namespace is not
// polluted.
@@ -50,38 +41,11 @@ using std::string;
using std::unique_ptr;
using std::vector;

// Just in order to mark things as not implemented. Do not use in final code.
#define CAFFE_NOT_IMPLEMENTED CAFFE_THROW("Not Implemented.")

// suppress an unused variable.
#if defined(_MSC_VER) && !defined(__clang__)
#define CAFFE2_UNUSED __pragma(warning(suppress : 4100 4101))
#define CAFFE2_USED
#else
#define CAFFE2_UNUSED __attribute__((__unused__))
#define CAFFE2_USED __attribute__((__used__))
#endif //_MSC_VER

// Define alignment macro that is cross platform
#if defined(_MSC_VER) && !defined(__clang__)
#define CAFFE2_ALIGNED(x) __declspec(align(x))
#else
#define CAFFE2_ALIGNED(x) __attribute__((aligned(x)))
#endif

#if (defined _MSC_VER && !defined NOMINMAX)
#define NOMINMAX
#endif

#if defined(__has_cpp_attribute)
#if __has_cpp_attribute(nodiscard)
#define CAFFE2_NODISCARD [[nodiscard]]
#endif
#endif
#if !defined(CAFFE2_NODISCARD)
#define CAFFE2_NODISCARD
#endif

using std::make_unique;

#if defined(__ANDROID__) && !defined(__NDK_MAJOR__)
@@ -90,58 +54,6 @@ using ::round;
using std::round;
#endif // defined(__ANDROID__) && !defined(__NDK_MAJOR__)

// dynamic cast reroute: if RTTI is disabled, go to reinterpret_cast
template <typename Dst, typename Src>
inline Dst dynamic_cast_if_rtti(Src ptr) {
#ifdef __GXX_RTTI
  return dynamic_cast<Dst>(ptr);
#else
  return static_cast<Dst>(ptr);
#endif
}

// SkipIndices are used in operator_fallback_gpu.h and operator_fallback_mkl.h
// as utility functions that marks input / output indices to skip when we use a
// CPU operator as the fallback of GPU/MKL operator option.
template <int... values>
class SkipIndices {
 private:
  template <int V>
  static inline bool ContainsInternal(const int i) {
    return (i == V);
  }
  template <int First, int Second, int... Rest>
  static inline bool ContainsInternal(const int i) {
    return (i == First) || ContainsInternal<Second, Rest...>(i);
  }

 public:
  static inline bool Contains(const int i) {
    return ContainsInternal<values...>(i);
  }
};

template <>
class SkipIndices<> {
 public:
  static inline bool Contains(const int /*i*/) {
    return false;
  }
};

// HasCudaRuntime() tells the program whether the binary has Cuda runtime
// linked. This function should not be used in static initialization functions
// as the underlying boolean variable is going to be switched on when one
// loads libtorch_gpu.so.
TORCH_API bool HasCudaRuntime();
TORCH_API bool HasHipRuntime();
namespace internal {
// Sets the Cuda Runtime flag that is used by HasCudaRuntime(). You should
// never use this function - it is only used by the Caffe2 gpu code to notify
// Caffe2 core that cuda runtime has been loaded.
TORCH_API void SetCudaRuntimeFlag();
TORCH_API void SetHipRuntimeFlag();
} // namespace internal
// Returns which setting Caffe2 was configured and built with (exported from
// CMake)
TORCH_API const std::map<string, string>& GetBuildOptions();
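The SkipIndices helper kept above answers membership queries through a recursive variadic template. A small self-contained usage sketch (the class is copied verbatim so it compiles outside caffe2):

#include <iostream>

// Copy of the SkipIndices pattern from common.h, standalone for illustration.
template <int... values>
class SkipIndices {
 private:
  template <int V>
  static inline bool ContainsInternal(const int i) {
    return (i == V);
  }
  template <int First, int Second, int... Rest>
  static inline bool ContainsInternal(const int i) {
    return (i == First) || ContainsInternal<Second, Rest...>(i);
  }

 public:
  static inline bool Contains(const int i) {
    return ContainsInternal<values...>(i);
  }
};

template <>
class SkipIndices<> {
 public:
  static inline bool Contains(const int /*i*/) {
    return false;
  }
};

int main() {
  // A CPU-fallback operator could skip inputs 0 and 2, for example.
  std::cout << SkipIndices<0, 2>::Contains(0) << "\n"; // 1
  std::cout << SkipIndices<0, 2>::Contains(1) << "\n"; // 0
  std::cout << SkipIndices<>::Contains(5) << "\n";     // 0 (empty list never matches)
}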
@@ -22,72 +22,26 @@ set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${common_srcs})
# We will only build the perf kernel files if the compiler supports avx2
# extensions.
if(CXX_AVX2_FOUND)
  add_library(Caffe2_perfkernels_avx STATIC ${avx_srcs})
  add_library(Caffe2_perfkernels_avx2 STATIC ${avx2_srcs})
  target_link_libraries(Caffe2_perfkernels_avx PRIVATE c10)
  target_link_libraries(Caffe2_perfkernels_avx2 PRIVATE c10)
  install(TARGETS Caffe2_perfkernels_avx Caffe2_perfkernels_avx2
          ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}")

  if(MSVC AND NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
    target_compile_options(Caffe2_perfkernels_avx
                           PRIVATE "/arch:AVX"
                           PRIVATE "/D__F16C__")
    target_compile_options(Caffe2_perfkernels_avx2
                           PRIVATE "/arch:AVX2"
                           PRIVATE "/D__FMA__"
                           PRIVATE "/D__F16C__")
  else()
    target_compile_options(Caffe2_perfkernels_avx
                           PRIVATE "-mavx"
                           PRIVATE "-mf16c")
    target_compile_options(Caffe2_perfkernels_avx2
                           PRIVATE "-mavx2"
                           PRIVATE "-mfma"
                           PRIVATE "-mavx"
                           PRIVATE "-mf16c")
  endif()
  caffe2_interface_library(
    Caffe2_perfkernels_avx Caffe2_perfkernels_avx_interface)
  caffe2_interface_library(
    Caffe2_perfkernels_avx2 Caffe2_perfkernels_avx2_interface)
  list(APPEND
       Caffe2_DEPENDENCY_WHOLE_LINK_LIBS
       "Caffe2_perfkernels_avx_interface")
  list(APPEND
       Caffe2_DEPENDENCY_WHOLE_LINK_LIBS
       "Caffe2_perfkernels_avx2_interface")

  if(CAFFE2_COMPILER_SUPPORTS_AVX512_EXTENSIONS)
    add_library(Caffe2_perfkernels_avx512 STATIC ${avx512_srcs})
    target_link_libraries(Caffe2_perfkernels_avx512 PRIVATE c10)
    install(TARGETS Caffe2_perfkernels_avx512
            ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}")

    if(MSVC AND NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
      target_compile_options(Caffe2_perfkernels_avx512
                             PRIVATE "/D__AVX512F__"
                             PRIVATE "/D__AVX512DQ__"
                             PRIVATE "/D__AVX512VL__"
                             PRIVATE "/arch:AVX2"
                             PRIVATE "/D__FMA__"
                             PRIVATE "/D__F16C__")
    else()
      target_compile_options(Caffe2_perfkernels_avx512
                             PRIVATE "-mavx512f"
                             PRIVATE "-mavx512dq"
                             PRIVATE "-mavx512vl"
                             PRIVATE "-mavx2"
                             PRIVATE "-mfma"
                             PRIVATE "-mavx"
                             PRIVATE "-mf16c")
    endif()
    caffe2_interface_library(
      Caffe2_perfkernels_avx512 Caffe2_perfkernels_avx512_interface)
    list(APPEND
         Caffe2_DEPENDENCY_WHOLE_LINK_LIBS
         "Caffe2_perfkernels_avx512_interface")
  endif()
endif()

# TODO(jiayq): currently, we only implement the very base files for the
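The CMake logic above compiles the same kernel sources several times into per-ISA static libraries (base, AVX, AVX2, AVX512) so that the matching variant can be selected at runtime on the machine that actually runs the code. The sketch below shows the general idea with a plain function and GCC/Clang's __builtin_cpu_supports; it is a simplification for illustration, not the actual caffe2 dispatch macros:

#include <cstdio>

// Two variants of the same kernel; in caffe2 these live in separate static
// libraries built with different -m flags, here they are ordinary functions.
float sum_base(const float* x, int n) {
  float s = 0.f;
  for (int i = 0; i < n; ++i) s += x[i];
  return s;
}
float sum_avx2(const float* x, int n) {
  // Stand-in for an AVX2+FMA build of the same loop.
  return sum_base(x, n);
}

float sum_dispatch(const float* x, int n) {
#if defined(__GNUC__)
  // Pick the wider variant only when the running CPU supports it.
  if (__builtin_cpu_supports("avx2")) {
    return sum_avx2(x, n);
  }
#endif
  return sum_base(x, n);
}

int main() {
  const float x[4] = {1.f, 2.f, 3.f, 4.f};
  std::printf("%f\n", sum_dispatch(x, 4)); // 10.0 either way
}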
@@ -1,227 +0,0 @@
#include "caffe2/perfkernels/embedding_lookup.h"

#include "caffe2/perfkernels/common.h"

#include <c10/util/Half.h>
#include <c10/util/Logging.h>
#include <c10/util/irange.h>

namespace caffe2 {

/**
 * Base implementation does runtime dispatch for each segment of reduction
 * @return false if there is an out-of-bound error
 */
template <
    typename IndexType,
    typename InType,
    typename OutType,
    bool IS_WEIGHT_POSITIONAL = false>
static bool EmbeddingLookupGenericSlow(
    const int64_t block_size,
    const int64_t output_size,
    const int64_t index_size,
    const int64_t data_size,
    const InType* input,
    const IndexType* indices,
    const int* lengths,
    const float* weights, // optional, can be null for sum reducer
    const float* scale_bias, // optional scale & bias params for uint8 input
    bool normalize_by_lengths,
    OutType* out) {
  int64_t current = 0;
  for (const auto m : c10::irange(output_size)) {
    memset(out, 0, sizeof(OutType) * block_size);
    if (current + lengths[m] > index_size) {
      return false;
    }
    for (int i = 0; i < lengths[m]; ++i) {
      int64_t idx = indices[current];
      if (idx < 0 || idx >= data_size) {
        return false;
      }
#ifdef __GNUC__
      if (current + 1 < index_size) {
        __builtin_prefetch(input + block_size * indices[current + 1], 0, 1);
      }
#endif // __GNUC__

      float w = 1.f, b = 0.f;
      if (weights) {
        w = weights[IS_WEIGHT_POSITIONAL ? i : current];
      }
      if (scale_bias) {
        b = w * scale_bias[2 * indices[current] + 1];
        w = w * scale_bias[2 * indices[current]];
      }

      for (const auto j : c10::irange(block_size)) {
        out[j] += w * input[block_size * indices[current] + j] + b;
      }

      ++current;
    }
    if (normalize_by_lengths && lengths[m]) {
      // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
      float scale = 1.f / lengths[m];
      for (const auto j : c10::irange(block_size)) {
        out[j] *= scale;
      }
    }
    out += block_size;
  }
  return current == index_size;
}

// clang-format off
// Proxy back to generic implementation
#define EMBEDDING_SPECIALIZATION( \
    IndexType, InTypeName, InType, OutType, IS_WEIGHT_POSITIONAL) \
  bool \
      EmbeddingLookup_##IndexType##_##InTypeName##_##OutType##_##IS_WEIGHT_POSITIONAL##__base( \
          const int64_t block_size, \
          const int64_t output_size, \
          const int64_t index_size, \
          const int64_t data_size, \
          const InType* input, \
          const IndexType* indices, \
          const int* lengths, \
          const float* weights, \
          const float* scale_bias, \
          bool normalize_by_lengths, \
          OutType* out) { \
    return EmbeddingLookupGenericSlow< \
        IndexType, \
        InType, \
        OutType, \
        IS_WEIGHT_POSITIONAL>( \
        block_size, \
        output_size, \
        index_size, \
        data_size, \
        input, \
        indices, \
        lengths, \
        weights, \
        scale_bias, \
        normalize_by_lengths, \
        out); \
  } \
  decltype( \
      EmbeddingLookup_##IndexType##_##InTypeName##_##OutType##_##IS_WEIGHT_POSITIONAL##__base) \
      EmbeddingLookup_##IndexType##_##InTypeName##_##OutType##_##IS_WEIGHT_POSITIONAL##__avx2_fma; \
  bool \
      EmbeddingLookup_##IndexType##_##InTypeName##_##OutType##_##IS_WEIGHT_POSITIONAL( \
          const int64_t block_size, \
          const int64_t output_size, \
          const int64_t index_size, \
          const int64_t data_size, \
          const InType* input, \
          const IndexType* indices, \
          const int* lengths, \
          const float* weights, \
          const float* scale_bias, \
          bool normalize_by_lengths, \
          OutType* out) { \
    if (std::is_same<InType, uint8_t>::value) { \
      CAFFE_ENFORCE(scale_bias != nullptr, "scale_bias must not be nullptr"); \
    } else { \
      CAFFE_ENFORCE(scale_bias == nullptr, "scale_bias must be nullptr"); \
    } \
    AVX2_FMA_DO( \
        EmbeddingLookup_##IndexType##_##InTypeName##_##OutType##_##IS_WEIGHT_POSITIONAL, \
        block_size, \
        output_size, \
        index_size, \
        data_size, \
        input, \
        indices, \
        lengths, \
        weights, \
        scale_bias, \
        normalize_by_lengths, \
        out); \
    BASE_DO( \
        EmbeddingLookup_##IndexType##_##InTypeName##_##OutType##_##IS_WEIGHT_POSITIONAL, \
        block_size, \
        output_size, \
        index_size, \
        data_size, \
        input, \
        indices, \
        lengths, \
        weights, \
        scale_bias, \
        normalize_by_lengths, \
        out); \
  } \
  template <> \
  void EmbeddingLookup<IndexType, InType, OutType, IS_WEIGHT_POSITIONAL>( \
      const int64_t block_size, \
      const int64_t output_size, \
      const int64_t index_size, \
      const int64_t data_size, \
      const InType* input, \
      const IndexType* indices, \
      const int* lengths, \
      const float* weights, \
      const float* scale_bias, \
      bool normalize_by_lengths, \
      OutType* out) { \
    bool success = \
        EmbeddingLookup_##IndexType##_##InTypeName##_##OutType##_##IS_WEIGHT_POSITIONAL( \
            block_size, \
            output_size, \
            index_size, \
            data_size, \
            input, \
            indices, \
            lengths, \
            weights, \
            scale_bias, \
            normalize_by_lengths, \
            out); \
    if (success) { \
      return; \
    } \
    int64_t current = 0; \
    for (int m = 0; m < output_size; ++m) { \
      for (int i = 0; i < lengths[m]; ++i) { \
        CAFFE_ENFORCE_LT(current, index_size); \
        IndexType idx = indices[current]; \
        CAFFE_ENFORCE( \
            0 <= idx && idx < data_size, \
            "Index ", \
            current, \
            " is out of bounds: ", \
            idx, \
            ", range 0 to ", \
            data_size); \
        ++current; \
      } \
    } \
    CAFFE_ENFORCE_EQ( \
        current, \
        index_size, \
        "Your input seems to be incorrect: the sum of lengths values should be " \
        "the size of the indices tensor, but it appears not."); \
  }
// clang-format on

EMBEDDING_SPECIALIZATION(int32_t, float, float, float, false);
EMBEDDING_SPECIALIZATION(int64_t, float, float, float, false);
EMBEDDING_SPECIALIZATION(int32_t, half, at::Half, float, false);
EMBEDDING_SPECIALIZATION(int64_t, half, at::Half, float, false);
EMBEDDING_SPECIALIZATION(int32_t, uint8_t, uint8_t, float, false);
EMBEDDING_SPECIALIZATION(int64_t, uint8_t, uint8_t, float, false);

EMBEDDING_SPECIALIZATION(int32_t, float, float, float, true);
EMBEDDING_SPECIALIZATION(int64_t, float, float, float, true);
EMBEDDING_SPECIALIZATION(int32_t, half, at::Half, float, true);
EMBEDDING_SPECIALIZATION(int64_t, half, at::Half, float, true);
EMBEDDING_SPECIALIZATION(int32_t, uint8_t, uint8_t, float, true);
EMBEDDING_SPECIALIZATION(int64_t, uint8_t, uint8_t, float, true);

#undef EMBEDDING_SPECIALIZATION

} // namespace caffe2
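For reference, the removed dispatcher was invoked roughly as shown below; the numbers form a toy example (two segments over a table of four 2-float rows), and the snippet assumes a source tree in which caffe2/perfkernels/embedding_lookup.h still exists:

#include "caffe2/perfkernels/embedding_lookup.h"

#include <cstdint>
#include <cstdio>

int main() {
  // 4 rows of block_size 2.
  const float data[4 * 2] = {0.f, 1.f, 10.f, 11.f, 20.f, 21.f, 30.f, 31.f};
  const int32_t indices[3] = {0, 2, 3};  // sum(lengths) == index_size == 3
  const int lengths[2] = {2, 1};         // segment 0: rows {0, 2}; segment 1: row {3}
  float out[2 * 2] = {};

  caffe2::EmbeddingLookup<int32_t, float, float>(
      /*block_size=*/2,
      /*output_size=*/2,
      /*index_size=*/3,
      /*data_size=*/4,
      data,
      indices,
      lengths,
      /*weights=*/nullptr,
      /*scale_bias=*/nullptr,  // must be nullptr for non-uint8 input
      /*normalize_by_lengths=*/false,
      out);

  std::printf("%g %g | %g %g\n", out[0], out[1], out[2], out[3]);
  // Expected: 20 22 | 30 31
}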
@@ -1,53 +0,0 @@
#pragma once

#include <cstdint>

namespace caffe2 {

/**
 * Embedding lookup with reduction.
 *
 * `input` of size data_size * block_size
 * `indices` of size index_size
 * `lengths` of size output_size
 * `weights` nullptr or array of size index_size
 * `out` of size output_size * block_size
 * sum(lengths[i]) == index_size
 *
 * Behavior is roughly equivalent to pseudocode:
 *
 * pos = 0
 * for (i = 0..output_size-1)
 *   for (k = 0..block_size-1)
 *     out[i*block_size + k] = 0
 *   for (j = 0..lengths[i]-1)
 *     for (k = 0..block_size-1)
 *       out[i*block_size + k] += input[indices[pos]*block_size + k] *
 *         (weights ? weights[IS_WEIGHT_POSITIONAL ? j : pos] : 1.0)
 *     pos += 1
 *   if (normalize_weights && lengths[i] > 0)
 *     for (k = 0..block_size-1)
 *       out[i*block_size + k] /= lengths[i]
 *
 * TODO: make this API also take "offsets" rather than "lengths" to match the
 *       API for PyTorch's EmbeddingBag
 */
template <
    typename IndexType,
    typename InType,
    typename OutType,
    bool IS_WEIGHT_POSITIONAL = false>
void EmbeddingLookup(
    const std::int64_t block_size,
    const std::int64_t output_size,
    const std::int64_t index_size,
    const std::int64_t data_size,
    const InType* input,
    const IndexType* indices,
    const int* lengths,
    const float* weights, // optional, can be null for non-weighted sum
    const float* scale_bias, // optional scale & bias params for uint8 input
    bool normalize_by_lengths,
    OutType* out);

} // namespace caffe2
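The pseudocode in the doc comment above translates almost mechanically into portable C++. A dependency-free reference sketch (sum reducer only, no uint8/scale_bias handling; names are illustrative, not the caffe2 API):

#include <cstdint>

// Plain-C++ rendering of the documented pseudocode: a segmented, optionally
// weighted sum over rows of `input`, one segment per entry of `lengths`.
void EmbeddingLookupRef(
    std::int64_t block_size,
    std::int64_t output_size,
    std::int64_t data_size,
    const float* input,
    const std::int64_t* indices,
    const int* lengths,
    const float* weights,        // may be nullptr
    bool normalize_by_lengths,
    float* out) {
  std::int64_t pos = 0;
  for (std::int64_t i = 0; i < output_size; ++i) {
    float* o = out + i * block_size;
    for (std::int64_t k = 0; k < block_size; ++k) {
      o[k] = 0.f;
    }
    for (int j = 0; j < lengths[i]; ++j, ++pos) {
      const std::int64_t row = indices[pos];
      if (row < 0 || row >= data_size) {
        continue;  // a real implementation reports the error instead
      }
      const float w = weights ? weights[pos] : 1.f;
      for (std::int64_t k = 0; k < block_size; ++k) {
        o[k] += w * input[row * block_size + k];
      }
    }
    if (normalize_by_lengths && lengths[i] > 0) {
      for (std::int64_t k = 0; k < block_size; ++k) {
        o[k] /= lengths[i];
      }
    }
  }
}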
(Diffs for three additional files were suppressed because they are too large.)
@@ -1,35 +0,0 @@
#pragma once

// See Note [hip-clang differences to hcc]

#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || \
    defined(__HIP__) || (defined(__clang__) && defined(__CUDA__))
#define CONVERSIONS_DECL __host__ __device__ inline
#else
#define CONVERSIONS_DECL inline
#endif

#ifdef _MSC_VER
#undef IN
#undef OUT
#endif

namespace caffe2 {

namespace convert {

template <typename IN, typename OUT>
CONVERSIONS_DECL OUT To(const IN in) {
  return static_cast<OUT>(in);
}

template <typename OUT, typename IN>
CONVERSIONS_DECL OUT Get(IN x) {
  return static_cast<OUT>(x);
}

} // namespace convert

} // namespace caffe2

#undef CONVERSIONS_DECL
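The two helpers are thin wrappers around static_cast that also compile as __host__ __device__ under CUDA/HIP. A host-only usage sketch (a standalone copy of the pattern, not linked against caffe2):

#include <cstdio>

#define CONVERSIONS_DECL inline  // host-only build; device builds would add __host__ __device__

namespace convert {
template <typename IN, typename OUT>
CONVERSIONS_DECL OUT To(const IN in) {
  return static_cast<OUT>(in);
}
template <typename OUT, typename IN>
CONVERSIONS_DECL OUT Get(IN x) {
  return static_cast<OUT>(x);
}
} // namespace convert

int main() {
  // To<IN, OUT> spells out both types; Get<OUT> deduces the input type.
  int a = convert::To<double, int>(3.9);  // truncates to 3
  float b = convert::Get<float>(7);       // 7.0f
  std::printf("%d %f\n", a, b);
}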