[Caffe2] Remove remaining unused perfkernels (#128477)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/128477
Approved by: https://github.com/ezyang, https://github.com/r-barnes
cyy
2024-06-12 22:19:36 +00:00
committed by PyTorch MergeBot
parent 55a6b38f52
commit 3008644297
10 changed files with 0 additions and 9898 deletions

View File

@@ -461,7 +461,6 @@ filegroup(
filegroup(
name = "caffe2_perfkernels_srcs",
srcs = [
"caffe2/perfkernels/embedding_lookup.cc",
"caffe2/perfkernels/embedding_lookup_idx.cc",
],
)
@@ -499,7 +498,6 @@ cc_library(
hdrs = [
"caffe2/core/common.h",
"caffe2/perfkernels/common.h",
"caffe2/perfkernels/embedding_lookup.h",
"caffe2/perfkernels/embedding_lookup_idx.h",
"caffe2/utils/fixed_divisor.h",
] + glob([

View File

@@ -1,33 +1,7 @@
#include <atomic>
#include "caffe2/core/common.h"
namespace caffe2 {
// A global variable to mark if Caffe2 has cuda linked to the current runtime.
// Do not directly use this variable, but instead use the HasCudaRuntime()
// function below.
std::atomic<bool> g_caffe2_has_cuda_linked{false};
std::atomic<bool> g_caffe2_has_hip_linked{false};
bool HasCudaRuntime() {
return g_caffe2_has_cuda_linked.load();
}
bool HasHipRuntime() {
return g_caffe2_has_hip_linked.load();
}
namespace internal {
void SetCudaRuntimeFlag() {
g_caffe2_has_cuda_linked.store(true);
}
void SetHipRuntimeFlag() {
g_caffe2_has_hip_linked.store(true);
}
} // namespace internal
const std::map<string, string>& GetBuildOptions() {
#ifndef CAFFE2_BUILD_STRINGS
#define CAFFE2_BUILD_STRINGS {}

View File

@@ -32,15 +32,6 @@
namespace caffe2 {
// Note(Yangqing): NVCC does not play well with unordered_map on some platforms,
// forcing us to use std::map instead of unordered_map. This may affect speed
// in some cases, but in most of the computation code we do not access map very
// often, so it should be fine for us. I am putting a CaffeMap alias so we can
// change it more easily if things work out for unordered_map down the road.
template <typename Key, typename Value>
using CaffeMap = std::map<Key, Value>;
// using CaffeMap = std::unordered_map;
// Using statements for common classes that we refer to in caffe2 very often.
// Note that we only place it inside caffe2 so the global namespace is not
// polluted.
@@ -50,38 +41,11 @@ using std::string;
using std::unique_ptr;
using std::vector;
// Just in order to mark things as not implemented. Do not use in final code.
#define CAFFE_NOT_IMPLEMENTED CAFFE_THROW("Not Implemented.")
// suppress an unused variable.
#if defined(_MSC_VER) && !defined(__clang__)
#define CAFFE2_UNUSED __pragma(warning(suppress : 4100 4101))
#define CAFFE2_USED
#else
#define CAFFE2_UNUSED __attribute__((__unused__))
#define CAFFE2_USED __attribute__((__used__))
#endif //_MSC_VER
// Define alignment macro that is cross platform
#if defined(_MSC_VER) && !defined(__clang__)
#define CAFFE2_ALIGNED(x) __declspec(align(x))
#else
#define CAFFE2_ALIGNED(x) __attribute__((aligned(x)))
#endif
#if (defined _MSC_VER && !defined NOMINMAX)
#define NOMINMAX
#endif
#if defined(__has_cpp_attribute)
#if __has_cpp_attribute(nodiscard)
#define CAFFE2_NODISCARD [[nodiscard]]
#endif
#endif
#if !defined(CAFFE2_NODISCARD)
#define CAFFE2_NODISCARD
#endif
using std::make_unique;
#if defined(__ANDROID__) && !defined(__NDK_MAJOR__)
@@ -90,58 +54,6 @@ using ::round;
using std::round;
#endif // defined(__ANDROID__) && !defined(__NDK_MAJOR__)
// dynamic cast reroute: if RTTI is disabled, go to static_cast
template <typename Dst, typename Src>
inline Dst dynamic_cast_if_rtti(Src ptr) {
#ifdef __GXX_RTTI
return dynamic_cast<Dst>(ptr);
#else
return static_cast<Dst>(ptr);
#endif
}
// SkipIndices are used in operator_fallback_gpu.h and operator_fallback_mkl.h
// as utility functions that mark input/output indices to skip when a CPU
// operator is used as the fallback for a GPU/MKL operator.
template <int... values>
class SkipIndices {
private:
template <int V>
static inline bool ContainsInternal(const int i) {
return (i == V);
}
template <int First, int Second, int... Rest>
static inline bool ContainsInternal(const int i) {
return (i == First) || ContainsInternal<Second, Rest...>(i);
}
public:
static inline bool Contains(const int i) {
return ContainsInternal<values...>(i);
}
};
template <>
class SkipIndices<> {
public:
static inline bool Contains(const int /*i*/) {
return false;
}
};
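For context, a minimal usage sketch of the SkipIndices helper above; the chosen indices and the driver loop are illustrative, not taken from operator_fallback_gpu.h, and the sketch assumes the template definition shown here.
// Illustrative only: skip inputs 0 and 2 when falling back from a GPU/MKL
// operator to its CPU implementation.
using Skip = caffe2::SkipIndices<0, 2>;
int skip_indices_example() {
  int forwarded = 0;
  for (int i = 0; i < 4; ++i) {
    if (Skip::Contains(i)) {
      continue; // leave this blob on the original device
    }
    ++forwarded; // copy/convert this blob for the CPU fallback
  }
  return forwarded; // 2: indices 1 and 3 are forwarded
}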
// HasCudaRuntime() tells the program whether the binary has Cuda runtime
// linked. This function should not be used in static initialization functions
// as the underlying boolean variable is going to be switched on when one
// loads libtorch_gpu.so.
TORCH_API bool HasCudaRuntime();
TORCH_API bool HasHipRuntime();
namespace internal {
// Sets the Cuda Runtime flag that is used by HasCudaRuntime(). You should
// never use this function - it is only used by the Caffe2 gpu code to notify
// Caffe2 core that cuda runtime has been loaded.
TORCH_API void SetCudaRuntimeFlag();
TORCH_API void SetHipRuntimeFlag();
} // namespace internal
// Returns which setting Caffe2 was configured and built with (exported from
// CMake)
TORCH_API const std::map<string, string>& GetBuildOptions();
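As a minimal sketch of how these flags were meant to be driven, assuming the declarations above; the setter object below is illustrative, not the actual wiring in the GPU code.
// Illustrative only: a namespace-scope object in the GPU library flips the
// flag as a side effect of loading libtorch_gpu.so.
namespace {
struct SetCudaRuntimeFlagOnLoad {
  SetCudaRuntimeFlagOnLoad() {
    caffe2::internal::SetCudaRuntimeFlag();
  }
} g_set_cuda_runtime_flag_on_load;
} // namespace
// CPU-side callers then branch at run time (never during static
// initialization, per the comment above):
//   if (caffe2::HasCudaRuntime()) { /* take the CUDA path */ }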

View File

@@ -22,72 +22,26 @@ set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${common_srcs})
# We will only build the perf kernel files if the compiler supports avx2
# extensions.
if(CXX_AVX2_FOUND)
add_library(Caffe2_perfkernels_avx STATIC ${avx_srcs})
add_library(Caffe2_perfkernels_avx2 STATIC ${avx2_srcs})
target_link_libraries(Caffe2_perfkernels_avx PRIVATE c10)
target_link_libraries(Caffe2_perfkernels_avx2 PRIVATE c10)
install(TARGETS Caffe2_perfkernels_avx Caffe2_perfkernels_avx2
ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}")
if(MSVC AND NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
target_compile_options(Caffe2_perfkernels_avx
PRIVATE "/arch:AVX"
PRIVATE "/D__F16C__")
target_compile_options(Caffe2_perfkernels_avx2
PRIVATE "/arch:AVX2"
PRIVATE "/D__FMA__"
PRIVATE "/D__F16C__")
else()
target_compile_options(Caffe2_perfkernels_avx
PRIVATE "-mavx"
PRIVATE "-mf16c")
target_compile_options(Caffe2_perfkernels_avx2
PRIVATE "-mavx2"
PRIVATE "-mfma"
PRIVATE "-mavx"
PRIVATE "-mf16c")
endif()
caffe2_interface_library(
Caffe2_perfkernels_avx Caffe2_perfkernels_avx_interface)
caffe2_interface_library(
Caffe2_perfkernels_avx2 Caffe2_perfkernels_avx2_interface)
list(APPEND
Caffe2_DEPENDENCY_WHOLE_LINK_LIBS
"Caffe2_perfkernels_avx_interface")
list(APPEND
Caffe2_DEPENDENCY_WHOLE_LINK_LIBS
"Caffe2_perfkernels_avx2_interface")
if(CAFFE2_COMPILER_SUPPORTS_AVX512_EXTENSIONS)
add_library(Caffe2_perfkernels_avx512 STATIC ${avx512_srcs})
target_link_libraries(Caffe2_perfkernels_avx512 PRIVATE c10)
install(TARGETS Caffe2_perfkernels_avx512
ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}")
if(MSVC AND NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
target_compile_options(Caffe2_perfkernels_avx512
PRIVATE "/D__AVX512F__"
PRIVATE "/D__AVX512DQ__"
PRIVATE "/D__AVX512VL__"
PRIVATE "/arch:AVX2"
PRIVATE "/D__FMA__"
PRIVATE "/D__F16C__")
else()
target_compile_options(Caffe2_perfkernels_avx512
PRIVATE "-mavx512f"
PRIVATE "-mavx512dq"
PRIVATE "-mavx512vl"
PRIVATE "-mavx2"
PRIVATE "-mfma"
PRIVATE "-mavx"
PRIVATE "-mf16c")
endif()
caffe2_interface_library(
Caffe2_perfkernels_avx512 Caffe2_perfkernels_avx512_interface)
list(APPEND
Caffe2_DEPENDENCY_WHOLE_LINK_LIBS
"Caffe2_perfkernels_avx512_interface")
endif()
endif()
# TODO(jiayq): currently, we only implement the very base files for the

View File

@@ -1,227 +0,0 @@
#include "caffe2/perfkernels/embedding_lookup.h"
#include "caffe2/perfkernels/common.h"
#include <c10/util/Half.h>
#include <c10/util/Logging.h>
#include <c10/util/irange.h>
namespace caffe2 {
/**
* Base implementation does runtime dispatch for each segment of reduction
* @return false if there is an out-of-bound error
*/
template <
typename IndexType,
typename InType,
typename OutType,
bool IS_WEIGHT_POSITIONAL = false>
static bool EmbeddingLookupGenericSlow(
const int64_t block_size,
const int64_t output_size,
const int64_t index_size,
const int64_t data_size,
const InType* input,
const IndexType* indices,
const int* lengths,
const float* weights, // optional, can be null for sum reducer
const float* scale_bias, // optional scale & bias params for uint8 input
bool normalize_by_lengths,
OutType* out) {
int64_t current = 0;
for (const auto m : c10::irange(output_size)) {
memset(out, 0, sizeof(OutType) * block_size);
if (current + lengths[m] > index_size) {
return false;
}
for (int i = 0; i < lengths[m]; ++i) {
int64_t idx = indices[current];
if (idx < 0 || idx >= data_size) {
return false;
}
#ifdef __GNUC__
if (current + 1 < index_size) {
__builtin_prefetch(input + block_size * indices[current + 1], 0, 1);
}
#endif // __GNUC__
float w = 1.f, b = 0.f;
if (weights) {
w = weights[IS_WEIGHT_POSITIONAL ? i : current];
}
if (scale_bias) {
b = w * scale_bias[2 * indices[current] + 1];
w = w * scale_bias[2 * indices[current]];
}
for (const auto j : c10::irange(block_size)) {
out[j] += w * input[block_size * indices[current] + j] + b;
}
++current;
}
if (normalize_by_lengths && lengths[m]) {
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
float scale = 1.f / lengths[m];
for (const auto j : c10::irange(block_size)) {
out[j] *= scale;
}
}
out += block_size;
}
return current == index_size;
}
// clang-format off
// Proxy back to generic implementation
#define EMBEDDING_SPECIALIZATION( \
IndexType, InTypeName, InType, OutType, IS_WEIGHT_POSITIONAL) \
bool \
EmbeddingLookup_##IndexType##_##InTypeName##_##OutType##_##IS_WEIGHT_POSITIONAL##__base( \
const int64_t block_size, \
const int64_t output_size, \
const int64_t index_size, \
const int64_t data_size, \
const InType* input, \
const IndexType* indices, \
const int* lengths, \
const float* weights, \
const float* scale_bias, \
bool normalize_by_lengths, \
OutType* out) { \
return EmbeddingLookupGenericSlow< \
IndexType, \
InType, \
OutType, \
IS_WEIGHT_POSITIONAL>( \
block_size, \
output_size, \
index_size, \
data_size, \
input, \
indices, \
lengths, \
weights, \
scale_bias, \
normalize_by_lengths, \
out); \
} \
decltype( \
EmbeddingLookup_##IndexType##_##InTypeName##_##OutType##_##IS_WEIGHT_POSITIONAL##__base) \
EmbeddingLookup_##IndexType##_##InTypeName##_##OutType##_##IS_WEIGHT_POSITIONAL##__avx2_fma; \
bool \
EmbeddingLookup_##IndexType##_##InTypeName##_##OutType##_##IS_WEIGHT_POSITIONAL( \
const int64_t block_size, \
const int64_t output_size, \
const int64_t index_size, \
const int64_t data_size, \
const InType* input, \
const IndexType* indices, \
const int* lengths, \
const float* weights, \
const float* scale_bias, \
bool normalize_by_lengths, \
OutType* out) { \
if (std::is_same<InType, uint8_t>::value) { \
CAFFE_ENFORCE(scale_bias != nullptr, "scale_bias must not be nullptr"); \
} else { \
CAFFE_ENFORCE(scale_bias == nullptr, "scale_bias must be nullptr"); \
} \
AVX2_FMA_DO( \
EmbeddingLookup_##IndexType##_##InTypeName##_##OutType##_##IS_WEIGHT_POSITIONAL, \
block_size, \
output_size, \
index_size, \
data_size, \
input, \
indices, \
lengths, \
weights, \
scale_bias, \
normalize_by_lengths, \
out); \
BASE_DO( \
EmbeddingLookup_##IndexType##_##InTypeName##_##OutType##_##IS_WEIGHT_POSITIONAL, \
block_size, \
output_size, \
index_size, \
data_size, \
input, \
indices, \
lengths, \
weights, \
scale_bias, \
normalize_by_lengths, \
out); \
} \
template <> \
void EmbeddingLookup<IndexType, InType, OutType, IS_WEIGHT_POSITIONAL>( \
const int64_t block_size, \
const int64_t output_size, \
const int64_t index_size, \
const int64_t data_size, \
const InType* input, \
const IndexType* indices, \
const int* lengths, \
const float* weights, \
const float* scale_bias, \
bool normalize_by_lengths, \
OutType* out) { \
bool success = \
EmbeddingLookup_##IndexType##_##InTypeName##_##OutType##_##IS_WEIGHT_POSITIONAL( \
block_size, \
output_size, \
index_size, \
data_size, \
input, \
indices, \
lengths, \
weights, \
scale_bias, \
normalize_by_lengths, \
out); \
if (success) { \
return; \
} \
int64_t current = 0; \
for (int m = 0; m < output_size; ++m) { \
for (int i = 0; i < lengths[m]; ++i) { \
CAFFE_ENFORCE_LT(current, index_size); \
IndexType idx = indices[current]; \
CAFFE_ENFORCE( \
0 <= idx && idx < data_size, \
"Index ", \
current, \
" is out of bounds: ", \
idx, \
", range 0 to ", \
data_size); \
++current; \
} \
} \
CAFFE_ENFORCE_EQ( \
current, \
index_size, \
"Your input seems to be incorrect: the sum of lengths values should be " \
"the size of the indices tensor, but it appears not."); \
}
// clang-format on
EMBEDDING_SPECIALIZATION(int32_t, float, float, float, false);
EMBEDDING_SPECIALIZATION(int64_t, float, float, float, false);
EMBEDDING_SPECIALIZATION(int32_t, half, at::Half, float, false);
EMBEDDING_SPECIALIZATION(int64_t, half, at::Half, float, false);
EMBEDDING_SPECIALIZATION(int32_t, uint8_t, uint8_t, float, false);
EMBEDDING_SPECIALIZATION(int64_t, uint8_t, uint8_t, float, false);
EMBEDDING_SPECIALIZATION(int32_t, float, float, float, true);
EMBEDDING_SPECIALIZATION(int64_t, float, float, float, true);
EMBEDDING_SPECIALIZATION(int32_t, half, at::Half, float, true);
EMBEDDING_SPECIALIZATION(int64_t, half, at::Half, float, true);
EMBEDDING_SPECIALIZATION(int32_t, uint8_t, uint8_t, float, true);
EMBEDDING_SPECIALIZATION(int64_t, uint8_t, uint8_t, float, true);
#undef EMBEDDING_SPECIALIZATION
} // namespace caffe2
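To make the token pasting above concrete, here is an abridged sketch, reconstructed from the macro rather than quoted from the file, of the dispatcher signature that EMBEDDING_SPECIALIZATION(int32_t, float, float, float, false) generates:
// Abridged expansion: the macro also emits a *__base fallback definition, an
// extern *__avx2_fma declaration, and the EmbeddingLookup<> specialization
// that calls this wrapper and re-validates indices if it reports failure.
bool EmbeddingLookup_int32_t_float_float_false(
    const int64_t block_size,
    const int64_t output_size,
    const int64_t index_size,
    const int64_t data_size,
    const float* input,
    const int32_t* indices,
    const int* lengths,
    const float* weights,
    const float* scale_bias,
    bool normalize_by_lengths,
    float* out);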

View File

@@ -1,53 +0,0 @@
#pragma once
#include <cstdint>
namespace caffe2 {
/**
* Embedding lookup with reduction.
*
* `input` of size data_size * block_size
* `indices` of size index_size
* `lengths` of size output_size
* `weights` nullptr or array of size index_size
* `out` of size output_size * block_size
* sum(lengths[i]) == index_size
*
* Behavior is roughly equivalent to pseudocode:
*
* pos = 0
* for (i = 0..output_size-1)
* for (k = 0..block_size-1)
* out[i*block_size + k] = 0
* for (j = 0..lengths[i]-1)
* for (k = 0..block_size-1)
* out[i*block_size + k] += input[indices[pos]*block_size + k] *
* (weights ? weights[IS_WEIGHT_POSITIONAL ? j : pos] : 1.0)
* pos += 1
* if (normalize_by_lengths && lengths[i] > 0)
* for (k = 0..block_size-1)
* out[i*block_size + k] /= lengths[i]
*
* TODO: make this API also take "offsets" rather than "lengths" to match the
* API for PyTorch's EmbeddingBag
*/
template <
typename IndexType,
typename InType,
typename OutType,
bool IS_WEIGHT_POSITIONAL = false>
void EmbeddingLookup(
const std::int64_t block_size,
const std::int64_t output_size,
const std::int64_t index_size,
const std::int64_t data_size,
const InType* input,
const IndexType* indices,
const int* lengths,
const float* weights, // optional, can be null for non-weighted sum
const float* scale_bias, // optional scale & bias params for uint8 input
bool normalize_by_lengths,
OutType* out);
} // namespace caffe2
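For reference, a minimal hypothetical call of the removed entry point that satisfies the size contract described above; the table contents and segment lengths are made up for illustration.
#include <cstdint>
#include <vector>

void embedding_lookup_example() {
  // 4 rows of a 2-wide table; two output segments of lengths 1 and 2.
  std::vector<float> table = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}; // data_size = 4, block_size = 2
  std::vector<std::int32_t> indices = {3, 0, 2};                       // index_size = 3
  std::vector<int> lengths = {1, 2};                                   // output_size = 2, sums to index_size
  std::vector<float> out(2 * 2);

  caffe2::EmbeddingLookup<std::int32_t, float, float>(
      /*block_size=*/2, /*output_size=*/2, /*index_size=*/3, /*data_size=*/4,
      table.data(), indices.data(), lengths.data(),
      /*weights=*/nullptr, /*scale_bias=*/nullptr,
      /*normalize_by_lengths=*/false, out.data());
  // out is now {6, 7, 4, 6}: row 3, then row 0 + row 2 summed.
}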

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,35 +0,0 @@
#pragma once
// See Note [hip-clang differences to hcc]
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || \
defined(__HIP__) || (defined(__clang__) && defined(__CUDA__))
#define CONVERSIONS_DECL __host__ __device__ inline
#else
#define CONVERSIONS_DECL inline
#endif
#ifdef _MSC_VER
#undef IN
#undef OUT
#endif
namespace caffe2 {
namespace convert {
template <typename IN, typename OUT>
CONVERSIONS_DECL OUT To(const IN in) {
return static_cast<OUT>(in);
}
template <typename OUT, typename IN>
CONVERSIONS_DECL OUT Get(IN x) {
return static_cast<OUT>(x);
}
} // namespace convert
} // namespace caffe2
#undef CONVERSIONS_DECL
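A minimal sketch of the conversion helpers above, assuming the header as it stood before removal; both functions reduce to a static_cast and differ only in template-parameter order.
void conversions_example() {
  float f = caffe2::convert::To<int, float>(3); // IN first, OUT second
  double d = caffe2::convert::Get<double>(7);   // OUT first, IN deduced
  (void)f;
  (void)d;
}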