Files
pytorch/caffe2/perfkernels/batch_box_cox.cc
Nikita Shulga a229e78544 [BE] Enforce sign-compare (#96723)
A number of OSS PRs were reverted because of new signed-unsigned comparison warnings, which are treated as errors in some internal builds.
Not sure how those selective rules are applied, but this PR removes `-Wno-sign-compare` from PyTorch codebase.

The only tricky part in this PR is making sure that non-ASCII character detection works for both signed and unsigned chars here:
6e3d51b08a/torch/csrc/jit/serialization/python_print.cpp (L926)

Exclude several files from sign-compare if flash attention is used, due to the violation in cutlass, to be fixed by https://github.com/NVIDIA/cutlass/pull/869
Do not try to fix sign compare violations in caffe2 codebase
Pull Request resolved: https://github.com/pytorch/pytorch/pull/96723
Approved by: https://github.com/albanD
2023-03-15 06:04:20 +00:00

114 lines
2.5 KiB
C++

#include "caffe2/perfkernels/common.h"
#include <algorithm>
#include <cstdint>
#include <cmath>
namespace caffe2 {
namespace {
template <typename T>
void BoxCoxNaive(
std::size_t N,
std::size_t D,
const T* data_ptr,
const T* __restrict lambda1_ptr,
const T* __restrict lambda2_ptr,
T* output_ptr) {
constexpr T k_eps = static_cast<T>(1e-6);
for (std::size_t i = 0; i < N; i++) {
for (std::size_t j = 0; j < D; j++, data_ptr++, output_ptr++) {
T lambda1_v = lambda1_ptr[j];
T lambda2_v = lambda2_ptr[j];
T tmp = std::max(*data_ptr + lambda2_v, k_eps);
if (lambda1_v == 0) {
*output_ptr = std::log(tmp);
} else {
T lambda_1 = 1 / lambda1_v;
T pow = std::pow(tmp, lambda1_v);
*output_ptr = lambda_1 * pow - lambda_1;
}
}
}
}
}
#if defined(CAFFE2_PERF_WITH_AVX2) && defined(CAFFE2_PERF_USE_MKL)
namespace details {
template <typename T>
void compute_batch_box_cox__avx2_fma(
std::size_t N,
std::size_t D,
std::size_t block_size,
const T* data_ptr,
const T* __restrict lambda1_ptr,
const T* __restrict lambda2_ptr,
T* output_ptr);
extern template
void compute_batch_box_cox__avx2_fma<float>(
std::size_t N,
std::size_t D,
std::size_t block_size,
const float* self_data,
const float* __restrict lambda1_data,
const float* __restrict lambda2_data,
float* output_data);
extern template
void compute_batch_box_cox__avx2_fma<double>(
std::size_t N,
std::size_t D,
std::size_t block_size,
const double* self_data,
const double* __restrict lambda1_data,
const double* __restrict lambda2_data,
double* output_data);
} // namespace detail
#endif
template <typename T>
void compute_batch_box_cox(
std::size_t N,
std::size_t D,
std::size_t block_size,
const T* data,
const T* lambda1_data,
const T* lambda2_data,
T* output_data) {
#ifdef CAFFE2_PERF_WITH_AVX2
AVX2_FMA_DO(
details::compute_batch_box_cox,
N,
D,
block_size,
data,
lambda1_data,
lambda2_data,
output_data);
#endif
BoxCoxNaive<T>(N, D, data, lambda1_data, lambda2_data, output_data);
}
template void compute_batch_box_cox<float>(
std::size_t N,
std::size_t D,
std::size_t block_size,
const float* data,
const float* lambda1_data,
const float* lambda2_data,
float* output_data);
template void compute_batch_box_cox<double>(
std::size_t N,
std::size_t D,
std::size_t block_size,
const double* data,
const double* lambda1_data,
const double* lambda2_data,
double* output_data);
} // namespace caffe2