Files
pytorch/caffe2/operators/minmax_ops.cu
Pruthvi Madugundu 085e2f7bdd [ROCm] Changes not to rely on CUDA_VERSION or HIP_VERSION (#65610)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/65610

- Replace HIP_PLATFORM_HCC with USE_ROCM
- Don't rely on CUDA_VERSION or HIP_VERSION; use USE_ROCM and ROCM_VERSION instead (sketched below).

- In the next PR:
   - Remove the mapping from CUDA_VERSION to HIP_VERSION, and from CUDA to HIP, in hipify.
   - Since HIP_PLATFORM_HCC is deprecated, add HIP_PLATFORM_AMD to support HIP host-code compilation with gcc.
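
A minimal sketch of the guard pattern this change describes (illustrative only; the exact call sites differ across files, and it assumes PyTorch's convention that ROCM_VERSION encodes major*10000 + minor*100 + patch):

// Before: keyed off the HIP platform macro set by the compiler.
#ifdef __HIP_PLATFORM_HCC__
  // ROCm-specific path
#endif

// After: keyed off the build-system flag, which is also defined for host code.
#if defined(USE_ROCM)
  // ROCm-specific path
#endif

// Version checks use ROCM_VERSION rather than CUDA_VERSION or HIP_VERSION;
// the 40300 threshold here is a hypothetical example (ROCm >= 4.3).
#if defined(USE_ROCM) && ROCM_VERSION >= 40300
  // path for newer ROCm releases
#endif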

cc jeffdaily sunway513 jithunnair-amd ROCmSupport amathews-amd

Reviewed By: jbschlosser

Differential Revision: D30909053

Pulled By: ezyang

fbshipit-source-id: 224a966ebf1aaec79beccbbd686fdf3d49267e06
2021-09-29 09:55:43 -07:00


#include "caffe2/operators/minmax_ops.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace {
template <typename T>
__global__ void SelectGradientCUDAKernel(
const int N,
const T* dY,
const T* X,
const T* Y,
T* dX) {
const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
if (i < N) {
#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
dX[i] = __ldg(X + i) == __ldg(Y + i) ? __ldg(dY + i) : T(0);
#else
dX[i] = X[i] == Y[i] ? dY[i] : T(0);
#endif
}
}
} // namespace
template <>
bool SelectGradientOpBase<float, CUDAContext>::RunOnDevice() {
const auto& Y = Input(0);
const auto& dY = Input(1);
const int N = Y.numel();
const int M = math::DivUp(N, CAFFE_CUDA_NUM_THREADS);
const float* dY_data = dY.data<float>();
const float* Y_data = Y.data<float>();
for (int i = 0; i < OutputSize(); i++) {
const auto& Xi = Input(i + 2);
auto* dXi = Output(i, Xi.sizes(), at::dtype<float>());
const float* Xi_data = Xi.data<float>();
float* dXi_data = dXi->mutable_data<float>();
if (N > 0) {
SelectGradientCUDAKernel<float>
<<<M, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
N, dY_data, Xi_data, Y_data, dXi_data);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
return true;
}
REGISTER_CUDA_OPERATOR(Min, MinOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(MinGradient, MinGradientOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(Max, MaxOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(MaxGradient, MaxGradientOp<float, CUDAContext>);
} // namespace caffe2
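
For intuition, here is a minimal host-side sketch of the same select-gradient rule the kernel applies; the helper name select_gradient is illustrative and not part of Caffe2:

#include <cassert>
#include <vector>

// Routes dY to the positions where input X achieved the elementwise
// min/max output Y; every other position receives a zero gradient.
std::vector<float> select_gradient(
    const std::vector<float>& dY,
    const std::vector<float>& X,
    const std::vector<float>& Y) {
  assert(X.size() == Y.size() && Y.size() == dY.size());
  std::vector<float> dX(X.size());
  for (size_t i = 0; i < X.size(); ++i) {
    dX[i] = (X[i] == Y[i]) ? dY[i] : 0.0f;
  }
  return dX;
}

// Example with Y = max(X1, X2) elementwise:
//   X1 = {1, 5}, X2 = {4, 2}  =>  Y = {4, 5}
//   dY = {1, 1}               =>  dX1 = {0, 1}, dX2 = {1, 0}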