mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/65610 - Replace HIP_PLATFORM_HCC with USE_ROCM - Don't rely on CUDA_VERSION or HIP_VERSION and use USE_ROCM and ROCM_VERSION. - In the next PR - Will be removing the mapping from CUDA_VERSION to HIP_VERSION and CUDA to HIP in hipify. - HIP_PLATFORM_HCC is deprecated, so will add HIP_PLATFORM_AMD to support HIP host code compilation on gcc. cc jeffdaily sunway513 jithunnair-amd ROCmSupport amathews-amd Reviewed By: jbschlosser Differential Revision: D30909053 Pulled By: ezyang fbshipit-source-id: 224a966ebf1aaec79beccbbd686fdf3d49267e06
58 lines · 1.6 KiB · Plaintext
#include "caffe2/operators/minmax_ops.h"
|
|
|
|
#include "caffe2/core/context_gpu.h"
|
|
#include "caffe2/utils/math.h"
|
|
|
|
namespace caffe2 {
|
|
|
|
namespace {
|
|
|
|
template <typename T>
|
|
__global__ void SelectGradientCUDAKernel(
|
|
const int N,
|
|
const T* dY,
|
|
const T* X,
|
|
const T* Y,
|
|
T* dX) {
|
|
const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
|
|
if (i < N) {
|
|
#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
|
|
dX[i] = __ldg(X + i) == __ldg(Y + i) ? __ldg(dY + i) : T(0);
|
|
#else
|
|
dX[i] = X[i] == Y[i] ? dY[i] : T(0);
|
|
#endif
|
|
}
|
|
}
|
|
|
|
} // namespace
|
|
|
|
template <>
|
|
bool SelectGradientOpBase<float, CUDAContext>::RunOnDevice() {
|
|
const auto& Y = Input(0);
|
|
const auto& dY = Input(1);
|
|
const int N = Y.numel();
|
|
const int M = math::DivUp(N, CAFFE_CUDA_NUM_THREADS);
|
|
const float* dY_data = dY.data<float>();
|
|
const float* Y_data = Y.data<float>();
|
|
for (int i = 0; i < OutputSize(); i++) {
|
|
const auto& Xi = Input(i + 2);
|
|
auto* dXi = Output(i, Xi.sizes(), at::dtype<float>());
|
|
const float* Xi_data = Xi.data<float>();
|
|
float* dXi_data = dXi->mutable_data<float>();
|
|
if (N > 0) {
|
|
SelectGradientCUDAKernel<float>
|
|
<<<M, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
|
|
N, dY_data, Xi_data, Y_data, dXi_data);
|
|
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
REGISTER_CUDA_OPERATOR(Min, MinOp<float, CUDAContext>);
|
|
REGISTER_CUDA_OPERATOR(MinGradient, MinGradientOp<float, CUDAContext>);
|
|
REGISTER_CUDA_OPERATOR(Max, MaxOp<float, CUDAContext>);
|
|
REGISTER_CUDA_OPERATOR(MaxGradient, MaxGradientOp<float, CUDAContext>);
|
|
|
|
} // namespace caffe2
|