mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Summary: This PR is to update PyTorch with the following cub changes: - Starting cub 1.13.1, cub requires users to define `CUB_NS_QUALIFIER` if `CUB_NS_PREFIX` is also defined. Besides that, a new mechanism `CUB_WRAPPED_NAMESPACE` is added. And I do the following change to PyTorch: - Starting CUDA 11.5, define `CUB_WRAPPED_NAMESPACE` globally as an nvcc flag. - Fix caffe2 failures caused by the above change. - Add a `aten/src/ATen/cuda/cub_definitions.cuh` that defines helper macros about feature availability. Pull Request resolved: https://github.com/pytorch/pytorch/pull/66219 Reviewed By: bdhirsh Differential Revision: D31626931 Pulled By: ngimel fbshipit-source-id: 97ebf5ef671ade8bf46d0860edc317f22660f26d
866 lines
24 KiB
Plaintext
866 lines
24 KiB
Plaintext
#include <cfloat>
|
|
#include <cub/block/block_reduce.cuh>
|
|
|
|
#include "caffe2/core/context_gpu.h"
|
|
#include "caffe2/operators/softmax_op.h"
|
|
#include "caffe2/operators/softmax_with_loss_op.h"
|
|
#include "caffe2/operators/spatial_softmax_with_loss_op.h"
|
|
#include "caffe2/utils/cub_namespace.cuh"
|
|
|
|
namespace caffe2 {
|
|
|
|
namespace {
|
|
|
|
__global__ void LabelCrossEntropyKernel(
|
|
const int N,
|
|
const int D,
|
|
const float* logPdata,
|
|
const int* labeldata,
|
|
const float* weights,
|
|
float* Ydata) {
|
|
CUDA_1D_KERNEL_LOOP(i, N) {
|
|
CUDA_KERNEL_ASSERT(labeldata[i] >= 0 && labeldata[i] < D);
|
|
float weight = weights ? weights[i] : 1.0;
|
|
Ydata[i] = -logPdata[i * D + labeldata[i]] * weight;
|
|
}
|
|
}
|
|
|
|
__global__ void LabelCrossEntropyGradientKernel(
|
|
const int N,
|
|
const int D,
|
|
const float* Pdata,
|
|
const int* labeldata,
|
|
float* dXdata) {
|
|
CUDA_1D_KERNEL_LOOP(i, N) {
|
|
int idx = i * D + labeldata[i];
|
|
dXdata[idx] = Pdata[idx] - 1.;
|
|
}
|
|
}
|
|
|
|
__global__ void LabelCrossEntropyGradientKernelWeighted(
|
|
const int N,
|
|
const int D,
|
|
const float* Pdata,
|
|
const int* labeldata,
|
|
float* dXdata,
|
|
const float* weights) {
|
|
CUDA_1D_KERNEL_LOOP(i, N * D) {
|
|
int row = i / D;
|
|
int d = i % D;
|
|
float val = Pdata[i] - 1.0 * (d == labeldata[row]);
|
|
float weight = weights[row];
|
|
dXdata[i] = val * weight;
|
|
}
|
|
}
|
|
|
|
__global__ void ProbCrossEntropyKernel(
|
|
const int N,
|
|
const int D,
|
|
const float* Pdata,
|
|
const float* labeldata,
|
|
const float* weights,
|
|
float* Ydata) {
|
|
typedef cub::BlockReduce<float, CAFFE_CUDA_NUM_THREADS> BlockReduce;
|
|
__shared__ typename BlockReduce::TempStorage temp_storage;
|
|
|
|
for (int i = blockIdx.x; i < N; i += gridDim.x) {
|
|
float weight = weights ? weights[i] : 1.0;
|
|
float sum = 0.0;
|
|
float total_prob = 0.0;
|
|
for (int j = threadIdx.x; j < D; j += blockDim.x) {
|
|
int idx = i * D + j;
|
|
CUDA_KERNEL_ASSERT(labeldata[idx] >= 0);
|
|
total_prob += labeldata[idx];
|
|
sum += -logf(fmaxf(Pdata[idx], FLT_MIN)) * labeldata[idx] * weight;
|
|
}
|
|
float tot = BlockReduce(temp_storage).Sum(sum);
|
|
__syncthreads();
|
|
float total_prob_sum = BlockReduce(temp_storage).Sum(total_prob);
|
|
if (threadIdx.x == 0) {
|
|
Ydata[i] = tot;
|
|
// Sanity check
|
|
CUDA_KERNEL_ASSERT(fabsf(1.0 - total_prob_sum) < 1e-5f);
|
|
}
|
|
__syncthreads();
|
|
}
|
|
}
|
|
|
|
__global__ void ProbCrossEntropyGradientKernel(
|
|
const int N,
|
|
const int D,
|
|
const float* Pdata,
|
|
const float* labeldata,
|
|
float* dXdata,
|
|
const float* weights) {
|
|
if (weights == NULL) {
|
|
CUDA_1D_KERNEL_LOOP(idx, N * D) {
|
|
dXdata[idx] = Pdata[idx] - labeldata[idx];
|
|
}
|
|
} else {
|
|
CUDA_1D_KERNEL_LOOP(idx, N * D) {
|
|
dXdata[idx] = (Pdata[idx] - labeldata[idx]) * weights[idx / D];
|
|
}
|
|
}
|
|
}
|
|
|
|
__global__ void SpatialSoftmaxKernel(
|
|
const int num,
|
|
const int D,
|
|
const int W,
|
|
const int H,
|
|
const float* Xdata,
|
|
float* Pdata) {
|
|
CUDA_1D_KERNEL_LOOP(index, num * W * H) {
|
|
int x = index % W;
|
|
int y = (index / W) % H;
|
|
int i = index / W / H;
|
|
|
|
// Subtract max on each cell for numerical reasons
|
|
float max_val = -FLT_MAX;
|
|
for (int c = 0; c < D; ++c) {
|
|
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
|
max_val = fmaxf(max_val, Xdata[idx]);
|
|
}
|
|
|
|
// Exponentiate
|
|
float expsum = 0.0f;
|
|
for (int c = 0; c < D; ++c) {
|
|
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
|
float expx = expf(Xdata[idx] - max_val);
|
|
Pdata[idx] = expx;
|
|
expsum += expx;
|
|
}
|
|
|
|
// Normalize
|
|
for (int c = 0; c < D; ++c) {
|
|
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
|
Pdata[idx] /= expsum;
|
|
}
|
|
}
|
|
}
|
|
|
|
#define DONTCARE (-1)
|
|
|
|
__global__ void SpatialCrossEntropyLossKernel(
|
|
const int N,
|
|
const int D,
|
|
const int W,
|
|
const int H,
|
|
const float* Pdata,
|
|
const int* label_data,
|
|
const float* weights,
|
|
float* loss_data,
|
|
float* weight_data) {
|
|
CUDA_1D_KERNEL_LOOP(index, N * W * H) {
|
|
int x = index % W;
|
|
int y = (index / W) % H;
|
|
int i = index / W / H;
|
|
const int label = static_cast<int>(label_data[index]);
|
|
|
|
if (label != DONTCARE) {
|
|
CUDA_KERNEL_ASSERT(label >= 0 && label < D);
|
|
float weight = (weights == NULL ? 1.0 : weights[index]);
|
|
loss_data[index] =
|
|
-logf(
|
|
fmaxf(Pdata[i * W * H * D + label * W * H + y * W + x], 1e-20f)) *
|
|
weight;
|
|
weight_data[index] = weight;
|
|
} else {
|
|
loss_data[index] = 0;
|
|
weight_data[index] = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
__global__ void SpatialSoftmaxLossGradientKernel(
|
|
const int N,
|
|
const int D,
|
|
const int W,
|
|
const int H,
|
|
const int* label_data,
|
|
const float* weights,
|
|
float* dX_data,
|
|
float* weights_) {
|
|
CUDA_1D_KERNEL_LOOP(index, N * W * H) {
|
|
int x = index % W;
|
|
int y = (index / W) % H;
|
|
int i = index / W / H;
|
|
const int label = static_cast<int>(label_data[index]);
|
|
|
|
if (label != DONTCARE) {
|
|
int data_idx = i * (H * W * D) + label * (H * W) + y * W + x;
|
|
dX_data[data_idx] -= 1.0;
|
|
if (weights != NULL) {
|
|
float weight = weights[index];
|
|
for (int c = 0; c < D; ++c) {
|
|
int data_idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
|
dX_data[data_idx] *= weight;
|
|
}
|
|
weights_[index] = weight;
|
|
} else {
|
|
weights_[index] = 1.0;
|
|
}
|
|
} else {
|
|
// Ignore-label, so set all gradients for this positions
|
|
// tp zero
|
|
for (int c = 0; c < D; ++c) {
|
|
int data_idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
|
dX_data[data_idx] = 0.0;
|
|
}
|
|
weights_[index] = 0.0;
|
|
}
|
|
}
|
|
}
|
|
|
|
__global__ void SoftmaxNormalizeLogsKernel(
|
|
const int nthreads,
|
|
const int D,
|
|
const float* logits,
|
|
const float* rowmax,
|
|
const float* scales,
|
|
float* out_log) {
|
|
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
|
int n = index / D;
|
|
out_log[index] =
|
|
logits[index] - rowmax[n] - logf(fmaxf(scales[n], FLT_MIN));
|
|
}
|
|
}
|
|
|
|
__global__ void SoftmaxNormalizeKernel(
|
|
const int nthreads,
|
|
const int D,
|
|
const float* probs,
|
|
const float* scales,
|
|
float* out) {
|
|
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
|
int n = index / D;
|
|
out[index] = probs[index] / scales[n];
|
|
}
|
|
}
|
|
|
|
void Softmax(
|
|
const int N,
|
|
const int D,
|
|
const float* logits,
|
|
const float* sum_multiplier,
|
|
float* scales,
|
|
float* rowmax,
|
|
float* probs,
|
|
bool log_softmax,
|
|
CUDAContext* context) {
|
|
const int size = N * D;
|
|
|
|
math::RowwiseMax<float, CUDAContext>(N, D, logits, rowmax, context);
|
|
// Put the intermediate result X - max(X) into Y
|
|
context->CopySameDevice<float>(size, logits, probs);
|
|
// Subtract the scale
|
|
math::Gemm<float, CUDAContext>(
|
|
CblasNoTrans,
|
|
CblasNoTrans,
|
|
N,
|
|
D,
|
|
1,
|
|
-1,
|
|
rowmax,
|
|
sum_multiplier,
|
|
1,
|
|
probs,
|
|
context);
|
|
// Exponentiation
|
|
math::Exp<float, CUDAContext>(size, probs, probs, context);
|
|
// Sum exponentiated values
|
|
math::Gemv<float, CUDAContext>(
|
|
CblasNoTrans, N, D, 1, probs, sum_multiplier, 0, scales, context);
|
|
// Normalize
|
|
if (!log_softmax) {
|
|
SoftmaxNormalizeKernel<<<
|
|
CAFFE_GET_BLOCKS(size),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(size, D, probs, scales, probs);
|
|
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
|
} else {
|
|
SoftmaxNormalizeLogsKernel<<<
|
|
CAFFE_GET_BLOCKS(size),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(size, D, logits, rowmax, scales, probs);
|
|
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
|
}
|
|
}
|
|
|
|
} // namespace
|
|
|
|
template <>
|
|
bool SoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
|
|
auto& X = Input(0); // Logits
|
|
auto& T = Input(1); // Labels / targets
|
|
|
|
const float* weights = (InputSize() > 2 ? Input(2).data<float>() : NULL);
|
|
const auto canonical_axis = X.canonical_axis_index(axis_);
|
|
int N, D;
|
|
N = X.size_to_dim(canonical_axis); // batch size
|
|
D = X.size_from_dim(canonical_axis);
|
|
|
|
auto* P =
|
|
Output(0, X.sizes(), at::dtype<float>()); // Probabilities from softmax
|
|
ReinitializeTensor(&total_weight_ptr_, {1}, at::dtype<float>().device(CUDA));
|
|
total_weight_ptr_.Resize(1);
|
|
|
|
if (label_prob_mode_) {
|
|
CAFFE_ENFORCE_GE(T.dim(), 2);
|
|
CAFFE_ENFORCE_EQ(T.size_to_dim(canonical_axis), N);
|
|
CAFFE_ENFORCE_EQ(T.size_from_dim(canonical_axis), D);
|
|
} else {
|
|
if (T.dim() == canonical_axis) {
|
|
CAFFE_ENFORCE_EQ(T.numel(), N);
|
|
} else {
|
|
CAFFE_ENFORCE_EQ(T.size_to_dim(canonical_axis), N);
|
|
CAFFE_ENFORCE_EQ(T.size_from_dim(canonical_axis), 1);
|
|
}
|
|
}
|
|
|
|
auto* avg_loss =
|
|
Output(1, vector<int64_t>(), at::dtype<float>()); // Average loss
|
|
if (!losses_.defined()) {
|
|
losses_ = caffe2::empty({N}, at::dtype<float>().device(CUDA));
|
|
} else if (losses_.numel() != N) {
|
|
losses_.Resize(N);
|
|
}
|
|
|
|
if (!rowmax_.defined()) {
|
|
rowmax_ = caffe2::empty({N}, at::dtype<float>().device(CUDA));
|
|
} else if (rowmax_.numel() != N) {
|
|
rowmax_.Resize(N);
|
|
}
|
|
|
|
if (!sum_multiplier_.defined()) {
|
|
sum_multiplier_ = caffe2::empty({D}, at::dtype<float>().device(CUDA));
|
|
math::Set<float, CUDAContext>(
|
|
D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
|
|
} else if (sum_multiplier_.numel() != D) {
|
|
sum_multiplier_.Resize(D);
|
|
math::Set<float, CUDAContext>(
|
|
D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
|
|
}
|
|
|
|
Softmax(
|
|
N,
|
|
D,
|
|
X.data<float>(),
|
|
sum_multiplier_.data<float>(),
|
|
losses_.mutable_data<float>(),
|
|
rowmax_.mutable_data<float>(),
|
|
P->template mutable_data<float>(),
|
|
!label_prob_mode_, // logarithmic output
|
|
&context_);
|
|
// Compute label xent loss per example
|
|
if (!label_prob_mode_) {
|
|
LabelCrossEntropyKernel<<<
|
|
CAFFE_GET_BLOCKS(N),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context_.cuda_stream()>>>(
|
|
N,
|
|
D,
|
|
P->data<float>(),
|
|
T.data<int>(),
|
|
weights,
|
|
losses_.mutable_data<float>());
|
|
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
|
|
|
// Since we had logarithmic output, we need to exponentiate
|
|
// them again.
|
|
math::Exp<float, CUDAContext>(
|
|
N * D, P->data<float>(), P->template mutable_data<float>(), &context_);
|
|
} else {
|
|
ProbCrossEntropyKernel<<<
|
|
std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context_.cuda_stream()>>>(
|
|
N,
|
|
D,
|
|
P->data<float>(),
|
|
T.data<float>(),
|
|
weights,
|
|
losses_.mutable_data<float>());
|
|
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
|
}
|
|
|
|
float total_weight = N;
|
|
if (weights) {
|
|
// Sum weights
|
|
math::Sum<float, CUDAContext>(
|
|
N,
|
|
weights,
|
|
total_weight_ptr_.mutable_data<float>(),
|
|
&context_,
|
|
&scratch_);
|
|
CUDA_CHECK(cudaMemcpyAsync(
|
|
&total_weight,
|
|
total_weight_ptr_.data<float>(),
|
|
sizeof(float),
|
|
cudaMemcpyDeviceToHost,
|
|
context_.cuda_stream()));
|
|
}
|
|
|
|
// Sum of all losses
|
|
float* avg_loss_data = avg_loss->template mutable_data<float>();
|
|
math::Sum<float, CUDAContext>(
|
|
losses_.numel(),
|
|
losses_.data<float>(),
|
|
avg_loss_data,
|
|
&context_,
|
|
&scratch_);
|
|
// Average of input batch size
|
|
if (total_weight > 0) {
|
|
math::Scale<float, float, CUDAContext>(
|
|
1, scale_ / total_weight, avg_loss_data, avg_loss_data, &context_);
|
|
}
|
|
if (OutputSize() > 2) {
|
|
OutputTensorAlias(2, losses_);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
template <>
|
|
bool SpatialSoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
|
|
auto& X = Input(0); // Logits
|
|
auto& T = Input(1); // Labels / targets
|
|
|
|
const float* weights = (InputSize() > 2 ? Input(2).data<float>() : NULL);
|
|
int N, D;
|
|
N = X.dim32(0);
|
|
D = X.dim32(1);
|
|
|
|
auto* P =
|
|
Output(0, X.sizes(), at::dtype<float>()); // Probabilities from softmax
|
|
ReinitializeTensor(&total_weight_ptr_, {1}, at::dtype<float>().device(CUDA));
|
|
|
|
CAFFE_ENFORCE_EQ(X.dim(), 4);
|
|
CAFFE_ENFORCE_EQ(T.dim(), 3);
|
|
CAFFE_ENFORCE_EQ(T.dim32(0), N);
|
|
|
|
int H = X.dim32(2);
|
|
int W = X.dim32(3);
|
|
if (!losses_.defined()) {
|
|
losses_ = caffe2::empty({N * W * H}, at::dtype<float>().device(CUDA));
|
|
} else if (losses_.numel() != N * W * H) {
|
|
losses_.Resize(N * W * H);
|
|
}
|
|
|
|
if (!weights_.defined()) {
|
|
weights_ = caffe2::empty({N * W * H}, at::dtype<float>().device(CUDA));
|
|
} else if (weights_.numel() != N * W * H) {
|
|
weights_.Resize(N * W * H);
|
|
}
|
|
|
|
const float* Xdata = X.data<float>();
|
|
float* Pdata = P->template mutable_data<float>();
|
|
|
|
// Softmax for each x,y location
|
|
SpatialSoftmaxKernel<<<
|
|
CAFFE_GET_BLOCKS(N),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context_.cuda_stream()>>>(N, D, W, H, Xdata, Pdata);
|
|
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
|
|
|
// Cross entropy
|
|
auto* avg_loss =
|
|
Output(1, vector<int64_t>(), at::dtype<float>()); // Average loss
|
|
float* avg_loss_data = avg_loss->template mutable_data<float>();
|
|
math::Set<float, CUDAContext>(1, 0.0f, avg_loss_data, &context_);
|
|
|
|
const int* label_data = T.data<int>();
|
|
math::Set<float, CUDAContext>(
|
|
1, 0.0f, total_weight_ptr_.mutable_data<float>(), &context_);
|
|
|
|
SpatialCrossEntropyLossKernel<<<
|
|
CAFFE_GET_BLOCKS(N * W * H),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context_.cuda_stream()>>>(
|
|
N,
|
|
D,
|
|
W,
|
|
H,
|
|
P->data<float>(),
|
|
label_data,
|
|
weights,
|
|
losses_.mutable_data<float>(),
|
|
weights_.mutable_data<float>());
|
|
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
|
|
|
// Somewhat awkward scalar passing from device to host
|
|
float h_total_weight;
|
|
math::Sum<float, CUDAContext>(
|
|
weights_.numel(),
|
|
weights_.data<float>(),
|
|
total_weight_ptr_.mutable_data<float>(),
|
|
&context_,
|
|
&scratch_);
|
|
CUDA_CHECK(cudaMemcpyAsync(
|
|
&h_total_weight,
|
|
total_weight_ptr_.data<float>(),
|
|
sizeof(float),
|
|
cudaMemcpyDeviceToHost,
|
|
context_.cuda_stream()));
|
|
|
|
math::Sum<float, CUDAContext>(
|
|
losses_.numel(),
|
|
losses_.data<float>(),
|
|
avg_loss_data,
|
|
&context_,
|
|
&scratch_);
|
|
|
|
// Final scaling
|
|
if (h_total_weight > 0) {
|
|
math::Scale<float, float, CUDAContext>(
|
|
1, scale_ / h_total_weight, avg_loss_data, avg_loss_data, &context_);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
template <>
|
|
bool SoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
|
|
auto& X = Input(0); // Logits
|
|
auto& T = Input(1); // Labels / targets
|
|
// Input(2) is weights, if given
|
|
auto& P = Input(InputSize() - 2); // Probabilities from softmax
|
|
auto& d_avg_loss = Input(InputSize() - 1); // Gradient w.r.t. avg loss
|
|
const float* weights = (InputSize() > 4 ? Input(2).data<float>() : NULL);
|
|
|
|
Tensor* dX;
|
|
if (only_loss_) {
|
|
// Memory saving trick to share the buffer with the softmax output.
|
|
// Softmax output is thus overwritten.
|
|
dX = OutputTensorAlias(0, P);
|
|
dX->ResizeLike(X);
|
|
} else {
|
|
dX = Output(0, X.sizes(), at::dtype<float>());
|
|
}
|
|
|
|
const auto canonical_axis = X.canonical_axis_index(axis_);
|
|
int N, D;
|
|
N = X.size_to_dim(canonical_axis); // batch size
|
|
D = X.size_from_dim(canonical_axis);
|
|
|
|
ReinitializeTensor(&total_weight_ptr_, {1}, at::dtype<float>().device(CUDA));
|
|
|
|
if (label_prob_mode_) {
|
|
CAFFE_ENFORCE_GE(T.dim(), 2);
|
|
CAFFE_ENFORCE_EQ(T.size_to_dim(canonical_axis), N);
|
|
CAFFE_ENFORCE_EQ(T.size_from_dim(canonical_axis), D);
|
|
} else {
|
|
if (T.dim() == canonical_axis) {
|
|
CAFFE_ENFORCE_EQ(T.numel(), N);
|
|
} else {
|
|
CAFFE_ENFORCE_EQ(T.size_to_dim(canonical_axis), N);
|
|
CAFFE_ENFORCE_EQ(T.size_from_dim(canonical_axis), 1);
|
|
}
|
|
}
|
|
|
|
// Subtract 1 from labeled positions
|
|
if (!label_prob_mode_) {
|
|
if (weights == nullptr) {
|
|
// Copy softmax probabilities into dX
|
|
if (!only_loss_) {
|
|
context_.CopySameDevice<float>(
|
|
P.numel(), P.data<float>(), dX->template mutable_data<float>());
|
|
}
|
|
LabelCrossEntropyGradientKernel<<<
|
|
CAFFE_GET_BLOCKS(N),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context_.cuda_stream()>>>(
|
|
N,
|
|
D,
|
|
P.data<float>(),
|
|
T.data<int>(),
|
|
dX->template mutable_data<float>());
|
|
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
|
} else {
|
|
// Weighted version gets the Pdata values internally
|
|
LabelCrossEntropyGradientKernelWeighted<<<
|
|
CAFFE_GET_BLOCKS(N * D),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context_.cuda_stream()>>>(
|
|
N,
|
|
D,
|
|
P.data<float>(),
|
|
T.data<int>(),
|
|
dX->template mutable_data<float>(),
|
|
weights);
|
|
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
|
}
|
|
} else {
|
|
ProbCrossEntropyGradientKernel<<<
|
|
CAFFE_GET_BLOCKS(N * D),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context_.cuda_stream()>>>(
|
|
N,
|
|
D,
|
|
P.data<float>(),
|
|
T.data<float>(),
|
|
dX->template mutable_data<float>(),
|
|
weights);
|
|
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
|
}
|
|
float total_weight = N;
|
|
if (weights) {
|
|
// Sum weights
|
|
math::Sum<float, CUDAContext>(
|
|
N,
|
|
weights,
|
|
total_weight_ptr_.mutable_data<float>(),
|
|
&context_,
|
|
&scratch_);
|
|
CUDA_CHECK(cudaMemcpyAsync(
|
|
&total_weight,
|
|
total_weight_ptr_.data<float>(),
|
|
sizeof(float),
|
|
cudaMemcpyDeviceToHost,
|
|
context_.cuda_stream()));
|
|
}
|
|
|
|
// Scale by d_avg_loss / N
|
|
if (total_weight > 0) {
|
|
math::Scale<float, float, CUDAContext>(
|
|
dX->numel(),
|
|
scale_ / total_weight,
|
|
dX->data<float>(),
|
|
dX->template mutable_data<float>(),
|
|
&context_);
|
|
}
|
|
math::Scale<float, float, CUDAContext>(
|
|
dX->numel(),
|
|
d_avg_loss.data<float>(),
|
|
dX->data<float>(),
|
|
dX->template mutable_data<float>(),
|
|
&context_);
|
|
|
|
return true;
|
|
}
|
|
|
|
template <>
|
|
bool SpatialSoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
|
|
auto& X = Input(0); // Logits
|
|
auto& T = Input(1); // Labels / targets
|
|
// Input(2) is weights, if given
|
|
auto& P = Input(InputSize() - 2); // Probabilities from softmax
|
|
auto& d_avg_loss = Input(InputSize() - 1); // Gradient w.r.t. avg loss
|
|
const float* weights = (InputSize() > 4 ? Input(2).data<float>() : NULL);
|
|
|
|
Tensor* dX;
|
|
if (only_loss_) {
|
|
// Memory saving trick to share the buffer with the softmax output.
|
|
// Softmax output is thus overwritten.
|
|
dX = OutputTensorAlias(0, P);
|
|
dX->ResizeLike(X);
|
|
} else {
|
|
dX = Output(0, X.sizes(), at::dtype<float>());
|
|
}
|
|
|
|
const auto canonical_axis = X.canonical_axis_index(1);
|
|
int N, D;
|
|
N = X.dim32(0);
|
|
D = X.dim32(1);
|
|
|
|
ReinitializeTensor(&total_weight_ptr_, {1}, at::dtype<float>().device(CUDA));
|
|
// Spatial mode, compute softmax for each x, y location
|
|
CAFFE_ENFORCE_EQ(X.dim(), 4);
|
|
CAFFE_ENFORCE_EQ(T.dim(), 3);
|
|
|
|
int H = X.dim32(2);
|
|
int W = X.dim32(3);
|
|
dX->ResizeLike(X);
|
|
if (!weights_.defined()) {
|
|
weights_ = caffe2::empty({N * W * H}, at::dtype<float>().device(CUDA));
|
|
} else if (weights_.numel() != N * W * H) {
|
|
weights_.Resize(N * W * H);
|
|
}
|
|
|
|
const float* Pdata = P.data<float>();
|
|
float* dX_data = dX->template mutable_data<float>();
|
|
const int* label_data = T.data<int>();
|
|
const float* d_avg_loss_data = d_avg_loss.data<float>();
|
|
|
|
// Copy softmax probabilities into dX. All but the neuron
|
|
// corresponding to the correct label has gradient equaling e(x_j)
|
|
// which is the probability under softmax.
|
|
context_.CopySameDevice<float>(P.numel(), Pdata, dX_data);
|
|
|
|
math::Set<float, CUDAContext>(
|
|
1, 0.0f, total_weight_ptr_.mutable_data<float>(), &context_);
|
|
|
|
SpatialSoftmaxLossGradientKernel<<<
|
|
CAFFE_GET_BLOCKS(N * W * H),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context_.cuda_stream()>>>(
|
|
N, D, W, H, label_data, weights, dX_data, weights_.mutable_data<float>());
|
|
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
|
|
|
math::Sum<float, CUDAContext>(
|
|
weights_.numel(),
|
|
weights_.data<float>(),
|
|
total_weight_ptr_.mutable_data<float>(),
|
|
&context_,
|
|
&scratch_);
|
|
|
|
// Somewhat awkward scalar passing from device to host
|
|
float h_total_weight;
|
|
CUDA_CHECK(cudaMemcpyAsync(
|
|
&h_total_weight,
|
|
total_weight_ptr_.data<float>(),
|
|
sizeof(float),
|
|
cudaMemcpyDeviceToHost,
|
|
context_.cuda_stream()));
|
|
|
|
// Final scaling
|
|
if (h_total_weight > 0) {
|
|
math::Scale<float, float, CUDAContext>(
|
|
dX->numel(),
|
|
scale_ / h_total_weight,
|
|
dX->data<float>(),
|
|
dX->template mutable_data<float>(),
|
|
&context_);
|
|
}
|
|
math::Scale<float, float, CUDAContext>(
|
|
dX->numel(),
|
|
d_avg_loss.data<float>(),
|
|
dX->data<float>(),
|
|
dX->template mutable_data<float>(),
|
|
&context_);
|
|
|
|
return true;
|
|
}
|
|
|
|
// Implementation for the CUDA context.
|
|
template <>
|
|
bool SoftmaxOp<float, CUDAContext>::RunOnDevice() {
|
|
auto& X = Input(0);
|
|
|
|
const auto canonical_axis = X.canonical_axis_index(axis_);
|
|
const int N = X.size_to_dim(canonical_axis);
|
|
const int D = X.size_from_dim(canonical_axis);
|
|
auto* P = Output(0, X.sizes(), at::dtype<float>());
|
|
auto* P_data = P->mutable_data<float>();
|
|
if (N == 0 || D == 0) {
|
|
return true;
|
|
}
|
|
if (!sum_multiplier_.defined()) {
|
|
sum_multiplier_ = caffe2::empty({D}, at::dtype<float>().device(CUDA));
|
|
math::Set<float, CUDAContext>(
|
|
D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
|
|
} else if (sum_multiplier_.numel() != D) {
|
|
sum_multiplier_.Resize(D);
|
|
math::Set<float, CUDAContext>(
|
|
D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
|
|
}
|
|
if (!scale_.defined()) {
|
|
scale_ = caffe2::empty({N}, at::dtype<float>().device(CUDA));
|
|
} else if (scale_.numel() != N) {
|
|
scale_.Resize(N);
|
|
}
|
|
|
|
if (!rowmax_.defined()) {
|
|
rowmax_ = caffe2::empty({N}, at::dtype<float>().device(CUDA));
|
|
} else if (rowmax_.numel() != N) {
|
|
rowmax_.Resize(N);
|
|
}
|
|
|
|
Softmax(
|
|
N,
|
|
D,
|
|
X.data<float>(),
|
|
sum_multiplier_.data<float>(),
|
|
scale_.mutable_data<float>(),
|
|
rowmax_.mutable_data<float>(),
|
|
P_data,
|
|
false,
|
|
&context_);
|
|
return true;
|
|
}
|
|
#define SOFTMAX_NUM_THREADS 128
|
|
|
|
// The softmax gradient kernel. This kernel has to be called with the number of
|
|
// threads per block being no more than SOFTMAX_NUM_THREADS.
|
|
namespace {
|
|
__global__ void softmax_gradient_kernel(
|
|
const int dim,
|
|
const float* Y,
|
|
const float* dY,
|
|
float* dX) {
|
|
Y += blockIdx.x * dim;
|
|
dY += blockIdx.x * dim;
|
|
dX += blockIdx.x * dim;
|
|
const int idx = threadIdx.x;
|
|
__shared__ float reduction_buffer[SOFTMAX_NUM_THREADS];
|
|
float tmp;
|
|
|
|
// A two-level reduction to compute the inner products.
|
|
tmp = 0;
|
|
for (int i = idx; i < dim; i += blockDim.x) {
|
|
tmp += dY[i] * Y[i];
|
|
}
|
|
reduction_buffer[idx] = tmp;
|
|
__syncthreads();
|
|
if (idx == 0) {
|
|
tmp = reduction_buffer[0];
|
|
for (int i = 1; i < blockDim.x; ++i)
|
|
tmp += reduction_buffer[i];
|
|
reduction_buffer[0] = tmp;
|
|
}
|
|
__syncthreads();
|
|
// Compute gradient.
|
|
tmp = reduction_buffer[0];
|
|
for (int i = idx; i < dim; i += blockDim.x) {
|
|
dX[i] = Y[i] * (dY[i] - tmp);
|
|
}
|
|
}
|
|
} // namespace
|
|
|
|
template <>
|
|
bool SoftmaxGradientOp<float, CUDAContext>::RunOnDevice() {
|
|
auto& Y = Input(0);
|
|
auto& dY = Input(1);
|
|
|
|
const auto canonical_axis = Y.canonical_axis_index(axis_);
|
|
const int N = Y.size_to_dim(canonical_axis);
|
|
const int D = Y.size_from_dim(canonical_axis);
|
|
auto* dX = Output(0, Y.sizes(), at::dtype<float>());
|
|
auto* dX_data = dX->mutable_data<float>();
|
|
if (N == 0 || D == 0) {
|
|
return true;
|
|
}
|
|
softmax_gradient_kernel<<<
|
|
N,
|
|
SOFTMAX_NUM_THREADS,
|
|
0,
|
|
context_.cuda_stream()>>>(D, Y.data<float>(), dY.data<float>(), dX_data);
|
|
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
|
|
|
return true;
|
|
}
|
|
|
|
REGISTER_CUDA_OPERATOR(SoftmaxWithLoss, SoftmaxWithLossOp<float, CUDAContext>);
|
|
REGISTER_CUDA_OPERATOR(
|
|
SoftmaxWithLossGradient,
|
|
SoftmaxWithLossGradientOp<float, CUDAContext>);
|
|
REGISTER_CUDA_OPERATOR(
|
|
SpatialSoftmaxWithLoss,
|
|
SpatialSoftmaxWithLossOp<float, CUDAContext>);
|
|
REGISTER_CUDA_OPERATOR(
|
|
SpatialSoftmaxWithLossGradient,
|
|
SpatialSoftmaxWithLossGradientOp<float, CUDAContext>);
|
|
REGISTER_CUDA_OPERATOR(Softmax, SoftmaxOp<float, CUDAContext>);
|
|
REGISTER_CUDA_OPERATOR(SoftmaxGradient, SoftmaxGradientOp<float, CUDAContext>);
|
|
|
|
} // namespace caffe2
|