[caffe2] fix no matching function min/max Clang errors (#33563)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/33563

When NVCC or Clang drives CUDA compilation, many math functions are declared by default, with a small difference: Clang marks them `__device__` only, while NVCC marks them both `__host__` and `__device__`. As a result, every unqualified `min` or `max` call from a `__host__` function fails with a "no matching function" error when Clang is used.

Fix the errors by using `std::min` and `std::max` from `<algorithm>`; since C++14 they are `constexpr`, so they can also be used in `__device__` code [1].

1. https://llvm.org/docs/CompileCudaWithLLVM.html#algorithm

Test Plan:
```lang=bash
buck build mode/opt -c fbcode.cuda_use_clang=true //fblearner/flow/projects/dper:workflow
buck build mode/opt //fblearner/flow/projects/dper:workflow
```

Execute tests on devgpu:
```
buck test mode/dev-nosan -j 8 //caffe2/caffe2/python/operator_test/... //caffe2/test:cuda
```

Reviewed By: ngimel

Differential Revision: D20005795

fbshipit-source-id: 98a3f35e8a96c15d3ad3d2066396591f5cca1696
Committed by: Facebook Github Bot
Parent: c6d301220a
Commit: 5dde8cd483
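The failure mode and the fix, as a minimal standalone sketch (the file name, kernel, and values here are illustrative, not taken from this patch; note that NVCC additionally needs `--expt-relaxed-constexpr` to call `constexpr` host functions from device code):

```lang=cpp
// repro.cu -- hypothetical illustration of the error this commit fixes.
//   clang++ -std=c++14 -x cuda repro.cu  ->  "no matching function" on the
//                                            unqualified min() in host code
//   nvcc -std=c++14 repro.cu             ->  builds, since NVCC declares the
//                                            overloads __host__ __device__
#include <algorithm>

__global__ void clamp_kernel(int* data, int n, int hi) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // std::min is constexpr since C++14, so Clang accepts it in
    // __device__ code as well (see the LLVM doc linked above).
    data[i] = std::min(data[i], hi);
  }
}

void launch(int* data, int n) {
  const int block = 256;
  // With Clang, the CUDA headers declare min(int, int) as __device__ only,
  // so an unqualified call here has no viable overload:
  //   int grid = min(64, (n + block - 1) / block);   // error with Clang
  int grid = std::min(64, (n + block - 1) / block);   // portable
  clamp_kernel<<<grid, block>>>(data, n, 1000);
}
```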
@@ -2,6 +2,8 @@
 #define THC_GENERIC_FILE "THC/generic/THCTensorMath.cu"
 #else
 
+#include <algorithm>
+
 #include "ATen/cuda/CUDAContext.h"
 #include <ATen/MemoryOverlap.h>
 
@@ -149,12 +151,16 @@ void THCTensor_(diag)(THCState *state, THCTensor *self_, THCTensor *src_, int64_
     int64_t stride1 = THCTensor_(stride)(state, src_, 1);
     int64_t size0 = THCTensor_(size)(state, src_, 0);
     int64_t size1 = THCTensor_(size)(state, src_, 1);
-    int64_t size = (k > 0) ? min((int64_t)size0, (int64_t)size1 - k) : min((int64_t)size0 + k, (int64_t)size1);
+    int64_t size = (k > 0) ? std::min((int64_t)size0, (int64_t)size1 - k)
+                           : std::min((int64_t)size0 + k, (int64_t)size1);
     THCTensor_(resize1d)(state, self_, size);
     if (size > 0) {
       int64_t strideSelf = THCTensor_(stride)(state, self_, 0);
-      const dim3 threads(min((int64_t)at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, (int64_t)size));
-      dim3 grid(min((int64_t)1024, (int64_t)THCCeilDiv(size, (int64_t)threads.x)));
+      const dim3 threads(std::min(
+          (int64_t)at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock,
+          (int64_t)size));
+      dim3 grid(std::min(
+          (int64_t)1024, (int64_t)THCCeilDiv(size, (int64_t)threads.x)));
       int64_t start = (k >= 0 ? k * stride1 : -k * stride0);
       THCTensor_copyFromDiagonal<scalar_t><<<grid, threads, 0, c10::cuda::getCurrentCUDAStream()>>>
         (THCTensor_(data)(state, self_), THCTensor_(data)(state, src_), start, size, stride0 + stride1, strideSelf);
@@ -168,8 +174,11 @@ void THCTensor_(diag)(THCState *state, THCTensor *self_, THCTensor *src_, int64_
     if (size > 0) {
       int64_t stride0 = THCTensor_(stride)(state, self_, 0);
       int64_t stride1 = THCTensor_(stride)(state, self_, 1);
-      const dim3 threads(min((int64_t)at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, (int64_t)size));
-      dim3 grid(min((int64_t)1024, (int64_t)THCCeilDiv(size, (ptrdiff_t)threads.x)));
+      const dim3 threads(std::min(
+          (int64_t)at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock,
+          (int64_t)size));
+      dim3 grid(std::min(
+          (int64_t)1024, (int64_t)THCCeilDiv(size, (ptrdiff_t)threads.x)));
       ptrdiff_t start = (k >= 0 ? k * stride1 : -k * stride0);
       THCTensor_copyToDiagonal<scalar_t><<<grid, threads, 0, c10::cuda::getCurrentCUDAStream()>>>
         (THCTensor_(data)(state, self_), THCTensor_(data)(state, src_), start, totalElements, stride0 + stride1, strideSrc);
@@ -1,3 +1,5 @@
+#include <algorithm>
+
 #ifndef THC_GENERIC_FILE
 #define THC_GENERIC_FILE "THC/generic/THCTensorMathScan.cu"
 #else
@@ -41,9 +43,11 @@ __host__ void THCTensor_(scanOuterDim)(THCState *state, THCTensor *tgt,
     num_irows *= THCTensor_(sizeLegacyNoScalars)(state, src, dim);
   }
 
-  dim3 threads(min(512, num_irows));
+  dim3 threads(std::min(512u, num_irows));
   unsigned maxGridDim = 1024;
-  dim3 grid(min(maxGridDim, num_orows), min(maxGridDim, THCCeilDiv(num_irows, threads.x)));
+  dim3 grid(
+      std::min(maxGridDim, num_orows),
+      std::min(maxGridDim, THCCeilDiv(num_irows, threads.x)));
 
   THCTensor_kernel_scanOuterDim<scalar_t><<<grid, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
     THCTensor_(data)(state, tgt), THCTensor_(data)(state, src),
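One detail worth noting in the `scanOuterDim` hunk above: the literal changed from `512` to `512u`. An unqualified `min(512, num_irows)` could resolve through CUDA's non-template overloads, but `std::min` is a template whose two parameters must deduce to the same type, so the literal has to match the `unsigned` `num_irows`. A minimal sketch of the distinction (hypothetical helper, not from the patch):

```lang=cpp
#include <algorithm>

unsigned pick_threads(unsigned num_irows) {
  // std::min(512, num_irows) fails to compile: the template parameter T
  // would deduce to both int and unsigned int. Matching the literal's type
  // (or naming T explicitly) resolves the ambiguity.
  return std::min(512u, num_irows);
  // Equivalent alternative:
  //   return std::min<unsigned>(512, num_irows);
}
```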
@@ -66,7 +70,7 @@ __host__ void THCTensor_(scanInnermostDim)(THCState *state, THCTensor *tgt,
   unsigned row_size = THCTensor_(sizeLegacyNoScalars)(state, src, ndim - 1);
 
   dim3 threads(16, 32);
-  dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y)));
+  dim3 grid(std::min(1024u, THCCeilDiv(num_rows, threads.y)));
 
   THCTensor_kernel_scanInnermostDim<scalar_t, 16, 32><<<grid, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
     THCTensor_(data)(state, tgt), THCTensor_(data)(state, src), num_rows, row_size, init, binary_op);
@@ -1,3 +1,6 @@
+#include <algorithm>
+#include <utility>
+
 #include <THCUNN/THCUNN.h>
 #include <TH/THHalf.h>
 #include <THC/THCNumerics.cuh>
@@ -7,12 +10,12 @@
 #include <curand.h>
 #include <curand_kernel.h>
 #include <curand_philox4x32_x.h>
-#include <utility>
 
 // copied from cutorch/lib/THC/THCTensorRandom.cu
 #define MAX_NUM_BLOCKS 64
 #define BLOCK_SIZE 256
-#define NUM_BLOCKS(n) min((int)THCCeilDiv(n, (ptrdiff_t) BLOCK_SIZE), MAX_NUM_BLOCKS)
+#define NUM_BLOCKS(n) \
+  (std::min((int)THCCeilDiv(n, (ptrdiff_t)BLOCK_SIZE), MAX_NUM_BLOCKS))
 
 template<typename T>
 inline T __device__ curand_uniform_type(curandStatePhilox4_32_10_t *state);
@@ -455,7 +455,7 @@ void TrackMemoryAlloc(size_t nbytes) {
     int this_gpu = CaffeCudaGetDevice();
     g_total_by_gpu_map[this_gpu] += nbytes;
     g_max_by_gpu_map[this_gpu] =
-        max(g_max_by_gpu_map[this_gpu], g_total_by_gpu_map[this_gpu]);
+        std::max(g_max_by_gpu_map[this_gpu], g_total_by_gpu_map[this_gpu]);
    g_total_mem += nbytes;
    if (g_total_mem - g_last_rep >
        FLAGS_caffe2_gpu_memory_report_interval_mb * 1024 * 1024) {
@@ -1,3 +1,5 @@
+#include <algorithm>
+
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/operators/boolean_mask_ops.h"
 
@@ -91,7 +93,7 @@ class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {
 
   if (numOfOutput > 0) {
     BooleanMaskCopyKernel<<<
-        min(numOfOutput, static_cast<int64_t>(CAFFE_MAXIMUM_NUM_BLOCKS)),
+        std::min(numOfOutput, static_cast<int64_t>(CAFFE_MAXIMUM_NUM_BLOCKS)),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
@@ -1,3 +1,5 @@
+#include <algorithm>
+
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/operators/boolean_unmask_ops.h"
 
@@ -87,7 +89,7 @@ class BooleanUnmaskOp<CUDAContext> final : public Operator<CUDAContext> {
   auto* indicesData = indices_.mutable_data<int>();
 
   ComputeIndicesKernel<<<
-      min(maskSize, CAFFE_MAXIMUM_NUM_BLOCKS),
+      std::min(maskSize, CAFFE_MAXIMUM_NUM_BLOCKS),
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
@@ -95,7 +97,7 @@ class BooleanUnmaskOp<CUDAContext> final : public Operator<CUDAContext> {
 
   auto* valueSizesData = valueSizes_.mutable_data<int>();
   FillValuesKernel<<<
-      min(numMasks, CAFFE_MAXIMUM_NUM_BLOCKS),
+      std::min(numMasks, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
@@ -1,3 +1,5 @@
+#include <algorithm>
+
 #include <cub/block/block_reduce.cuh>
 
 #include "caffe2/core/context_gpu.h"
@@ -89,7 +91,7 @@ void NormalizeOp<float, CUDAContext>::DoNormalize(
     const int n,
     const int sf) {
   NormalizeKernel<<<
-      min(n, CAFFE_MAXIMUM_NUM_BLOCKS),
+      std::min(n, CAFFE_MAXIMUM_NUM_BLOCKS),
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(m, n, sf, xData, yData, kEps_);
@@ -108,7 +110,7 @@ bool NormalizeGradientOp<float, CUDAContext>::RunOnDevice() {
   int M = X.numel() / N;
   const int SF = X.size_from_dim(canonical_axis + 1);
   NormalizeGradientKernel<<<
-      min(M, CAFFE_MAXIMUM_NUM_BLOCKS),
+      std::min(M, CAFFE_MAXIMUM_NUM_BLOCKS),
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(
@@ -165,7 +167,7 @@ void NormalizeL1Op<float, CUDAContext>::DoNormalize(
     const int n,
     const int sf) {
   NormalizeL1Kernel<<<
-      min(n, CAFFE_MAXIMUM_NUM_BLOCKS),
+      std::min(n, CAFFE_MAXIMUM_NUM_BLOCKS),
       CAFFE_CUDA_NUM_THREADS,
       0,
       context_.cuda_stream()>>>(m, n, sf, xData, yData);
@@ -1,3 +1,5 @@
+#include <algorithm>
+
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/operators/scale_blobs_op.h"
 
@@ -47,7 +49,7 @@ bool ScaleBlobsOp<CUDAContext>::DoRunWithType() {
   for (int i = 0; i < numBlobs; ++i) {
     hostBlobSizesData[i] = Input(i).numel();
     totalSize += hostBlobSizesData[i];
-    maxSize = max(maxSize, hostBlobSizesData[i]);
+    maxSize = std::max(maxSize, hostBlobSizesData[i]);
     hostInputsData[i] = Input(i).template data<T>();
     hostOutputsData[i] = Output(i)->template mutable_data<T>();
   }
@@ -1,3 +1,5 @@
+#include <algorithm>
+
 #include <cub/block/block_reduce.cuh>
 #include <cub/device/device_reduce.cuh>
 #include <cub/device/device_scan.cuh>
@@ -1177,7 +1179,7 @@ class SortedSegmentRangeMeanOp : public Operator<Context> {
         K,
         context_.cuda_stream());
     sorted_segment_mean_kernel<T, SIndex, LOGEXP>
-        <<<min(K, CAFFE_MAXIMUM_NUM_BLOCKS),
+        <<<std::min(K, CAFFE_MAXIMUM_NUM_BLOCKS),
            CAFFE_CUDA_NUM_THREADS,
            0,
            context_.cuda_stream()>>>(
@@ -1,3 +1,5 @@
+#include <algorithm>
+
 #include <cub/cub.cuh>
 
 #include "caffe2/core/context_gpu.h"
@@ -350,7 +352,7 @@ void GatherPaddingOp<CUDAContext>::GatherPadding(
       &lengths_prefix_sum_,
      &context_);
   gather_padding_kernel<T>
-      <<<min(block_size, CAFFE_MAXIMUM_NUM_BLOCKS),
+      <<<std::min(block_size, CAFFE_MAXIMUM_NUM_BLOCKS),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
@@ -1,3 +1,5 @@
+#include <algorithm>
+
 #include <cub/block/block_reduce.cuh>
 #include "caffe2/core/common_gpu.h"
 #include "caffe2/core/context_gpu.h"
@@ -203,7 +205,7 @@ bool RowWiseSparseAdagradOp<float, CUDAContext>::DoRunWithType() {
 
   // each thread block will handle multiple rows of the input and output
   RowWiseSparseAdagradKernel<<<
-      min(GRAD_M, CAFFE_MAXIMUM_NUM_BLOCKS),
+      std::min(GRAD_M, CAFFE_MAXIMUM_NUM_BLOCKS),
       num_threads,
       0,
       context_.cuda_stream()>>>(