number of blocks now makes more sense.

2025-10-20 21:14:14 +08:00 · 2015-12-15 10:46:50 -08:00
parent 3b0cc79465
commit f714ad0a70
1 changed files with 28 additions and 7 deletions
--- a/caffe2/core/common_gpu.h
+++ b/caffe2/core/common_gpu.h
@ -31,6 +31,7 @@ bool HasCudaGPU();
 * called, GPU 0 will be the default GPU id.
 */
 void SetDefaultGPUID(const int deviceid);
+
 /**
 * Gets the default GPU id for Caffe2.
 */
@ -85,15 +86,35 @@ const char* curandGetErrorString(curandStatus_t error);
       i < (n);                                                                \
       i += blockDim.x * gridDim.x)

-// The number of threads Caffe uses for cuda. This is kind of legacy:
-// hard-coding the number of threads might not be an optimal case, and might
-// even fail for specific hardware platforms.
-// TODO(Yangqing): Yuck. Figure out a better way?
-const int CAFFE_CUDA_NUM_THREADS = 512;
+// The following helper functions are here so that you can write a kernel call
+// when you are not particularly interested in maxing out the kernels'
+// performance. Usually, this will give you a reasonable speed, but if you
+// really want to find the best performance, it is advised that you tune the
+// size of the blocks and grids more reasonably.
+// A legacy note: this is derived from the old good Caffe days, when I simply
+// hard-coded the number of threads and wanted to keep backward compability for
+// different computation capabilities.
+// For more info on CUDA compute capabilities, visit the NVidia website at:
+//    http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities

-// Compute the number of blocks needed to run N threads.
+// The number of cuda threads to use. 512 is used for backward compatibility,
+// and it is observed that setting it to 1024 usually does not bring much
+// performance gain (which makes sense, because warp size being 32 means that
+// blindly setting a huge block for a random kernel isn't optimal).
+constexpr int CAFFE_CUDA_NUM_THREADS = 512;
+// The maximum number of blocks to use in the default kernel call. We set it to
+// 32768 which would work for compute capability 2.x (where 65536 is the limit).
+// This number is very carelessly chosen. Ideally, one would like to look at
+// the hardware at runtime, and pick the number of blocks that makes most
+// sense for the specific runtime environment. This is a todo item.
+constexpr int CAFFE_MAXIMUM_NUM_BLOCKS = 32768;
+
+/**
+ * @brief Compute the number of blocks needed to run N threads.
+ */
 inline int CAFFE_GET_BLOCKS(const int N) {
-  return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
+  return std::min((N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS,
+                  CAFFE_MAXIMUM_NUM_BLOCKS);
 }

 }  // namespace caffe2