[ROCm] Improve performance of reductions on 1D and 2D tensors. (#137737)

This patch improves the performance of individual reductions on MI300X. These improvements are measured on individual sum reduction operations of varying sizes. The patch impacts the following tensor types:
- 1D tensors
- 2D tensors when reducing along dimension 0
- 2D tensors when reducing along dimension 1

Runtime is reduced by between 0% and 75%, depending on tensor shape.

The patch uses the maximum number of threads per CU and the number of CUs itself to control the number of threadblocks in various situations (i.e. for various reduction types and tensor dimensions).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/137737
Approved by: https://github.com/eqy, https://github.com/jeffdaily, https://github.com/pruthvistony, https://github.com/xw285cornell
This commit is contained in:
Doru Bercea
2024-10-24 03:41:16 +00:00
committed by PyTorch MergeBot
parent d8f22a1141
commit e5c3d7ab77

View File

@@ -1092,11 +1092,7 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
}
constexpr int min_values_per_thread = 16;
#ifndef USE_ROCM
constexpr int max_values_per_thread = 256;
#else
constexpr int max_values_per_thread = 1024;
#endif
if (config.values_per_thread() >= block_height * 16 || config.values_per_thread() >= max_values_per_thread) {
// Divide the input across warps in a thread-block, if that leaves at least
@@ -1108,7 +1104,18 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
config.output_mult[1] = config.split_output(block_height);
}
const int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / config.num_threads;
int max_threads_per_mp =
at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor;
#ifdef USE_ROCM
// Control the number of threadblocks by adjusting the maximum number of
// threads per multi-processor. These numbers better reflect the maximum
// theoretical achievable threads per MP for the reduction operation.
if (iter.ndim() == 1)
max_threads_per_mp = 512;
if (iter.ndim() == 2)
max_threads_per_mp = 256;
#endif
const int blocks_per_sm = max_threads_per_mp / config.num_threads;
const int num_mp = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
const int target_grid_size = num_mp * blocks_per_sm;
int grid = config.grid().x;
@@ -1126,6 +1133,23 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
// a large number of values to deal with. But we don't want values_per_thread to be larger than
// max_values_per_thread
config.ctas_per_output = std::max(std::min<int>(ctas_per_output1, ctas_per_output2), ctas_per_output3);
#ifdef USE_ROCM
// In cases where a number of threadblocks along the y direction of the grid
// is needed then make sure they are reduced to the number of MPs. For
// smaller sizes, use half the number of MPs. For smaller sizes than half
// the number of MPs use the original value unless the value is less than 16
// blocks in which case it is more profitable to use just 1 block.
if (config.ctas_per_output > num_mp)
if (num_mp < 128)
config.ctas_per_output =
num_mp * (config.ctas_per_output > 512 ? 4 : 2);
else
config.ctas_per_output = num_mp;
else if (config.ctas_per_output > div_up(num_mp, 2))
config.ctas_per_output = div_up(num_mp, 2);
else if (config.ctas_per_output < 16)
config.ctas_per_output = 1;
#endif
if (config.ctas_per_output > 1) {
config.input_mult[2] = config.split_input(config.ctas_per_output);
}