Fix CUDA_MAX_THREADS_PER_SM for sm_87 (#88644)

#88326
CC @ngimel @ptrblck

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88644
Approved by: https://github.com/ngimel
Author: Eddie Yan
Date: 2022-11-08 19:44:23 +00:00
Committed by: PyTorch MergeBot
Parent: 6bb7f4f29f
Commit: 3e30a9ea1c


@@ -255,13 +255,13 @@ using namespace c10::hip;
 // constants from
 // (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications)
 // The maximum number of threads per multiprocessor is 1024 for Turing
-// architecture (7.5), 1536 for Geforce Ampere (8.6), and 2048 for all other
-// architectures. You'll get warnings if you exceed these constants. Hence, the
-// following macros adjust the input values from the user to resolve potential
-// warnings.
+// architecture (7.5), 1536 for Geforce Ampere (8.6)/Jetson Orin (8.7), and
+// 2048 for all other architectures. You'll get warnings if you exceed these
+// constants. Hence, the following macros adjust the input values from the user
+// to resolve potential warnings.
 #if __CUDA_ARCH__ == 750
 constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024;
-#elif __CUDA_ARCH__ == 860
+#elif __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870
 constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1536;
 #else
 constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048;
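
For context, a minimal standalone sketch of how an arch-gated per-SM limit like this one is typically consumed: the constant caps the values passed to __launch_bounds__ so nvcc does not warn when a requested block size exceeds what the target architecture (sm_75, sm_86, sm_87, etc.) can actually hold per SM. The MAX_THREADS_PER_BLOCK clamp and the kernel below are hypothetical illustrations, not the actual adjustment macros from this file.

// sketch.cu -- illustrative only; mirrors the arch-gated constant from the diff.
#include <cstdint>
#include <cstdio>

#if __CUDA_ARCH__ == 750
constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024;
#elif __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870
constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1536;
#else
constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048;
#endif

// Hypothetical clamp: keep a requested block size within the per-SM limit so
// __launch_bounds__ never advertises more threads than the SM supports.
#define MAX_THREADS_PER_BLOCK(val) \
  (((val) <= CUDA_MAX_THREADS_PER_SM) ? (val) : CUDA_MAX_THREADS_PER_SM)

// Example kernel: asks for up to 512 threads per block; on sm_87 the clamp is
// a no-op (512 < 1536), while an oversized request would be reduced instead
// of triggering an nvcc launch-bounds warning.
__global__ void __launch_bounds__(MAX_THREADS_PER_BLOCK(512u))
    scale_kernel(float* data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    data[i] *= factor;
  }
}

int main() {
  constexpr int n = 1 << 20;
  float* d_data = nullptr;
  cudaMalloc(&d_data, n * sizeof(float));
  scale_kernel<<<(n + 511) / 512, 512>>>(d_data, 2.0f, n);
  cudaDeviceSynchronize();
  cudaFree(d_data);
  printf("done\n");
  return 0;
}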