mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Revert "Increase C10_COMPILE_TIME_MAX_GPUS to 128 (#144138)"
This reverts commit 6cfc08167595e27ee9a5701c6426a7a8a7e387ef. Reverted https://github.com/pytorch/pytorch/pull/144138 on behalf of https://github.com/albanD due to This seems to impact the caffe2 code ([comment](https://github.com/pytorch/pytorch/pull/144138#issuecomment-2590891200))
This commit is contained in:
@@ -132,14 +132,7 @@ Device::Device(const std::string& device_string) : Device(Type::CPU) {
   try {
     if (!device_index_str.empty()) {
-      auto index = std::stoi(device_index_str);
-      TORCH_CHECK(
-          index <=
-              static_cast<int>(std::numeric_limits<c10::DeviceIndex>::max()),
-          "Device index '",
-          device_index_str,
-          "' is out of range");
-      index_ = static_cast<c10::DeviceIndex>(index);
+      index_ = static_cast<c10::DeviceIndex>(std::stoi(device_index_str));
     }
   } catch (const std::exception&) {
     TORCH_CHECK(
@@ -169,7 +169,7 @@ struct C10_API Device final {
  private:
   DeviceType type_;
   DeviceIndex index_ = -1;
-  void validate() const {
+  void validate() {
     // Removing these checks in release builds noticeably improves
     // performance in micro-benchmarks.
     // This is safe to do, because backends that use the DeviceIndex
@@ -1,6 +1,4 @@
 #pragma once
-#include <cstdint>
-#include <limits>

 #ifndef C10_USING_CUSTOM_GENERATED_MACROS
@@ -49,6 +47,5 @@ o */
 // fbcode depends on this value being 16
 #define C10_COMPILE_TIME_MAX_GPUS 16
 #else
-constexpr std::int64_t C10_COMPILE_TIME_MAX_GPUS =
-    std::numeric_limits<int8_t>::max() + 1;
+#define C10_COMPILE_TIME_MAX_GPUS 120
 #endif
@@ -9,7 +9,6 @@
 #include <array>
 #include <atomic>
 #include <cstdint>
-#include <limits>

 namespace c10::cuda {
@@ -175,16 +174,12 @@ static void initGlobalStreamState() {
   num_gpus = device_count();
   // Check if the number of GPUs matches the expected compile-time max number
   // of GPUs.
-  if constexpr (
-      C10_COMPILE_TIME_MAX_GPUS <
-      std::numeric_limits<decltype(num_gpus)>::max()) {
-    TORCH_CHECK(
-        num_gpus <= C10_COMPILE_TIME_MAX_GPUS,
-        "Number of CUDA devices on the machine is larger than the compiled "
-        "max number of gpus expected (",
-        C10_COMPILE_TIME_MAX_GPUS,
-        "). Increase that and recompile.");
-  }
+  TORCH_CHECK(
+      num_gpus <= C10_COMPILE_TIME_MAX_GPUS,
+      "Number of CUDA devices on the machine is larger than the compiled "
+      "max number of gpus expected (",
+      C10_COMPILE_TIME_MAX_GPUS,
+      "). Increase that and recompile.");
   int leastPriority = -1, greatestPriority = -1;
   C10_CUDA_CHECK(
       cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
Reference in New Issue
Block a user