Mirror of https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Revert "[torch][cuda][device_limits] Library for querying device hardware limits for flops and bandwidth (#162942)"
This reverts commit 627482a7b7780752c0e7aea034a2eb2db5899fcc. Reverted https://github.com/pytorch/pytorch/pull/162942 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it needs some fixes for CUDA 13 ([comment](https://github.com/pytorch/pytorch/pull/162942#issuecomment-3308784448))
@@ -2204,9 +2204,6 @@ class _CudaDeviceProperties:
     warp_size: _int
     uuid: str
     L2_cache_size: _int
-    clock_rate: _int
-    memory_clock_rate: _int
-    memory_bus_width: _int

 # Functions related to SDPA
 class _SDPAParams:
@@ -1053,9 +1053,6 @@ static void registerCudaDeviceProperties(PyObject* module) {
       .def_readonly("warp_size", &cudaDeviceProp::warpSize)
 #ifndef USE_ROCM
       // NVIDIA-only properties
-      .def_readonly("clock_rate", &cudaDeviceProp::clockRate)
-      .def_readonly("memory_clock_rate", &cudaDeviceProp::memoryClockRate)
-      .def_readonly("memory_bus_width", &cudaDeviceProp::memoryBusWidth)
       .def_readonly(
           "shared_memory_per_block", &cudaDeviceProp::sharedMemPerBlock)
       .def_readonly(
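For context, the bindings above are what made the raw hardware counters visible from Python. A minimal sketch, assuming a build that still contains #162942 (i.e. before this revert is applied); the getattr guards are added here so it simply prints None on builds without the attributes:

import torch

if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(torch.device("cuda:0"))
    # Attributes added by #162942 and removed again by this revert.
    clock_khz = getattr(props, "clock_rate", None)              # SM clock, kHz
    mem_clock_khz = getattr(props, "memory_clock_rate", None)   # memory clock, kHz
    bus_width_bits = getattr(props, "memory_bus_width", None)   # memory bus width, bits
    print(clock_khz, mem_clock_khz, bus_width_bits)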
@@ -25,7 +25,7 @@ import torch._C
 from torch import device as _device
 from torch._utils import _dummy_type, _LazySeedTracker, classproperty

-from . import _device_limits, gds
+from . import gds
 from ._utils import _get_device_index
 from .graphs import (
     CUDAGraph,
@@ -1,140 +0,0 @@
import torch
from torch._C import dtype


__all__ = ["GPULimits"]


class GPULimits:
    r"""Utility class that provides the theoretical limits of NVIDIA GPU devices. The
    limits do not take thermal throttling into account (they assume the GPU runs at
    its peak rated frequency), because the user's hardware configuration may
    influence power behavior.
    """

    def __init__(self, target_device: torch.device):
        # The device properties object is obtained by calling the 'cudaGetDeviceProperties'
        # CUDA runtime function. We need the total memory bus width and the memory
        # clock rate to calculate the memory bandwidth.
        self.device_properties = torch.cuda.get_device_properties(target_device)

        # The compute capability is needed to determine the number of FLOPs per cycle per SM
        self.compute_capability = int(
            f"{self.device_properties.major}{self.device_properties.minor}"
        )

    # FLOPs per cycle information derived from Table 2 in:
    # https://resources.nvidia.com/en-us-hopper-architecture/nvidia-h100-tensor-c

    # Returns the number of FMA instructions retired per cycle per SM for a given
    # data type, when tensor cores are NOT used
    def get_fma_per_cycle_per_sm_cuda_cores(self, data_type: dtype) -> int:
        hardcoded_device_values = {
            # Ampere Architecture
            "fp16_80": 256,
            "fp32_80": 64,
            "fp64_80": 32,
            # Hopper Architecture
            "fp16_90": 64,
            "fp32_90": 128,
            "fp64_90": 64,
            # Blackwell Architecture
            "fp16_100": 256,
            "fp32_100": 128,
            "fp64_100": 64,
        }
        dict_key = ""
        if data_type is torch.float16:
            dict_key = f"fp16_{self.compute_capability}"
        elif data_type is torch.float32:
            dict_key = f"fp32_{self.compute_capability}"
        elif data_type is torch.float64:
            dict_key = f"fp64_{self.compute_capability}"
        else:
            dict_key = "unknown"

        if dict_key not in hardcoded_device_values.keys():
            raise RuntimeError(
                f"No data for sm_{self.compute_capability} and {data_type}."
            )

        return hardcoded_device_values[dict_key]

    # Returns the number of FMA instructions retired per cycle per SM for a given
    # data type, when tensor cores ARE used
    def get_fma_per_cycle_per_sm_tensor_cores(self, data_type: dtype) -> int:
        hardcoded_device_values = {
            # Ampere Architecture
            "int8_80": 2048,
            "fp16_80": 1024,
            "fp32_80": 512,
            "fp64_80": 64,
            # Hopper Architecture
            "int8_90": 4096,
            "fp8_90": 4096,
            "fp16_90": 2048,
            "fp32_90": 1024,
            "fp64_90": 128,
            # Blackwell Architecture
            "int8_100": 8192,
            "fp8_100": 8192,
            "fp16_100": 4096,
            "fp32_100": 2048,
        }
        dict_key = ""
        if data_type is torch.float16:
            dict_key = f"fp16_{self.compute_capability}"
        elif data_type is torch.bfloat16:
            # FP16 and BF16 are equivalent in terms of FLOPs per cycle per SM
            dict_key = f"fp16_{self.compute_capability}"
        elif data_type is torch.float32:
            dict_key = f"fp32_{self.compute_capability}"
        elif data_type is torch.int8:
            dict_key = f"int8_{self.compute_capability}"
        elif data_type is torch.float64:
            dict_key = f"fp64_{self.compute_capability}"
        else:
            dict_key = "unknown"

        if dict_key not in hardcoded_device_values.keys():
            raise RuntimeError(
                f"No data for sm_{self.compute_capability} and {data_type}."
            )

        return hardcoded_device_values[dict_key]

    def get_tflops_per_second(
        self, data_type: dtype, use_tensor_cores: bool = True
    ) -> float:
        num_sms = self.device_properties.multi_processor_count
        clock_rate = self.device_properties.clock_rate  # kHz

        fma_per_cycle = 0
        if use_tensor_cores:
            fma_per_cycle = self.get_fma_per_cycle_per_sm_tensor_cores(data_type)
        else:
            fma_per_cycle = self.get_fma_per_cycle_per_sm_cuda_cores(data_type)

        # 1 FMA counts as 2 floating point operations.
        # The clock rate is in kHz.
        tflops_per_second = num_sms * fma_per_cycle * 2 * clock_rate / 1e9
        return tflops_per_second

    def get_memory_bandwidth_Bps(self) -> int:
        # DRAM devices are double data rate (DDR): they transfer data on both
        # edges of the memory clock.
        bus_bytes_per_cycle = int(2 * self.device_properties.memory_bus_width / 8)
        mem_clock_rate_Hz = self.device_properties.memory_clock_rate * 1000
        bytes_per_second = bus_bytes_per_cycle * mem_clock_rate_Hz * 2
        return bytes_per_second

    def get_shared_memory_bandwidth_Bps(self) -> int:
        # Each warp can LD or ST 32 x 4 bytes per cycle. To get the device-wide
        # throughput, multiply by the clock frequency and the number of SMs.
        num_sms = self.device_properties.multi_processor_count
        bytes_per_cycle_per_sm = 128
        bytes_per_cycle_per_device = num_sms * bytes_per_cycle_per_sm
        bytes_per_second = (
            bytes_per_cycle_per_device * self.device_properties.clock_rate * 1000
        )
        return bytes_per_second
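For reference, a minimal usage sketch of the module deleted above. It only runs on a checkout that predates this revert, and the figures in the comment are illustrative A100-class numbers (108 SMs, 1,410,000 kHz boost clock), not values taken from this PR:

import torch
from torch.cuda._device_limits import GPULimits  # module removed by this revert

limits = GPULimits(torch.device("cuda:0"))

# Peak dense FP16 tensor-core throughput. With the A100-class numbers above and
# the table's 1024 FMA/cycle/SM for "fp16_80":
#   108 * 1024 * 2 * 1_410_000 / 1e9 ≈ 312 TFLOP/s
print(limits.get_tflops_per_second(torch.float16, use_tensor_cores=True))

# DRAM and shared-memory bandwidth estimates, both returned in bytes per second.
print(limits.get_memory_bandwidth_Bps() / 1e9, "GB/s")
print(limits.get_shared_memory_bandwidth_Bps() / 1e9, "GB/s")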