[torch][cuda][device_limits] Library for querying device hardware limits for flops and bandwidth (#162942)

In various benchmarks scattered across the repo, the limits for flops/second and memory bandwidth are usually hardcoded for a single device. This utility could help in providing a more structured way to query the device capabilities. If this is approved, we can use it when reporting flops efficiency and bandwidth relative to peak in the benchmarks and tests. The intent is to add more devices, more parameters (e.g. L2 cache bandwidth, NVLink, etc.) for both CPUs and accelerators.

Testing:
```
import torch
if torch.cuda.is_available():
    device = torch.cuda.current_device()
    mod = torch.get_device_module('cuda')
    hw = mod._device_limits.GPULimits(device)
    print(hw.get_tflops_per_second(torch.float16))
    print(hw.get_tflops_per_second(torch.float32))
    print(hw.get_tflops_per_second(torch.float64))
    print(hw.get_tflops_per_second(torch.bfloat16))
    print(hw.get_tflops_per_second(torch.int8))
    print(hw.get_memory_bandwidth_Bps() / 1e9)
    print(hw.get_shared_memory_bandwidth_Bps() / 1e9)

# Output on an H100 GPU
1070.53056
535.26528
66.90816
1070.53056
2141.06112
4893.696
33454.08
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162942
Approved by: https://github.com/ngimel
2025-10-20 21:14:14 +08:00 · 2025-09-18 06:40:04 +00:00
parent c5e7bb08b0
commit 627482a7b7
4 changed files with 147 additions and 1 deletions
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@ -2204,6 +2204,9 @@ class _CudaDeviceProperties:
    warp_size: _int
    uuid: str
    L2_cache_size: _int
+    clock_rate: _int
+    memory_clock_rate: _int
+    memory_bus_width: _int

 # Functions related to SDPA
 class _SDPAParams:
--- a/torch/csrc/cuda/Module.cpp
+++ b/torch/csrc/cuda/Module.cpp
@ -1053,6 +1053,9 @@ static void registerCudaDeviceProperties(PyObject* module) {
      .def_readonly("warp_size", &cudaDeviceProp::warpSize)
 #ifndef USE_ROCM
      // NVIDIA-only properties
+      .def_readonly("clock_rate", &cudaDeviceProp::clockRate)
+      .def_readonly("memory_clock_rate", &cudaDeviceProp::memoryClockRate)
+      .def_readonly("memory_bus_width", &cudaDeviceProp::memoryBusWidth)
      .def_readonly(
          "shared_memory_per_block", &cudaDeviceProp::sharedMemPerBlock)
      .def_readonly(
--- a/torch/cuda/__init__.py
+++ b/torch/cuda/__init__.py
@ -25,7 +25,7 @@ import torch._C
 from torch import device as _device
 from torch._utils import _dummy_type, _LazySeedTracker, classproperty

-from . import gds
+from . import _device_limits, gds
 from ._utils import _get_device_index
 from .graphs import (
    CUDAGraph,
--- a/torch/cuda/_device_limits.py
+++ b/torch/cuda/_device_limits.py
@ -0,0 +1,140 @@
+import torch
+from torch._C import dtype
+
+
+__all__ = ["GPULimits"]
+
+
+class GPULimits:
+    r"""Utility class that provides the theoretical limits of Nvidia GPU devices. The
+    limits don't take into account thermal throttling (assume that the GPU run at its
+    peak rated frequency). This is because user hardware configuration may influence
+    power behavior.
+    """
+
+    def __init__(self, target_device: torch.device):
+        # The device properties object is obtained by calling 'cudaGetDeviceProperties' CUDA
+        # runtime function. We need the total memory bus width and the memory clock rate to
+        # calculate the memory bandwidth.
+        self.device_properties = torch.cuda.get_device_properties(target_device)
+
+        # The compute capability is needed to determine the number of FLOPs per cycle per SM
+        self.compute_capability = int(
+            f"{self.device_properties.major}{self.device_properties.minor}"
+        )
+
+    # FLOPs per cycle information derived from Table 2 in:
+    # https://resources.nvidia.com/en-us-hopper-architecture/nvidia-h100-tensor-c
+
+    # Returns the number of FMA instructions retired per cycle per SM for a given
+    # data type, when tensor cores are NOT used
+    def get_fma_per_cycle_per_sm_cuda_cores(self, data_type: dtype) -> int:
+        hardcoded_device_values = {
+            # Ampere Architecture
+            "fp16_80": 256,
+            "fp32_80": 64,
+            "fp64_80": 32,
+            # Hopper Architecture
+            "fp16_90": 64,
+            "fp32_90": 128,
+            "fp64_90": 64,
+            # Blackwell Architecture
+            "fp16_100": 256,
+            "fp32_100": 128,
+            "fp64_100": 64,
+        }
+        dict_key = ""
+        if data_type is torch.float16:
+            dict_key = f"fp16_{self.compute_capability}"
+        elif data_type is torch.float32:
+            dict_key = f"fp32_{self.compute_capability}"
+        elif data_type is torch.float64:
+            dict_key = f"fp64_{self.compute_capability}"
+        else:
+            dict_key = "unknown"
+
+        if dict_key not in hardcoded_device_values.keys():
+            raise RuntimeError(
+                f"No data for sm_{self.compute_capability} and {data_type}."
+            )
+
+        return hardcoded_device_values[dict_key]
+
+    # Returns the number of FMA instructions retired per cycle per SM for a given
+    # data type, when tensor cores ARE used
+    def get_fma_per_cycle_per_sm_tensor_cores(self, data_type: dtype) -> int:
+        hardcoded_device_values = {
+            # Ampere Architecture
+            "int8_80": 2048,
+            "fp16_80": 1024,
+            "fp32_80": 512,
+            "fp64_80": 64,
+            # Hopper Architecture
+            "int8_90": 4096,
+            "fp8_90": 4096,
+            "fp16_90": 2048,
+            "fp32_90": 1024,
+            "fp64_90": 128,
+            # Blackwell Architecture
+            "int8_100": 8192,
+            "fp8_100": 8192,
+            "fp16_100": 4096,
+            "fp32_100": 2048,
+        }
+        dict_key = ""
+        if data_type is torch.float16:
+            dict_key = f"fp16_{self.compute_capability}"
+        elif data_type is torch.bfloat16:
+            # FP16 and BF16 are equivalent in terms of FLOPs per cycle per SM
+            dict_key = f"fp16_{self.compute_capability}"
+        elif data_type is torch.float32:
+            dict_key = f"fp32_{self.compute_capability}"
+        elif data_type is torch.int8:
+            dict_key = f"int8_{self.compute_capability}"
+        elif data_type is torch.float64:
+            dict_key = f"fp64_{self.compute_capability}"
+        else:
+            dict_key = "unknown"
+
+        if dict_key not in hardcoded_device_values.keys():
+            raise RuntimeError(
+                f"No data for sm_{self.compute_capability} and {data_type}."
+            )
+
+        return hardcoded_device_values[dict_key]
+
+    def get_tflops_per_second(
+        self, data_type: dtype, use_tensor_cores: bool = True
+    ) -> float:
+        num_sms = self.device_properties.multi_processor_count
+        clock_rate = self.device_properties.clock_rate  # KHz
+
+        fma_per_cycle = 0
+        if use_tensor_cores:
+            fma_per_cycle = self.get_fma_per_cycle_per_sm_tensor_cores(data_type)
+        else:
+            fma_per_cycle = self.get_fma_per_cycle_per_sm_cuda_cores(data_type)
+
+        # 1 FMA counts as 2 floating point operations
+        # Clock rate is in KHz
+        tflops_per_second = num_sms * fma_per_cycle * 2 * clock_rate / 1e9
+        return tflops_per_second
+
+    def get_memory_bandwidth_Bps(self) -> int:
+        # DRAM devices are Double-Data which means they provide an output at both fronts of
+        # a clock beat
+        bus_bytes_per_cycle = int(2 * self.device_properties.memory_bus_width / 8)
+        mem_clock_rate_Hz = self.device_properties.memory_clock_rate * 1000
+        bytes_per_second = bus_bytes_per_cycle * mem_clock_rate_Hz * 2
+        return bytes_per_second
+
+    def get_shared_memory_bandwidth_Bps(self) -> int:
+        # Each warp can LD or ST 32 x 4 bytes per cycle. To calculate the
+        # device's throughput we need to multiply with frequency and number of SMs.
+        num_sms = self.device_properties.multi_processor_count
+        bytes_per_cycle_per_sm = 128
+        bytes_per_cycle_per_device = num_sms * bytes_per_cycle_per_sm
+        bytes_per_second = (
+            bytes_per_cycle_per_device * self.device_properties.clock_rate * 1000
+        )
+        return bytes_per_second