Mirror of https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Revert "[torch][cuda][device_limits] Library for querying device hardware limits for flops and bandwidth (#162942)"
This reverts commit 627482a7b7780752c0e7aea034a2eb2db5899fcc. Reverted https://github.com/pytorch/pytorch/pull/162942 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it needs some fixes for CUDA 13 ([comment](https://github.com/pytorch/pytorch/pull/162942#issuecomment-3308784448))
@@ -2204,9 +2204,6 @@ class _CudaDeviceProperties:
     warp_size: _int
     uuid: str
     L2_cache_size: _int
-    clock_rate: _int
-    memory_clock_rate: _int
-    memory_bus_width: _int

 # Functions related to SDPA
 class _SDPAParams:
@@ -1053,9 +1053,6 @@ static void registerCudaDeviceProperties(PyObject* module) {
       .def_readonly("warp_size", &cudaDeviceProp::warpSize)
 #ifndef USE_ROCM
       // NVIDIA-only properties
-      .def_readonly("clock_rate", &cudaDeviceProp::clockRate)
-      .def_readonly("memory_clock_rate", &cudaDeviceProp::memoryClockRate)
-      .def_readonly("memory_bus_width", &cudaDeviceProp::memoryBusWidth)
       .def_readonly(
           "shared_memory_per_block", &cudaDeviceProp::sharedMemPerBlock)
       .def_readonly(
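For context, the bindings above are what made the raw hardware counters visible from Python. A minimal sketch, assuming a build that still contains #162942 (i.e. before this revert is applied); the getattr guards are added here so it simply prints None on builds without the attributes:

import torch

if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(torch.device("cuda:0"))
    # Attributes added by #162942 and removed again by this revert.
    clock_khz = getattr(props, "clock_rate", None)              # SM clock, kHz
    mem_clock_khz = getattr(props, "memory_clock_rate", None)   # memory clock, kHz
    bus_width_bits = getattr(props, "memory_bus_width", None)   # memory bus width, bits
    print(clock_khz, mem_clock_khz, bus_width_bits)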
@@ -25,7 +25,7 @@ import torch._C
 from torch import device as _device
 from torch._utils import _dummy_type, _LazySeedTracker, classproperty

-from . import _device_limits, gds
+from . import gds
 from ._utils import _get_device_index
 from .graphs import (
     CUDAGraph,
@@ -1,140 +0,0 @@
import torch
from torch._C import dtype


__all__ = ["GPULimits"]


class GPULimits:
    r"""Utility class that provides the theoretical limits of NVIDIA GPU devices. The
    limits do not take thermal throttling into account (they assume the GPU runs at
    its peak rated frequency), because the user's hardware configuration may
    influence power behavior.
    """

    def __init__(self, target_device: torch.device):
        # The device properties object is obtained by calling the 'cudaGetDeviceProperties'
        # CUDA runtime function. We need the total memory bus width and the memory
        # clock rate to calculate the memory bandwidth.
        self.device_properties = torch.cuda.get_device_properties(target_device)

        # The compute capability is needed to determine the number of FLOPs per cycle per SM
        self.compute_capability = int(
            f"{self.device_properties.major}{self.device_properties.minor}"
        )

    # FLOPs per cycle information derived from Table 2 in:
    # https://resources.nvidia.com/en-us-hopper-architecture/nvidia-h100-tensor-c

    # Returns the number of FMA instructions retired per cycle per SM for a given
    # data type, when tensor cores are NOT used
    def get_fma_per_cycle_per_sm_cuda_cores(self, data_type: dtype) -> int:
        hardcoded_device_values = {
            # Ampere Architecture
            "fp16_80": 256,
            "fp32_80": 64,
            "fp64_80": 32,
            # Hopper Architecture
            "fp16_90": 64,
            "fp32_90": 128,
            "fp64_90": 64,
            # Blackwell Architecture
            "fp16_100": 256,
            "fp32_100": 128,
            "fp64_100": 64,
        }
        dict_key = ""
        if data_type is torch.float16:
            dict_key = f"fp16_{self.compute_capability}"
        elif data_type is torch.float32:
            dict_key = f"fp32_{self.compute_capability}"
        elif data_type is torch.float64:
            dict_key = f"fp64_{self.compute_capability}"
        else:
            dict_key = "unknown"

        if dict_key not in hardcoded_device_values.keys():
            raise RuntimeError(
                f"No data for sm_{self.compute_capability} and {data_type}."
            )

        return hardcoded_device_values[dict_key]

    # Returns the number of FMA instructions retired per cycle per SM for a given
    # data type, when tensor cores ARE used
    def get_fma_per_cycle_per_sm_tensor_cores(self, data_type: dtype) -> int:
        hardcoded_device_values = {
            # Ampere Architecture
            "int8_80": 2048,
            "fp16_80": 1024,
            "fp32_80": 512,
            "fp64_80": 64,
            # Hopper Architecture
            "int8_90": 4096,
            "fp8_90": 4096,
            "fp16_90": 2048,
            "fp32_90": 1024,
            "fp64_90": 128,
            # Blackwell Architecture
            "int8_100": 8192,
            "fp8_100": 8192,
            "fp16_100": 4096,
            "fp32_100": 2048,
        }
        dict_key = ""
        if data_type is torch.float16:
            dict_key = f"fp16_{self.compute_capability}"
        elif data_type is torch.bfloat16:
            # FP16 and BF16 are equivalent in terms of FLOPs per cycle per SM
            dict_key = f"fp16_{self.compute_capability}"
        elif data_type is torch.float32:
            dict_key = f"fp32_{self.compute_capability}"
        elif data_type is torch.int8:
            dict_key = f"int8_{self.compute_capability}"
        elif data_type is torch.float64:
            dict_key = f"fp64_{self.compute_capability}"
        else:
            dict_key = "unknown"

        if dict_key not in hardcoded_device_values.keys():
            raise RuntimeError(
                f"No data for sm_{self.compute_capability} and {data_type}."
            )

        return hardcoded_device_values[dict_key]

    def get_tflops_per_second(
        self, data_type: dtype, use_tensor_cores: bool = True
    ) -> float:
        num_sms = self.device_properties.multi_processor_count
        clock_rate = self.device_properties.clock_rate  # kHz

        fma_per_cycle = 0
        if use_tensor_cores:
            fma_per_cycle = self.get_fma_per_cycle_per_sm_tensor_cores(data_type)
        else:
            fma_per_cycle = self.get_fma_per_cycle_per_sm_cuda_cores(data_type)

        # 1 FMA counts as 2 floating point operations.
        # The clock rate is in kHz.
        tflops_per_second = num_sms * fma_per_cycle * 2 * clock_rate / 1e9
        return tflops_per_second

    def get_memory_bandwidth_Bps(self) -> int:
        # DRAM devices are double data rate (DDR): they transfer data on both
        # edges of the memory clock.
        bus_bytes_per_cycle = int(2 * self.device_properties.memory_bus_width / 8)
        mem_clock_rate_Hz = self.device_properties.memory_clock_rate * 1000
        bytes_per_second = bus_bytes_per_cycle * mem_clock_rate_Hz * 2
        return bytes_per_second

    def get_shared_memory_bandwidth_Bps(self) -> int:
        # Each warp can LD or ST 32 x 4 bytes per cycle. To get the device-wide
        # throughput, multiply by the clock frequency and the number of SMs.
        num_sms = self.device_properties.multi_processor_count
        bytes_per_cycle_per_sm = 128
        bytes_per_cycle_per_device = num_sms * bytes_per_cycle_per_sm
        bytes_per_second = (
            bytes_per_cycle_per_device * self.device_properties.clock_rate * 1000
        )
        return bytes_per_second
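For reference, a minimal usage sketch of the module deleted above. It only runs on a checkout that predates this revert, and the figures in the comment are illustrative A100-class numbers (108 SMs, 1,410,000 kHz boost clock), not values taken from this PR:

import torch
from torch.cuda._device_limits import GPULimits  # module removed by this revert

limits = GPULimits(torch.device("cuda:0"))

# Peak dense FP16 tensor-core throughput. With the A100-class numbers above and
# the table's 1024 FMA/cycle/SM for "fp16_80":
#   108 * 1024 * 2 * 1_410_000 / 1e9 ≈ 312 TFLOP/s
print(limits.get_tflops_per_second(torch.float16, use_tensor_cores=True))

# DRAM and shared-memory bandwidth estimates, both returned in bytes per second.
print(limits.get_memory_bandwidth_Bps() / 1e9, "GB/s")
print(limits.get_shared_memory_bandwidth_Bps() / 1e9, "GB/s")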