DeepSpeed/op_builder/fp_quantizer.py
Yuanyuan Chen 1c03d1b1bb Fix invalid f-strings (#7457)
Fix invalid f-strings detected by ruff.

---------

Signed-off-by: cyy <cyyever@outlook.com>
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Olatunji Ruwase <tunji.ruwase@snowflake.com>
Co-authored-by: Michael Wyatt <michael.wyatt@snowflake.com>
2025-08-16 18:22:19 +00:00
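For context, a minimal illustrative sketch (not the literal diff) of the two f-string patterns this kind of ruff cleanup targets:

# F541: an f-string with no placeholders -- the `f` prefix does nothing.
msg = f"please install triton"   # flagged by ruff
msg = "please install triton"    # fixed

# Braces inside an f-string are parsed as an expression, so `{0,1}` becomes
# the tuple (0, 1) in the output rather than the literal text "{0,1}".
print(f"only 2.3.{0,1} is supported")   # prints: only 2.3.(0, 1) is supported
print("only 2.3.{0,1} is supported")    # prints: only 2.3.{0,1} is supported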


# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

try:
    from packaging import version as pkg_version
except ImportError:
    pkg_version = None

from .builder import CUDAOpBuilder, installed_cuda_version


class FPQuantizerBuilder(CUDAOpBuilder):
    BUILD_VAR = "DS_BUILD_FP_QUANTIZER"
    NAME = "fp_quantizer"

    def __init__(self, name=None):
        name = self.NAME if name is None else name
        super().__init__(name=name)

    def absolute_name(self):
        return f'deepspeed.ops.fp_quantizer.{self.NAME}_op'

    def is_compatible(self, verbose=False):
        try:
            import torch
        except ImportError:
            if verbose:
                self.warning("Please install torch if trying to pre-compile inference kernels")
            return False

        cuda_okay = True
        if not self.is_rocm_pytorch() and torch.cuda.is_available():  #ignore-cuda
            sys_cuda_major, _ = installed_cuda_version()
            torch_cuda_major = int(torch.version.cuda.split('.')[0])
            cuda_capability = torch.cuda.get_device_properties(0).major  #ignore-cuda
            if cuda_capability < 8:
                if verbose:
                    self.warning("NVIDIA Inference is only supported on Ampere and newer architectures")
                cuda_okay = False
            if cuda_capability >= 8:
                if torch_cuda_major < 11 or sys_cuda_major < 11:
                    if verbose:
                        self.warning("On Ampere and higher architectures please use CUDA 11+")
                    cuda_okay = False
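        # For example: an A100 (compute capability 8.0) with CUDA 11+ passes
        # both checks above, while a V100 (7.0) trips the Ampere warning and
        # leaves cuda_okay False.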

        try:
            import triton
        except ImportError:
            if verbose:
                self.warning(
                    "please install triton==2.3.0, 2.3.1, 3.0, 3.1 or 3.2 if you want to use the FP Quantizer Kernels")
            return False

        # triton 2.3.{0,1}, 3.0, 3.1 and 3.2 are ok.
        allowed_versions = ("2.3", "3.0", "3.1", "3.2")
        if pkg_version:
            allowed = (pkg_version.parse(v) for v in allowed_versions)
            installed_triton = pkg_version.parse(triton.__version__)
            triton_mismatch = all(installed_triton.major != a.major or installed_triton.minor != a.minor
                                  for a in allowed)
        else:
            installed_triton = triton.__version__
            major, minor, _ = installed_triton.split(".")
            allowed = (v.split(".") for v in allowed_versions)
            triton_mismatch = all(major != v[0] or minor != v[1] for v in allowed)
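        # Worked example (illustrative): triton 2.3.1 matches the allowed "2.3"
        # entry (major 2 and minor 3 both agree), so triton_mismatch is False;
        # triton 2.2.0 agrees with no entry, so triton_mismatch is True.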

        if triton_mismatch:
            if verbose:
                self.warning(
                    f"FP Quantizer is using an untested triton version ({installed_triton}), "
                    "only 2.3.{0,1}, 3.0, 3.1 and 3.2 are known to be compatible with these kernels")
            return False

        return super().is_compatible(verbose) and cuda_okay
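
    # Example (illustrative): filter_ccs(["7.0", "8.0", "9.0"]) keeps
    # [['8', '0'], ['9', '0']] and warns that [['7', '0']] was filtered out.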
    def filter_ccs(self, ccs):
        ccs_retained = []
        ccs_pruned = []
        for cc in [cc.split('.') for cc in ccs]:
            if int(cc[0]) >= 8:
                ccs_retained.append(cc)
            else:
                ccs_pruned.append(cc)
        if len(ccs_pruned) > 0:
            self.warning(f"Filtered compute capabilities {ccs_pruned}")
        return ccs_retained

    def sources(self):
        return [
            "csrc/fp_quantizer/fp_quantize.cu",
            "csrc/fp_quantizer/fp_quantize.cpp",
        ]

    def extra_ldflags(self):
        if not self.is_rocm_pytorch():
            return ['-lcurand']
        else:
            return []

    def include_paths(self):
        return ['csrc/fp_quantizer/includes', 'csrc/includes']

    @staticmethod
    def get_default_quant_dtype():
        import torch
        return torch.uint8

    @staticmethod
    def get_quant_range(q_bits=None):
        if q_bits == 8:
            return 480
        elif q_bits == 6:
            return 28.
        elif q_bits == 12:
            return 510.
        else:
            assert (0), \
                "Please specify the right quantization range for the selected precision!"