Set the default value of verbose in op_builder/xxx.py is_compatible() to False to quiet warnings, and check verbose before each self.warning(...) call in is_compatible(); otherwise the verbose argument has no effect. --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import os

from .builder import CUDAOpBuilder, installed_cuda_version


class RaggedOpsBuilder(CUDAOpBuilder):
    BUILD_VAR = "DS_BUILD_RAGGED_DEVICE_OPS"
    NAME = "ragged_device_ops"

    def __init__(self, name=None):
        name = self.NAME if name is None else name
        super().__init__(name=name)

    def absolute_name(self):
        return f'deepspeed.inference.v2.kernels.ragged_ops.{self.NAME}'

    def is_compatible(self, verbose=False):
        try:
            import torch
        except ImportError:
            if verbose:
                self.warning("Please install torch if trying to pre-compile inference kernels")
            return False

        cuda_okay = True
        if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda
            sys_cuda_major, _ = installed_cuda_version()
            torch_cuda_major = int(torch.version.cuda.split('.')[0])
            cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda
            if cuda_capability < 6:
                if verbose:
                    self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
                cuda_okay = False
            if cuda_capability >= 8:
                if torch_cuda_major < 11 or sys_cuda_major < 11:
                    if verbose:
                        self.warning("On Ampere and higher architectures please use CUDA 11+")
                    cuda_okay = False
        return super().is_compatible(verbose) and cuda_okay

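    # Usage sketch (illustrative): with the default verbose=False, an
    # incompatible setup is reported only through the return value; pass
    # verbose=True to also log the reason a check failed.
    #
    #   builder = RaggedOpsBuilder()
    #   builder.is_compatible()              # quiet True/False check
    #   builder.is_compatible(verbose=True)  # additionally emits warnings
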
    def filter_ccs(self, ccs):
        ccs_retained = []
        ccs_pruned = []
        for cc in ccs:
            if int(cc[0]) >= 8:
                # Blocked flash has a dependency on Ampere + newer
                ccs_retained.append(cc)
            else:
                ccs_pruned.append(cc)
        if len(ccs_pruned) > 0:
            self.warning(f"Filtered compute capabilities {ccs_pruned}")
        return ccs_retained

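    # For example (illustrative capability strings): filter_ccs(['70', '80', '90'])
    # returns ['80', '90'] and warns that ['70'] was filtered, since blocked
    # flash requires compute capability 8.0 (Ampere) or newer.
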
    def get_prefix(self):
        # Prefix sources with the in-tree "deepspeed" directory when it
        # exists; otherwise resolve paths relative to the parent directory.
        ds_path = self.deepspeed_src_path("deepspeed")
        return "deepspeed" if os.path.isdir(ds_path) else ".."

    def sources(self):
        sources = [
            "inference/v2/kernels/ragged_ops/ragged_ops.cpp",
            "inference/v2/kernels/ragged_ops/atom_builder/atom_builder.cpp",
            "inference/v2/kernels/ragged_ops/blocked_flash/blocked_flash.cpp",
            "inference/v2/kernels/ragged_ops/embed/embed.cpp",
            "inference/v2/kernels/ragged_ops/embed/embed_cuda.cu",
            "inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cpp",
            "inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary_cuda.cu",
            "inference/v2/kernels/ragged_ops/logits_gather/logits_gather.cpp",
            "inference/v2/kernels/ragged_ops/logits_gather/logits_gather_cuda.cu",
            "inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cpp",
            "inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter_cuda.cu",
            "inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cpp",
            "inference/v2/kernels/ragged_ops/moe_gather/moe_gather_cuda.cu",
            "inference/v2/kernels/ragged_ops/ragged_helpers/ragged_kernel_helpers.cpp",
            "inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.cpp",
            "inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating_cuda.cu",
        ]

        prefix = self.get_prefix()
        sources = [os.path.join(prefix, src) for src in sources]
        return sources

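    # Each ragged op above pairs a host-side binding (.cpp) with its device
    # kernel (.cu) where one exists; blocked_flash has no .cu here and links
    # against a prebuilt library instead (see extra_ldflags below).
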
    def extra_ldflags(self):
        import dskernels
        lib_path = dskernels.library_path()

        prefix = self.get_prefix()
        lib_path = os.path.join(prefix, lib_path)
        lib_path = self.deepspeed_src_path(lib_path)

        args = [f'-L{lib_path}', '-lblockedflash']
        if self.jit_load:
            args.append(f'-Wl,-rpath,{lib_path}')
        return args

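    # Note on the flags above: -L/-lblockedflash link against the prebuilt
    # blocked-flash library located via dskernels.library_path(); for JIT
    # builds, -Wl,-rpath embeds that directory in the extension's runtime
    # search path so the shared library is found without LD_LIBRARY_PATH.
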
    def include_paths(self):
        sources = [
            'inference/v2/kernels/includes',
            'inference/v2/kernels/ragged_ops',
            'inference/v2/kernels/ragged_ops/atom_builder',
            'inference/v2/kernels/ragged_ops/blocked_flash',
            'inference/v2/kernels/ragged_ops/embed',
            'inference/v2/kernels/ragged_ops/includes',
            'inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary',
            'inference/v2/kernels/ragged_ops/logits_gather',
            'inference/v2/kernels/ragged_ops/moe_gather',
            'inference/v2/kernels/ragged_ops/moe_scatter',
            'inference/v2/kernels/ragged_ops/ragged_helpers',
            'inference/v2/kernels/ragged_ops/top_k_gating',
        ]

        prefix = self.get_prefix()
        sources = [os.path.join(prefix, src) for src in sources]
        return sources
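
# End-to-end sketch (illustrative; load() is inherited from the builder base
# class and the DS_BUILD_* install-time convention is assumed, not defined in
# this file):
#
#   builder = RaggedOpsBuilder()
#   if builder.is_compatible(verbose=True):
#       ragged_ops = builder.load()  # JIT-compiles the extension on first use
#
# Pre-compiling at install time is gated by BUILD_VAR, e.g.:
#   DS_BUILD_RAGGED_DEVICE_OPS=1 pip install deepspeed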