Mirror of https://github.com/deepspeedai/DeepSpeed.git
Abstract accelerator (step 2) (#2560)
* Abstract accelerator (step 2)
* more flex op_builder path for both installation and runtime
* add SpatialInferenceBuilder into cuda_accelerator.py
* use reflection to make cuda_accelerator adapt to CUDA op builder change automatically
* clean up deepspeed/__init__.py
* add comments in cuda_accelerator for no torch path
* Update deepspeed/env_report.py
  Change env_report.py according to suggestion
  Co-authored-by: Michael Wyatt <mrwyattii@gmail.com>
* reduce the range of try...except for better code clarity
* Add porting for deepspeed/ops/random_ltd/dropping_utils.py
* move accelerator to top directory and create symlink under deepspeed

Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Co-authored-by: Michael Wyatt <mrwyattii@gmail.com>
Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
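The net effect of the PR, shown as a minimal before/after sketch. The builder and accelerator names are taken from the hunks below; the snippet itself is an illustration, not part of the commit:

    # before: callers imported concrete CUDA builder classes directly
    from deepspeed.ops.op_builder import FusedAdamBuilder
    fused_adam_module = FusedAdamBuilder().load()

    # after: callers go through the accelerator abstraction and refer to
    # builders by name, so a non-CUDA accelerator can supply its own builders
    from deepspeed.accelerator import get_accelerator
    from deepspeed.ops.op_builder.builder_names import FusedAdamBuilder
    fused_adam_module = get_accelerator().create_op_builder(FusedAdamBuilder).load()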
@@ -4,3 +4,4 @@ recursive-include deepspeed *.cpp *.h *.cu *.hip *.tr *.cuh *.cc *.json
 recursive-include csrc *.cpp *.h *.cu *.tr *.cuh *.cc
 recursive-include op_builder *.py
 recursive-include benchmarks *.py
+recursive-include accelerator *.py
@@ -6,3 +6,4 @@ recursive-include deepspeed *.tr
 recursive-exclude deepspeed/ops/csrc *.cpp *.h *.cu *.cuh *.cc
 prune csrc
 prune op_builder
+prune accelerator
@@ -1,5 +1,14 @@
-from deepspeed.accelerator.abstract_accelerator import DeepSpeedAccelerator
+import os
+import pkgutil
+import importlib
+
+from .abstract_accelerator import DeepSpeedAccelerator
+# During setup stage torch may not be installed, pass on no torch will
+# allow op builder related API to be executed.
+try:
+    import torch.cuda
+except ImportError:
+    pass


 class CUDA_Accelerator(DeepSpeedAccelerator):
@@ -7,6 +16,25 @@ class CUDA_Accelerator(DeepSpeedAccelerator):
         self._name = 'cuda'
         self._communication_backend_name = 'nccl'

+        # begin initialize for create_op_builder()
+        # put all valid class name <--> class type mapping into class_dict
+        op_builder_dir = self.op_builder_dir()
+        op_builder_module = importlib.import_module(op_builder_dir)
+
+        for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(op_builder_module.__file__)]):
+            # avoid self references
+            if module_name != 'all_ops' and module_name != 'builder' and module_name != 'builder_names':
+                module = importlib.import_module("{}.{}".format(
+                    op_builder_dir,
+                    module_name))
+                for member_name in module.__dir__():
+                    if member_name.endswith(
+                            'Builder'
+                    ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "TorchCPUOpBuilder":  # avoid abstract classes
+                        if not member_name in self.class_dict:
+                            self.class_dict[member_name] = getattr(module, member_name)
+        # end initialize for create_op_builder()
+
     # Device APIs
     def device_name(self, device_index=None):
         if device_index == None:
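The reflection loop above is the heart of the change: builder classes are discovered by scanning the op_builder package instead of being hard-coded. The same pattern as a standalone sketch, assuming a hypothetical package my_ops (the name is illustrative only):

    import os
    import pkgutil
    import importlib

    def discover_builders(package_name):
        # Build a class name -> class object map for every *Builder class
        # found in the package's modules, mirroring class_dict above.
        package = importlib.import_module(package_name)
        class_dict = {}
        for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(package.__file__)]):
            module = importlib.import_module("{}.{}".format(package_name, module_name))
            for member_name in dir(module):
                if member_name.endswith('Builder'):
                    class_dict[member_name] = getattr(module, member_name)
        return class_dict

    builders = discover_builders('my_ops')  # hypothetical package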
@@ -194,44 +222,21 @@ class CUDA_Accelerator(DeepSpeedAccelerator):
         return False

     def op_builder_dir(self):
         try:
             # during installation time op_builder is visible, otherwise return deepspeed.ops.op_builder
             import op_builder  # noqa: F401
             return "op_builder"
         except ImportError:
             return "deepspeed.ops.op_builder"

-    def create_op_builder(self, class_name):
-        from deepspeed.ops.op_builder import AsyncIOBuilder, CPUAdagradBuilder, CPUAdamBuilder, FusedAdamBuilder, FusedLambBuilder, QuantizerBuilder, SparseAttnBuilder, StochasticTransformerBuilder, TransformerBuilder, InferenceBuilder, UtilsBuilder
-        from deepspeed.ops.op_builder.builder_names import AsyncIOBuilder as AsyncIOBuilderName
-        from deepspeed.ops.op_builder.builder_names import CPUAdagradBuilder as CPUAdagradBuilderName
-        from deepspeed.ops.op_builder.builder_names import CPUAdamBuilder as CPUAdamBuilderName
-        from deepspeed.ops.op_builder.builder_names import FusedAdamBuilder as FusedAdamBuilderName
-        from deepspeed.ops.op_builder.builder_names import FusedLambBuilder as FusedLambBuilderName
-        from deepspeed.ops.op_builder.builder_names import QuantizerBuilder as QuantizerBuilderName
-        from deepspeed.ops.op_builder.builder_names import SparseAttnBuilder as SparseAttnBuilderName
-        from deepspeed.ops.op_builder.builder_names import StochasticTransformerBuilder as StochasticTransformerBuilderName
-        from deepspeed.ops.op_builder.builder_names import TransformerBuilder as TransformerBuilderName
-        from deepspeed.ops.op_builder.builder_names import InferenceBuilder as InferenceBuilderName
-        from deepspeed.ops.op_builder.builder_names import UtilsBuilder as UtilsBuilderName
-
-        if class_name == AsyncIOBuilderName:
-            return AsyncIOBuilder()
-        elif class_name == CPUAdagradBuilderName:
-            return CPUAdagradBuilder()
-        elif class_name == CPUAdamBuilderName:
-            return CPUAdamBuilder()
-        elif class_name == FusedAdamBuilderName:
-            return FusedAdamBuilder()
-        elif class_name == FusedLambBuilderName:
-            return FusedLambBuilder()
-        elif class_name == QuantizerBuilderName:
-            return QuantizerBuilder()
-        elif class_name == SparseAttnBuilderName:
-            return SparseAttnBuilder()
-        elif class_name == StochasticTransformerBuilderName:
-            return StochasticTransformerBuilder()
-        elif class_name == TransformerBuilderName:
-            return TransformerBuilder()
-        elif class_name == InferenceBuilderName:
-            return InferenceBuilder()
-        elif class_name == UtilsBuilderName:
-            return UtilsBuilder()
+    # dict that holds class name <--> class type mapping i.e.
+    # 'AsyncIOBuilder': <class 'op_builder.async_io.AsyncIOBuilder'>
+    # this dict will be filled at init stage
+    class_dict = {}
+
+    def create_op_builder(self, class_name):
+        if class_name in self.class_dict:
+            return self.class_dict[class_name]()
+        else:
+            return None
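With class_dict populated at init, create_op_builder reduces to a dictionary lookup keyed by class name, returning None for unknown names. A hedged usage sketch built only from names visible in the hunks above:

    from deepspeed.accelerator import get_accelerator
    from deepspeed.ops.op_builder.builder_names import FusedAdamBuilder

    builder = get_accelerator().create_op_builder(FusedAdamBuilder)
    if builder is not None and builder.is_compatible():
        fused_adam_module = builder.load()  # JIT-compiles or loads the prebuilt op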
@@ -24,7 +24,7 @@ def get_accelerator():
         _validate_accelerator(ds_accelerator)
         return ds_accelerator

-    from deepspeed.accelerator.cuda_accelerator import CUDA_Accelerator
+    from .cuda_accelerator import CUDA_Accelerator
     ds_accelerator = CUDA_Accelerator()
     _validate_accelerator(ds_accelerator)
     return ds_accelerator
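A usage note on the hunk above: judging from the early return, get_accelerator() validates and caches a module-level ds_accelerator, so repeated calls are cheap. A small sketch under that assumption:

    from deepspeed.accelerator import get_accelerator

    acc = get_accelerator()
    print(acc.device_name())         # 'cuda' for the CUDA_Accelerator above
    assert acc is get_accelerator()  # assumes the cached instance is reused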
@@ -15,6 +15,8 @@ import shutil
 from test_ds_aio_utils import refine_integer_value
 from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \
     READ_IO_DIR, WRITE_IO_DIR, READ_LOG_DIR, WRITE_LOG_DIR
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import AsyncIOBuilder

 OTHER_OPTIONS = '--handle'
 PERF_SCRIPT = 'test_ds_aio.py'
@@ -277,8 +279,7 @@ def script_path():


 def async_io_setup():
-    from deepspeed.ops.aio import AsyncIOBuilder
-    return AsyncIOBuilder().is_compatible()
+    return get_accelerator().create_op_builder(AsyncIOBuilder).is_compatible()


 def get_block_size_and_count(io_bytes):
@@ -8,9 +8,10 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
 import torch
 import os
 import time
-from deepspeed.ops.aio import AsyncIOBuilder
 from multiprocessing import Pool, Barrier
 from test_ds_aio_utils import report_results, task_log, task_barrier
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import AsyncIOBuilder


 def pre_basic(args, tid, read_op):
@@ -19,7 +20,10 @@ def pre_basic(args, tid, read_op):
     file = args.read_file if read_op else f'{args.write_file}.{tid}'

     task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
-    buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
+    buffer = get_accelerator().pin_memory(
+        torch.empty(num_bytes,
+                    dtype=torch.uint8,
+                    device='cpu'))
     task_log(
         tid,
         f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
@@ -56,7 +60,8 @@ def post_basic(pool_params):
 def main_basic_read(pool_params):
     args, tid, ctxt = pool_params
     start_time = time.time()
-    AsyncIOBuilder().load().aio_read(ctxt['buffer'],
+    get_accelerator().create_op_builder(AsyncIOBuilder).load().aio_read(
+        ctxt['buffer'],
         ctxt['file'],
         args.block_size,
         args.queue_depth,
@@ -72,7 +77,8 @@ def main_basic_read(pool_params):
 def main_basic_write(pool_params):
     args, tid, ctxt = pool_params
     start_time = time.time()
-    AsyncIOBuilder().load().aio_write(ctxt['buffer'],
+    get_accelerator().create_op_builder(AsyncIOBuilder).load().aio_write(
+        ctxt['buffer'],
         ctxt['file'],
         args.block_size,
         args.queue_depth,
@@ -9,8 +9,9 @@ import torch
 import os
 import time
 from multiprocessing import Pool, Barrier
-from deepspeed.ops.aio import AsyncIOBuilder
 from test_ds_aio_utils import report_results, task_log, task_barrier
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import AsyncIOBuilder


 def pre_handle(args, tid, read_op):
@@ -20,16 +21,22 @@ def pre_handle(args, tid, read_op):

     task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
     if args.gpu:
-        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda')
+        buffer = torch.empty(num_bytes,
+                             dtype=torch.uint8,
+                             device=get_accelerator().device_name())
     else:
-        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
+        buffer = get_accelerator().pin_memory(
+            torch.empty(num_bytes,
+                        dtype=torch.uint8,
+                        device='cpu'))
     task_log(
         tid,
         f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
     )

     io_parallel = args.io_parallel if args.io_parallel else 1
-    handle = AsyncIOBuilder().load().aio_handle(args.block_size,
+    handle = get_accelerator().create_op_builder(AsyncIOBuilder).load().aio_handle(
+        args.block_size,
         args.queue_depth,
         args.single_submit,
         args.overlap_events,
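The allocation changes in isolation: a minimal sketch of the device-agnostic buffer pattern the benchmarks now use, composed only of calls appearing in the hunks above:

    import torch
    from deepspeed.accelerator import get_accelerator

    def alloc_buffer(num_bytes, on_device=False):
        if on_device:
            # placed on whatever device the active accelerator reports, e.g. 'cuda'
            return torch.empty(num_bytes, dtype=torch.uint8,
                               device=get_accelerator().device_name())
        # pinned host memory requested through the same abstraction
        return get_accelerator().pin_memory(
            torch.empty(num_bytes, dtype=torch.uint8, device='cpu'))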
@@ -4,5 +4,6 @@ Licensed under the MIT license.

 Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
 """
-from deepspeed.ops.aio import AsyncIOBuilder
-assert AsyncIOBuilder().is_compatible()
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import AsyncIOBuilder
+assert get_accelerator().create_op_builder(AsyncIOBuilder).is_compatible()
deepspeed/accelerator (new symbolic link)
@@ -0,0 +1 @@
+../accelerator/
@@ -31,6 +31,7 @@ import os

 from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT, default_pg_timeout
 from .constants import *
+from deepspeed.accelerator import get_accelerator


 class ReduceOp(Enum):
@@ -127,7 +128,7 @@ def timed_op(func):
         finally:
             if comms_logger.enabled:
                 # Need to make op blocking for accurate logging
-                torch.cuda.synchronize()
+                get_accelerator().synchronize()
                 # If we're using MPI, we can't simply sync the stream
                 if cdb.using_mpi:
                     cdb.barrier()
@@ -595,7 +596,7 @@ def get_global_rank(group=None, group_rank=0):


 # Main DeepSpeed Comms. public API.
-def init_distributed(dist_backend="nccl",
+def init_distributed(dist_backend=None,
                      auto_mpi_discovery=True,
                      distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT,
                      verbose=True,
@@ -650,6 +651,8 @@ def init_distributed(dist_backend="nccl",
         utils.logger.info('Distributed backend already initialized')
     else:
         assert isinstance(timeout, timedelta)
+        if dist_backend == None:
+            dist_backend = get_accelerator().communication_backend_name()
         if int(os.getenv('RANK', '0')) == 0:
             utils.logger.info(
                 'Initializing TorchBackend in DeepSpeed with backend {}'.format(
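Because dist_backend now defaults to None and is resolved from the accelerator, callers can omit the backend entirely. A usage sketch:

    import deepspeed

    # the backend falls back to get_accelerator().communication_backend_name(),
    # i.e. 'nccl' when the CUDA accelerator is active
    deepspeed.init_distributed()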
@@ -2,8 +2,9 @@ import torch
 import deepspeed
 import subprocess
 import argparse
-from .ops.op_builder import ALL_OPS
+from .ops.op_builder.all_ops import ALL_OPS
 from .git_version_info import installed_ops, torch_info
+from deepspeed.accelerator import get_accelerator

 GREEN = '\033[92m'
 RED = '\033[91m'
@@ -79,31 +80,33 @@ def nvcc_version():
 def debug_report():
     max_dots = 33

-    hip_version = None
-    if hasattr(torch.version, 'hip'):
-        hip_version = torch.version.hip
-
     report = [
         ("torch install path",
          torch.__path__),
         ("torch version",
          torch.__version__),
-        ("torch cuda version",
-         torch.version.cuda),
-        ("torch hip version",
-         hip_version),
-        ("nvcc version",
-         (None if hip_version else nvcc_version())),
         ("deepspeed install path",
          deepspeed.__path__),
         ("deepspeed info",
          f"{deepspeed.__version__}, {deepspeed.__git_hash__}, {deepspeed.__git_branch__}"
-         ),
-        ("deepspeed wheel compiled w.",
-         f"torch {torch_info['version']}, " +
-         (f"hip {torch_info['hip_version']}"
-          if hip_version else f"cuda {torch_info['cuda_version']}")),
-    ]
+         )
+    ]
+    if get_accelerator().device_name() == 'cuda':
+        hip_version = getattr(torch.version, "hip", None)
+        report.extend([("torch cuda version",
+                        torch.version.cuda),
+                       ("torch hip version",
+                        hip_version),
+                       ("nvcc version",
+                        (None if hip_version else nvcc_version())),
+                       ("deepspeed wheel compiled w.",
+                        f"torch {torch_info['version']}, " +
+                        (f"hip {torch_info['hip_version']}"
+                         if hip_version else f"cuda {torch_info['cuda_version']}"))])
+    else:
+        report.extend([("deepspeed wheel compiled w.",
+                        f"torch {torch_info['version']} ")])

     print("DeepSpeed general environment info:")
     for name, value in report:
         print(name, "." * (max_dots - len(name)), value)
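To see the effect of the new branch, debug_report can be called directly (deepspeed.env_report is the module shown in this hunk):

    from deepspeed.env_report import debug_report

    # CUDA/HIP/nvcc rows are appended only when device_name() == 'cuda'
    debug_report()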
@@ -11,7 +11,7 @@ except ModuleNotFoundError:
     git_hash = '[none]'
     git_branch = '[none]'

-from .ops.op_builder import ALL_OPS
+from .ops.op_builder.all_ops import ALL_OPS
 installed_ops = dict.fromkeys(ALL_OPS.keys(), False)
 compatible_ops = dict.fromkeys(ALL_OPS.keys(), False)
 torch_info = {'version': "0.0", "cuda_version": "0.0", "hip_version": "0.0"}
@@ -3,7 +3,8 @@ Copyright 2020 The Microsoft DeepSpeed Team
 '''

 import torch
-from ..op_builder import CPUAdagradBuilder
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import CPUAdagradBuilder
 from deepspeed.utils.logging import should_log_le


@@ -24,7 +25,8 @@ class DeepSpeedCPUAdagrad(torch.optim.Optimizer):
         self.opt_id = DeepSpeedCPUAdagrad.optimizer_id
         DeepSpeedCPUAdagrad.optimizer_id = DeepSpeedCPUAdagrad.optimizer_id + 1
         self.fp32_optimizer_states = fp32_optimizer_states
-        self.ds_opt_adagrad = CPUAdagradBuilder().load()
+        self.ds_opt_adagrad = get_accelerator().create_op_builder(
+            CPUAdagradBuilder).load()

         self.ds_opt_adagrad.create_adagrad(self.opt_id,
                                            lr,
@@ -4,9 +4,10 @@ Copyright 2020 The Microsoft DeepSpeed Team

 import torch
 from cpuinfo import get_cpu_info
-from ..op_builder import CPUAdamBuilder
 from deepspeed.utils import logger
 from deepspeed.utils.logging import should_log_le
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import CPUAdamBuilder


 class DeepSpeedCPUAdam(torch.optim.Optimizer):
@@ -91,7 +92,7 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer):
         DeepSpeedCPUAdam.optimizer_id = DeepSpeedCPUAdam.optimizer_id + 1
         self.adam_w_mode = adamw_mode
         self.fp32_optimizer_states = fp32_optimizer_states
-        self.ds_opt_adam = CPUAdamBuilder().load()
+        self.ds_opt_adam = get_accelerator().create_op_builder(CPUAdamBuilder).load()

         self.ds_opt_adam.create_adam(self.opt_id,
                                      lr,
@@ -9,7 +9,8 @@ import torch
 from .multi_tensor_apply import MultiTensorApply

 multi_tensor_applier = MultiTensorApply(2048 * 32)
-from ..op_builder import FusedAdamBuilder
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import FusedAdamBuilder


 class FusedAdam(torch.optim.Optimizer):
@@ -69,9 +70,9 @@ class FusedAdam(torch.optim.Optimizer):
         self.adam_w_mode = 1 if adam_w_mode else 0
         self.set_grad_none = set_grad_none

-        fused_adam_cuda = FusedAdamBuilder().load()
+        fused_adam_cuda = get_accelerator().create_op_builder(FusedAdamBuilder).load()
         # Skip buffer
-        self._dummy_overflow_buf = torch.cuda.IntTensor([0])
+        self._dummy_overflow_buf = get_accelerator().IntTensor([0])
         self.multi_tensor_adam = fused_adam_cuda.multi_tensor_adam

     def zero_grad(self):
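The overflow-buffer line applies the same abstraction to tensor construction. A minimal sketch, assuming get_accelerator().IntTensor stands in for torch.cuda.IntTensor as the hunk implies:

    from deepspeed.accelerator import get_accelerator

    # int32 tensor on the active accelerator, replacing torch.cuda.IntTensor([0])
    overflow_buf = get_accelerator().IntTensor([0])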
@@ -6,7 +6,8 @@ This file is adapted from NVIDIA/apex/optimizer/fused_adam and implements the LAMB optimizer
 '''
 import types
 import torch
-from ..op_builder import FusedLambBuilder
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import FusedLambBuilder


 class FusedLamb(torch.optim.Optimizer):
@@ -48,7 +49,8 @@ class FusedLamb(torch.optim.Optimizer):
                  max_coeff=10.0,
                  min_coeff=0.01,
                  amsgrad=False):
-        self.fused_lamb_cuda = FusedLambBuilder().load()
+        self.fused_lamb_cuda = get_accelerator().create_op_builder(
+            FusedLambBuilder).load()

         if amsgrad:
             raise RuntimeError('FusedLamb does not support the AMSGrad variant.')
@@ -3,7 +3,8 @@ Copyright 2020 The Microsoft DeepSpeed Team
 '''
 import torch

-from ..op_builder import QuantizerBuilder
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import QuantizerBuilder

 # Cuda modules will be imported if needed
 quantizer_cuda_module = None
@@ -13,7 +14,8 @@ def ds_quantizer(input, groups=1, bit_num=8, sr=False, asym=False):
     # Load cuda modules if needed
     global quantizer_cuda_module
     if quantizer_cuda_module is None:
-        quantizer_cuda_module = QuantizerBuilder().load()
+        quantizer_cuda_module = get_accelerator().create_op_builder(
+            QuantizerBuilder).load()
     if sr:
         if asym:
             quantize_func = quantizer_cuda_module.ds_sr_quantize_asym_fp16 if input.dtype == torch.half else quantizer_cuda_module.ds_sr_quantize_asym_fp32
@@ -3,7 +3,8 @@ Copyright 2022 The Microsoft DeepSpeed Team
 """
 import torch

-from ..op_builder import RandomLTDBuilder
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import RandomLTDBuilder
 """
 Returns:
     sampled_indices: [layers, batch_size, reserved_length]
@@ -28,7 +29,7 @@ def gpt_sample_tokens(reserved_length: int,
                            reserved_length).to(torch.int32)
     global random_ltd_module
     if random_ltd_module is None:
-        random_ltd_module = RandomLTDBuilder().load()
+        random_ltd_module = get_accelerator().create_op_builder(RandomLTDBuilder).load()
     sampled_indices = random_ltd_module.token_sort_(sampled_indices, seq_length)

     # Not certain the optimized kernel is actually better here, cause it kind of screws
@@ -64,7 +65,7 @@ def bert_sample_tokens(reserved_length: int,
                            reserved_length).to(torch.int32)
     global random_ltd_module
     if random_ltd_module is None:
-        random_ltd_module = RandomLTDBuilder().load()
+        random_ltd_module = get_accelerator().create_op_builder(RandomLTDBuilder).load()

     sampled_indices = random_ltd_module.token_sort_(sampled_indices, seq_length)
     dtype = sampled_indices.dtype
@@ -89,7 +90,8 @@ class GatherTokens(torch.autograd.Function):
                 batch_first: bool):
         global random_ltd_module
         if random_ltd_module is None:
-            random_ltd_module = RandomLTDBuilder().load()
+            random_ltd_module = get_accelerator().create_op_builder(
+                RandomLTDBuilder).load()
         ctx.save_for_backward(activations, sorted_indices)
         ctx.batch_first = batch_first
         return activations, random_ltd_module.token_gather(activations, sorted_indices, batch_first)
@@ -100,7 +102,8 @@ class GatherTokens(torch.autograd.Function):
         g_gradients = g_gradients.contiguous()
         global random_ltd_module
         if random_ltd_module is None:
-            random_ltd_module = RandomLTDBuilder().load()
+            random_ltd_module = get_accelerator().create_op_builder(
+                RandomLTDBuilder).load()
         activations, sorted_indices = ctx.saved_tensors
         batch_first = ctx.batch_first

@@ -119,7 +122,8 @@ class ScatterTokens(torch.autograd.Function):
                 batch_first: bool):
         global random_ltd_module
         if random_ltd_module is None:
-            random_ltd_module = RandomLTDBuilder().load()
+            random_ltd_module = get_accelerator().create_op_builder(
+                RandomLTDBuilder).load()
         scatter_results = random_ltd_module.token_scatter_(all_activations.clone(),
                                                            layer_activations,
                                                            sorted_indices,
@@ -135,7 +139,8 @@ class ScatterTokens(torch.autograd.Function):
         out_gradients = out_gradients.contiguous()
         global random_ltd_module
         if random_ltd_module is None:
-            random_ltd_module = RandomLTDBuilder().load()
+            random_ltd_module = get_accelerator().create_op_builder(
+                RandomLTDBuilder).load()
         sorted_indices, = ctx.saved_tensors
         batch_first = ctx.batch_first
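Every random_ltd entry point repeats the same lazy-load idiom. Factored out as a sketch (the helper name is illustrative, not from the commit):

    from deepspeed.accelerator import get_accelerator
    from deepspeed.ops.op_builder.builder_names import RandomLTDBuilder

    random_ltd_module = None

    def _get_random_ltd_module():
        # compile/load the op once on first use, then reuse the cached module
        global random_ltd_module
        if random_ltd_module is None:
            random_ltd_module = get_accelerator().create_op_builder(RandomLTDBuilder).load()
        return random_ltd_module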
@@ -6,6 +6,7 @@ import torch
 import triton
 import triton.language as tl
 import triton._C.libtriton as libtriton
+from deepspeed.accelerator import get_accelerator


 @triton.jit
@@ -948,7 +949,7 @@ class MatMul:
             raise ValueError(
                 f"Inputs must be on the same device; got {a.device} for tensor A "
                 f"and {b.device} for tensor B")
-        if not a.is_cuda:
+        if not get_accelerator().on_accelerator(a):
             raise ValueError("Only GPU devices are supported for now")

         # When autocast is enabled, torch.matmul autocasts to float16, so we do the same here
@@ -4,8 +4,8 @@ Copyright 2022 The Microsoft DeepSpeed Team

 from typing import Optional
 import torch

-from ... import op_builder
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import SpatialInferenceBuilder

 spatial_cuda_module = None

@@ -16,7 +16,8 @@ def nhwc_bias_add(activation: torch.Tensor,
                   other_bias: Optional[torch.Tensor] = None) -> torch.Tensor:
     global spatial_cuda_module
     if spatial_cuda_module is None:
-        spatial_cuda_module = op_builder.SpatialInferenceBuilder().load()
+        spatial_cuda_module = get_accelerator().create_op_builder(
+            SpatialInferenceBuilder).load()

     if other is None:
         return spatial_cuda_module.nhwc_bias_add(activation, bias)
@@ -4,10 +4,12 @@ Copyright 2022 The Microsoft DeepSpeed Team
 import math
 import torch
 from torch.autograd import Function
-from ... import op_builder
 import torch.nn as nn
 from packaging import version as pkg_version
 from deepspeed.utils.logging import log_dist
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import InferenceBuilder

 # Cuda modules will be imported if needed
 inference_cuda_module = None
 minus_inf = -10000.0
@@ -140,14 +142,15 @@ class DeepSpeedDiffusersAttention(nn.Module):
         self.config = config
         self.config.layer_id = DeepSpeedDiffusersAttention.layer_id
         DeepSpeedDiffusersAttention.layer_id += 1
-        device = torch.cuda.current_device() if config.bigscience_bloom else 'cpu'
+        device = get_accelerator().current_device_name(
+        ) if config.bigscience_bloom else 'cpu'
         qkv_size_per_partition = (self.config.hidden_size // self.config.mp_size) * 3

         data_type = torch.int8 if config.q_int8 else torch.half if config.fp16 else torch.float
         data_type_fp = torch.half if config.fp16 else torch.float
         global inference_cuda_module
         if inference_cuda_module is None:
-            builder = op_builder.InferenceBuilder()
+            builder = get_accelerator().create_op_builder(InferenceBuilder)
             inference_cuda_module = builder.load()

         if DeepSpeedDiffusersAttention.layer_id == 1:
@@ -4,12 +4,13 @@ Copyright 2022 The Microsoft DeepSpeed Team

 import torch
 import torch.nn as nn
-from ... import op_builder

 from deepspeed import module_inject
 from .diffusers_attention import DeepSpeedDiffusersAttention
 from .bias_add import nhwc_bias_add
 from .diffusers_2d_transformer import Diffusers2DTransformerConfig
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import InferenceBuilder, SpatialInferenceBuilder

 # Ops will be loaded on demand
 transformer_cuda_module = None
@@ -19,14 +20,16 @@ spatial_cuda_module = None
 def load_transformer_module():
     global transformer_cuda_module
     if transformer_cuda_module is None:
-        transformer_cuda_module = op_builder.InferenceBuilder().load()
+        transformer_cuda_module = get_accelerator().create_op_builder(
+            InferenceBuilder).load()
     return transformer_cuda_module


 def load_spatial_module():
     global spatial_cuda_module
     if spatial_cuda_module is None:
-        spatial_cuda_module = op_builder.SpatialInferenceBuilder().load()
+        spatial_cuda_module = get_accelerator().create_op_builder(
+            SpatialInferenceBuilder).load()
     return spatial_cuda_module
@@ -6,6 +6,7 @@ import math
 import torch
 import torch.nn as nn
 from deepspeed import comm as dist
+from deepspeed.accelerator import get_accelerator
 from .op_binding import LinearOp, VectorMatMulOp, SoftmaxContextOp, QKVGemmOp, SoftmaxOp

 minus_inf = -10000.0
@@ -27,7 +28,8 @@ class DeepSpeedSelfAttention(nn.Module):
         data_type_fp = torch.half if config.fp16 else torch.float
         self.config.layer_id = DeepSpeedSelfAttention.num_layers
         DeepSpeedSelfAttention.num_layers = DeepSpeedSelfAttention.num_layers + 1
-        device = torch.cuda.current_device() #if config.bigscience_bloom else 'cpu'
+        device = get_accelerator().current_device_name(
+        ) #if config.bigscience_bloom else 'cpu'
         qkv_size_per_partition = (self.config.hidden_size // self.config.mp_size) * 3
         self.attn_qkvw = nn.Parameter(torch.empty(self.config.hidden_size,
                                                   qkv_size_per_partition,
@@ -199,7 +201,7 @@ class BloomSelfAttention(DeepSpeedSelfAttention):
             input_mask = torch.empty(1)

         mixed_x_layer = qkv_out
-        alibi = alibi.to(torch.cuda.current_device())
+        alibi = alibi.to(get_accelerator().current_device_name())
         head_dim = self.hidden_size_per_partition // self.num_attention_heads_per_partition
         new_tensor_shape = mixed_x_layer.size()[:-1] + (
             self.num_attention_heads_per_partition,
@@ -6,9 +6,10 @@ import torch
 from torch.autograd import Function
 from deepspeed.utils.types import ActivationFuncType
 from deepspeed import comm as dist
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import InferenceBuilder
 import torch.nn as nn
 import math
-from ... import op_builder

 inference_cuda_module = None

@@ -99,7 +100,8 @@ class DeepSpeedMLP(nn.Module):
         self.config = config
         data_type = torch.int8 if config.q_int8 else torch.half if config.fp16 else torch.float
         data_type_fp = torch.half if config.fp16 else torch.float
-        device = torch.cuda.current_device() #if config.bigscience_bloom else 'cpu'
+        device = get_accelerator().current_device_name(
+        ) #if config.bigscience_bloom else 'cpu'
         self.attn_nw = nn.Parameter(torch.empty(self.config.hidden_size,
                                                 dtype=data_type_fp,
                                                 device=device),
@@ -136,7 +138,7 @@ class DeepSpeedMLP(nn.Module):
         # load the cuda module
         global inference_cuda_module
         if inference_cuda_module is None:
-            builder = op_builder.InferenceBuilder()
+            builder = get_accelerator().create_op_builder(InferenceBuilder)
             inference_cuda_module = builder.load()

         self.mp_group = mp_group
@@ -5,7 +5,6 @@ import json
 import math
 import torch
 from torch.autograd import Function
-from ... import op_builder
 #from ...inference.engine import inference_cuda_module, specialized_mode
 # Cuda modules will be imported if needed
 inference_cuda_module = None
@@ -15,6 +14,8 @@ from .ds_attention import DeepSpeedSelfAttention
 from .config import DeepSpeedInferenceConfig
 from ....moe.sharded_moe import TopKGate
 from deepspeed import comm as dist
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import InferenceBuilder


 class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig):
@@ -241,15 +242,14 @@ class DeepSpeedMoEInference(nn.Module):
         global specialized_mode
         if inference_cuda_module is None:
             specialized_mode = False
-            if hasattr(op_builder, 'InferenceSpecializedBuilder'):
-                builder = op_builder.InferenceSpecializedBuilder()
-                if builder.is_compatible():
-                    inference_cuda_module = builder.load()
-                    specialized_mode = True
-                else:
-                    inference_cuda_module = op_builder.InferenceBuilder().load()
+            # InferenceSpecializedBuilder is not among DeepSpeed provided builder yet, so we infer by builder name string
+            builder = get_accelerator().create_op_builder("InferenceSpecializedBuilder")
+            if builder != None and builder.is_compatible():
+                inference_cuda_module = builder.load()
+                specialized_mode = True
             else:
-                inference_cuda_module = op_builder.InferenceBuilder().load()
+                inference_cuda_module = get_accelerator().create_op_builder(
+                    InferenceBuilder).load()
         self.config.specialized_mode = specialized_mode

         DeepSpeedMoEInference.layer_id += 1
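The MoE path shows why create_op_builder returns None rather than raising: callers can probe for builders that may not exist, by plain string name. The same probe as a sketch (the builder name string comes from the hunk above):

    from deepspeed.accelerator import get_accelerator
    from deepspeed.ops.op_builder.builder_names import InferenceBuilder

    builder = get_accelerator().create_op_builder("InferenceSpecializedBuilder")
    if builder is not None and builder.is_compatible():
        inference_module = builder.load()  # specialized kernels available
    else:
        inference_module = get_accelerator().create_op_builder(InferenceBuilder).load()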
@@ -6,8 +6,8 @@ import math
 import torch
 from torch import nn
 from torch.autograd import Function

-from ..op_builder import TransformerBuilder, StochasticTransformerBuilder
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import TransformerBuilder, StochasticTransformerBuilder

 # Cuda modules will be imported if needed
 transformer_cuda_module = None
@@ -481,7 +481,7 @@ class DeepSpeedTransformerLayer(nn.Module):
             print("DeepSpeed Transformer config is ", self.config.__dict__)

         if self.config.local_rank >= 0:
-            torch.cuda.set_device(self.config.local_rank)
+            get_accelerator().set_device(self.config.local_rank)

         if initial_weights is None and initial_biases is None:
             self.attn_qkvw = nn.Parameter(
@@ -531,9 +531,11 @@ class DeepSpeedTransformerLayer(nn.Module):
         # Load cuda modules if needed
         global transformer_cuda_module, stochastic_transformer_cuda_module
         if transformer_cuda_module is None and not self.config.stochastic_mode:
-            transformer_cuda_module = TransformerBuilder().load()
+            transformer_cuda_module = get_accelerator().create_op_builder(
+                TransformerBuilder).load()
         if stochastic_transformer_cuda_module is None and self.config.stochastic_mode:
-            stochastic_transformer_cuda_module = StochasticTransformerBuilder().load()
+            stochastic_transformer_cuda_module = get_accelerator().create_op_builder(
+                StochasticTransformerBuilder).load()

         # create the layer in cuda kernels.
         cuda_module = stochastic_transformer_cuda_module if self.config.stochastic_mode else transformer_cuda_module
op_builder/__init__.py (executable file → normal file)
@@ -15,23 +15,3 @@ from .transformer_inference import InferenceBuilder
 from .quantizer import QuantizerBuilder
 from .spatial_inference import SpatialInferenceBuilder
 from .builder import get_default_compute_capabilities, OpBuilder
-
-# TODO: This will be removed eventurally when all files containing reference to ALL_OPS redirected to op_builder.all_ops
-# TODO: infer this list instead of hard coded
-# List of all available ops
-__op_builders__ = [
-    CPUAdamBuilder(),
-    CPUAdagradBuilder(),
-    FusedAdamBuilder(),
-    FusedLambBuilder(),
-    SparseAttnBuilder(),
-    TransformerBuilder(),
-    StochasticTransformerBuilder(),
-    AsyncIOBuilder(),
-    UtilsBuilder(),
-    QuantizerBuilder(),
-    InferenceBuilder(),
-    SpatialInferenceBuilder(),
-    RandomLTDBuilder()
-]
-ALL_OPS = {op.name: op for op in __op_builders__}
@@ -4,6 +4,10 @@ Copyright 2020 The Microsoft DeepSpeed Team
 import os
 import pkgutil
 import importlib
+try:
+    # during installation time accelerator is visible, otherwise return deepspeed.accelerator
+    from accelerator import get_accelerator
+except ImportError:
+    from deepspeed.accelerator import get_accelerator

 # List of all available ops
@@ -5,7 +5,12 @@ import importlib

 # List of all available op builders from deepspeed op_builder

+try:
+    import op_builder  # noqa: F401
+    op_builder_dir = "op_builder"
+except ImportError:
+    op_builder_dir = "deepspeed.ops.op_builder"

 op_builder_module = importlib.import_module(op_builder_dir)
 __op_builders__ = []
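This try/except probe is the mechanism behind the 'more flex op_builder path' bullet: at install time the top-level op_builder package is importable, while at runtime only the copy under deepspeed.ops is. The idiom in isolation:

    import importlib

    # resolve the op_builder namespace without knowing the install context
    try:
        import op_builder  # noqa: F401  (visible during installation)
        op_builder_dir = "op_builder"
    except ImportError:
        op_builder_dir = "deepspeed.ops.op_builder"  # visible at runtime

    op_builder_module = importlib.import_module(op_builder_dir)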
setup.py
@@ -25,13 +25,13 @@ import time
 torch_available = True
 try:
     import torch
     from torch.utils.cpp_extension import BuildExtension
 except ImportError:
     torch_available = False
     print('[WARNING] Unable to import torch, pre-compiling ops will be disabled. ' \
         'Please visit https://pytorch.org/ to see how to properly install torch on your system.')

-from op_builder import ALL_OPS, get_default_compute_capabilities, OpBuilder
+from op_builder import get_default_compute_capabilities, OpBuilder
+from op_builder.all_ops import ALL_OPS
+from op_builder.builder import installed_cuda_version

 # fetch rocm state
@@ -91,7 +91,9 @@ cmdclass = {}

 # For any pre-installed ops force disable ninja
 if torch_available:
-    cmdclass['build_ext'] = BuildExtension.with_options(use_ninja=False)
+    from accelerator import get_accelerator
+    cmdclass['build_ext'] = get_accelerator().build_extension().with_options(
+        use_ninja=False)

 if torch_available:
     TORCH_MAJOR = torch.__version__.split('.')[0]
@@ -195,6 +197,7 @@ if sys.platform == "win32":
     # It needs Administrator privilege to create symlinks on Windows.
     create_dir_symlink('..\\..\\csrc', '.\\deepspeed\\ops\\csrc')
     create_dir_symlink('..\\..\\op_builder', '.\\deepspeed\\ops\\op_builder')
+    create_dir_symlink('..\\accelerator', '.\\deepspeed\\accelerator')
     egg_info.manifest_maker.template = 'MANIFEST_win.in'

 # Parse the DeepSpeed version string from version.txt