From 9548d48f48a53b97bdbeada374b7f35ad2f1c655 Mon Sep 17 00:00:00 2001
From: "Ma, Guokai"
Date: Sat, 7 Jan 2023 12:40:58 +0800
Subject: [PATCH] Abstract accelerator (step 2) (#2560)

* Abstract accelerator (step 2)
* more flexible op_builder path for both installation and runtime
* add SpatialInferenceBuilder into cuda_accelerator.py
* use reflection to make cuda_accelerator adapt to CUDA op builder changes automatically
* clean up deepspeed/__init__.py
* add comments in cuda_accelerator for the no-torch path
* Update deepspeed/env_report.py

  Change env_report.py according to review suggestions

  Co-authored-by: Michael Wyatt
* reduce the range of try...except for better code clarity
* Add porting for deepspeed/ops/random_ltd/dropping_utils.py
* move accelerator to top directory and create symlink under deepspeed

Co-authored-by: Olatunji Ruwase
Co-authored-by: Michael Wyatt
Co-authored-by: Jeff Rasley
---
 MANIFEST.in | 1 +
 MANIFEST_win.in | 1 +
 .../accelerator => accelerator}/__init__.py | 0
 .../abstract_accelerator.py | 0
 .../cuda_accelerator.py | 81 ++++++++++---
 .../real_accelerator.py | 2 +-
 csrc/aio/py_test/aio_bench_perf_sweep.py | 5 +-
 csrc/aio/py_test/ds_aio_basic.py | 38 +++++----
 csrc/aio/py_test/ds_aio_handle.py | 23 ++++--
 csrc/aio/py_test/validate_async_io.py | 5 +-
 deepspeed/accelerator | 1 +
 deepspeed/comm/comm.py | 7 +-
 deepspeed/env_report.py | 35 ++++----
 deepspeed/git_version_info.py | 2 +-
 deepspeed/ops/adagrad/cpu_adagrad.py | 6 +-
 deepspeed/ops/adam/cpu_adam.py | 5 +-
 deepspeed/ops/adam/fused_adam.py | 7 +-
 deepspeed/ops/lamb/fused_lamb.py | 6 +-
 deepspeed/ops/quantizer/quantizer.py | 6 +-
 deepspeed/ops/random_ltd/dropping_utils.py | 19 +++--
 deepspeed/ops/sparse_attention/matmul.py | 3 +-
 .../ops/transformer/inference/bias_add.py | 7 +-
 .../inference/diffusers_attention.py | 9 ++-
 .../inference/diffusers_transformer_block.py | 9 ++-
 .../ops/transformer/inference/ds_attention.py | 6 +-
 deepspeed/ops/transformer/inference/ds_mlp.py | 8 +-
 .../transformer/inference/moe_inference.py | 18 ++---
 deepspeed/ops/transformer/transformer.py | 12 +--
 op_builder/__init__.py | 20 -----
 op_builder/all_ops.py | 6 +-
 op_builder/builder_names.py | 7 +-
 setup.py | 9 ++-
 32 files changed, 206 insertions(+), 158 deletions(-)
 rename {deepspeed/accelerator => accelerator}/__init__.py (100%)
 rename {deepspeed/accelerator => accelerator}/abstract_accelerator.py (100%)
 rename {deepspeed/accelerator => accelerator}/cuda_accelerator.py (70%)
 rename {deepspeed/accelerator => accelerator}/real_accelerator.py (97%)
 create mode 120000 deepspeed/accelerator
 mode change 100755 => 100644 op_builder/__init__.py

diff --git a/MANIFEST.in b/MANIFEST.in
index a013ac40b..2fec750c6 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -4,3 +4,4 @@ recursive-include deepspeed *.cpp *.h *.cu *.hip *.tr *.cuh *.cc *.json
 recursive-include csrc *.cpp *.h *.cu *.tr *.cuh *.cc
 recursive-include op_builder *.py
 recursive-include benchmarks *.py
+recursive-include accelerator *.py
diff --git a/MANIFEST_win.in b/MANIFEST_win.in
index ddfe73e0b..f0426fb00 100644
--- a/MANIFEST_win.in
+++ b/MANIFEST_win.in
@@ -6,3 +6,4 @@ recursive-include deepspeed *.tr
 recursive-exclude deepspeed/ops/csrc *.cpp *.h *.cu *.cuh *.cc
 prune csrc
 prune op_builder
+prune accelerator
diff --git a/deepspeed/accelerator/__init__.py b/accelerator/__init__.py
similarity index 100%
rename from deepspeed/accelerator/__init__.py
rename to accelerator/__init__.py
diff --git a/deepspeed/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py
similarity index 100%
rename from deepspeed/accelerator/abstract_accelerator.py
rename to accelerator/abstract_accelerator.py
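The renames above promote deepspeed/accelerator to a top-level accelerator package. For orientation, a minimal sketch of the device-agnostic calling convention the rest of this patch migrates to (all of the accelerator methods used here — device_name, synchronize, communication_backend_name — appear in the hunks below):

    import torch
    from deepspeed.accelerator import get_accelerator

    acc = get_accelerator()                       # CUDA_Accelerator on NVIDIA systems
    x = torch.empty(4, device=acc.device_name())  # 'cuda', no hard-coded device string
    acc.synchronize()                             # replaces torch.cuda.synchronize()
    backend = acc.communication_backend_name()    # 'nccl' for the CUDA accelerator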
diff --git a/deepspeed/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py
similarity index 70%
rename from deepspeed/accelerator/cuda_accelerator.py
rename to accelerator/cuda_accelerator.py
index dacea640f..96290367f 100644
--- a/deepspeed/accelerator/cuda_accelerator.py
+++ b/accelerator/cuda_accelerator.py
@@ -1,5 +1,14 @@
-from deepspeed.accelerator.abstract_accelerator import DeepSpeedAccelerator
-import torch.cuda
+import os
+import pkgutil
+import importlib
+
+from .abstract_accelerator import DeepSpeedAccelerator
+# During the setup stage torch may not be installed; passing on a missing
+# torch still allows the op builder related APIs to be executed.
+try:
+    import torch.cuda
+except ImportError:
+    pass


 class CUDA_Accelerator(DeepSpeedAccelerator):
@@ -7,6 +16,25 @@ class CUDA_Accelerator(DeepSpeedAccelerator):
         self._name = 'cuda'
         self._communication_backend_name = 'nccl'

+        # begin initialize for create_op_builder()
+        # put all valid class name <--> class type mapping into class_dict
+        op_builder_dir = self.op_builder_dir()
+        op_builder_module = importlib.import_module(op_builder_dir)
+
+        for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(op_builder_module.__file__)]):
+            # avoid self references
+            if module_name != 'all_ops' and module_name != 'builder' and module_name != 'builder_names':
+                module = importlib.import_module("{}.{}".format(
+                    op_builder_dir,
+                    module_name))
+                for member_name in module.__dir__():
+                    if member_name.endswith(
+                            'Builder'
+                    ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "TorchCPUOpBuilder":  # avoid abstract classes
+                        if not member_name in self.class_dict:
+                            self.class_dict[member_name] = getattr(module, member_name)
+        # end initialize for create_op_builder()
+
     # Device APIs
     def device_name(self, device_index=None):
         if device_index == None:
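The constructor above registers every concrete builder it can find instead of hard-coding a list, so new CUDA op builders are picked up automatically. A standalone sketch of that reflection pattern, with the package name as a parameter (the real code additionally skips the helper modules all_ops, builder and builder_names):

    import importlib
    import os
    import pkgutil

    def collect_builders(pkg_name):
        # Scan every module in the package and keep classes whose names end in
        # 'Builder', excluding the abstract base classes.
        pkg = importlib.import_module(pkg_name)
        class_dict = {}
        for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(pkg.__file__)]):
            module = importlib.import_module(f"{pkg_name}.{module_name}")
            for member_name in dir(module):
                if member_name.endswith('Builder') and member_name not in (
                        'OpBuilder', 'CUDAOpBuilder', 'TorchCPUOpBuilder'):
                    class_dict.setdefault(member_name, getattr(module, member_name))
        return class_dict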
@@ -194,44 +222,21 @@ class CUDA_Accelerator(DeepSpeedAccelerator):
         return False

     def op_builder_dir(self):
-        return "deepspeed.ops.op_builder"
+        try:
+            # at installation time the top-level op_builder package is visible; otherwise fall back to deepspeed.ops.op_builder
+            import op_builder  # noqa: F401
+            return "op_builder"
+        except ImportError:
+            return "deepspeed.ops.op_builder"
+
+    # dict that holds class name <--> class type mapping i.e.
+    # 'AsyncIOBuilder': <class 'op_builder.async_io.AsyncIOBuilder'>
+    # this dict will be filled at init stage
+    class_dict = {}

     def create_op_builder(self, class_name):
-        from deepspeed.ops.op_builder import AsyncIOBuilder, CPUAdagradBuilder, CPUAdamBuilder, FusedAdamBuilder, FusedLambBuilder, QuantizerBuilder, SparseAttnBuilder, StochasticTransformerBuilder, TransformerBuilder, InferenceBuilder, UtilsBuilder
-        from deepspeed.ops.op_builder.builder_names import AsyncIOBuilder as AsyncIOBuilderName
-        from deepspeed.ops.op_builder.builder_names import CPUAdagradBuilder as CPUAdagradBuilderName
-        from deepspeed.ops.op_builder.builder_names import CPUAdamBuilder as CPUAdamBuilderName
-        from deepspeed.ops.op_builder.builder_names import FusedAdamBuilder as FusedAdamBuilderName
-        from deepspeed.ops.op_builder.builder_names import FusedLambBuilder as FusedLambBuilderName
-        from deepspeed.ops.op_builder.builder_names import QuantizerBuilder as QuantizerBuilderName
-        from deepspeed.ops.op_builder.builder_names import SparseAttnBuilder as SparseAttnBuilderName
-        from deepspeed.ops.op_builder.builder_names import StochasticTransformerBuilder as StochasticTransformerBuilderName
-        from deepspeed.ops.op_builder.builder_names import TransformerBuilder as TransformerBuilderName
-        from deepspeed.ops.op_builder.builder_names import InferenceBuilder as InferenceBuilderName
-        from deepspeed.ops.op_builder.builder_names import UtilsBuilder as UtilsBuilderName
-
-        if class_name == AsyncIOBuilderName:
-            return AsyncIOBuilder()
-        elif class_name == CPUAdagradBuilderName:
-            return CPUAdagradBuilder()
-        elif class_name == CPUAdamBuilderName:
-            return CPUAdamBuilder()
-        elif class_name == FusedAdamBuilderName:
-            return FusedAdamBuilder()
-        elif class_name == FusedLambBuilderName:
-            return FusedLambBuilder()
-        elif class_name == QuantizerBuilderName:
-            return QuantizerBuilder()
-        elif class_name == SparseAttnBuilderName:
-            return SparseAttnBuilder()
-        elif class_name == StochasticTransformerBuilderName:
-            return StochasticTransformerBuilder()
-        elif class_name == TransformerBuilderName:
-            return TransformerBuilder()
-        elif class_name == InferenceBuilderName:
-            return InferenceBuilder()
-        elif class_name == UtilsBuilderName:
-            return UtilsBuilder()
+        if class_name in self.class_dict:
+            return self.class_dict[class_name]()
         else:
             return None
diff --git a/deepspeed/accelerator/real_accelerator.py b/accelerator/real_accelerator.py
similarity index 97%
rename from deepspeed/accelerator/real_accelerator.py
rename to accelerator/real_accelerator.py
index 5d26a4727..8dda6e2a3 100644
--- a/deepspeed/accelerator/real_accelerator.py
+++ b/accelerator/real_accelerator.py
@@ -24,7 +24,7 @@ def get_accelerator():
         _validate_accelerator(ds_accelerator)
         return ds_accelerator

-    from deepspeed.accelerator.cuda_accelerator import CUDA_Accelerator
+    from .cuda_accelerator import CUDA_Accelerator
     ds_accelerator = CUDA_Accelerator()
     _validate_accelerator(ds_accelerator)
     return ds_accelerator
diff --git a/csrc/aio/py_test/aio_bench_perf_sweep.py b/csrc/aio/py_test/aio_bench_perf_sweep.py
index f16e962b1..72cbb10e4 100644
--- a/csrc/aio/py_test/aio_bench_perf_sweep.py
+++ b/csrc/aio/py_test/aio_bench_perf_sweep.py
@@ -15,6 +15,8 @@ import shutil
 from test_ds_aio_utils import refine_integer_value
 from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \
     READ_IO_DIR, WRITE_IO_DIR, READ_LOG_DIR, WRITE_LOG_DIR
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import AsyncIOBuilder

 OTHER_OPTIONS = '--handle'
 PERF_SCRIPT = 'test_ds_aio.py'
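With the table in place, create_op_builder() shrinks to a dictionary lookup keyed by the builder's class name. The names imported from builder_names are plain strings, so callers never import a CUDA-specific builder class directly; a sketch of the intended call site (aio_module is an illustrative variable name):

    from deepspeed.accelerator import get_accelerator
    from deepspeed.ops.op_builder.builder_names import AsyncIOBuilder  # the string 'AsyncIOBuilder'

    builder = get_accelerator().create_op_builder(AsyncIOBuilder)
    if builder is not None and builder.is_compatible():
        aio_module = builder.load()  # JIT-compile or fetch the pre-built extension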
@@ -277,8 +279,7 @@ def script_path():


 def async_io_setup():
-    from deepspeed.ops.aio import AsyncIOBuilder
-    return AsyncIOBuilder().is_compatible()
+    return get_accelerator().create_op_builder(AsyncIOBuilder).is_compatible()


 def get_block_size_and_count(io_bytes):
diff --git a/csrc/aio/py_test/ds_aio_basic.py b/csrc/aio/py_test/ds_aio_basic.py
index cf70b6655..977d76f30 100755
--- a/csrc/aio/py_test/ds_aio_basic.py
+++ b/csrc/aio/py_test/ds_aio_basic.py
@@ -8,9 +8,10 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
 import torch
 import os
 import time
-from deepspeed.ops.aio import AsyncIOBuilder
 from multiprocessing import Pool, Barrier
 from test_ds_aio_utils import report_results, task_log, task_barrier
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import AsyncIOBuilder


 def pre_basic(args, tid, read_op):
@@ -19,7 +20,10 @@ def pre_basic(args, tid, read_op):
     file = args.read_file if read_op else f'{args.write_file}.{tid}'

     task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
-    buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
+    buffer = get_accelerator().pin_memory(
+        torch.empty(num_bytes,
+                    dtype=torch.uint8,
+                    device='cpu'))
     task_log(
         tid,
         f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
     )
@@ -56,13 +60,14 @@ def post_basic(pool_params):
 def main_basic_read(pool_params):
     args, tid, ctxt = pool_params
     start_time = time.time()
-    AsyncIOBuilder().load().aio_read(ctxt['buffer'],
-                                     ctxt['file'],
-                                     args.block_size,
-                                     args.queue_depth,
-                                     args.single_submit,
-                                     args.overlap_events,
-                                     args.validate)
+    get_accelerator().create_op_builder(AsyncIOBuilder).load().aio_read(
+        ctxt['buffer'],
+        ctxt['file'],
+        args.block_size,
+        args.queue_depth,
+        args.single_submit,
+        args.overlap_events,
+        args.validate)
     end_time = time.time()
     ctxt['elapsed_sec'] += end_time - start_time

@@ -72,13 +77,14 @@ def main_basic_write(pool_params):
     args, tid, ctxt = pool_params
     start_time = time.time()
-    AsyncIOBuilder().load().aio_write(ctxt['buffer'],
-                                      ctxt['file'],
-                                      args.block_size,
-                                      args.queue_depth,
-                                      args.single_submit,
-                                      args.overlap_events,
-                                      args.validate)
+    get_accelerator().create_op_builder(AsyncIOBuilder).load().aio_write(
+        ctxt['buffer'],
+        ctxt['file'],
+        args.block_size,
+        args.queue_depth,
+        args.single_submit,
+        args.overlap_events,
+        args.validate)
     end_time = time.time()
     ctxt['elapsed_sec'] += end_time - start_time
diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py
index 947ee2e6c..6222e3c79 100755
--- a/csrc/aio/py_test/ds_aio_handle.py
+++ b/csrc/aio/py_test/ds_aio_handle.py
@@ -9,8 +9,9 @@ import torch
 import os
 import time
 from multiprocessing import Pool, Barrier
-from deepspeed.ops.aio import AsyncIOBuilder
 from test_ds_aio_utils import report_results, task_log, task_barrier
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import AsyncIOBuilder


 def pre_handle(args, tid, read_op):
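pre_basic() above shows the new allocation idiom: pinning goes through the accelerator object rather than torch.Tensor.pin_memory(), so a non-CUDA accelerator can substitute its own pinning logic. The idiom in isolation, as a sketch:

    import torch
    from deepspeed.accelerator import get_accelerator

    # allocate on the host, then let the active accelerator pin it
    buffer = get_accelerator().pin_memory(
        torch.empty(1024, dtype=torch.uint8, device='cpu'))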
@@ -20,20 +21,26 @@ def pre_handle(args, tid, read_op):

     task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
     if args.gpu:
-        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda')
+        buffer = torch.empty(num_bytes,
+                             dtype=torch.uint8,
+                             device=get_accelerator().device_name())
     else:
-        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
+        buffer = get_accelerator().pin_memory(
+            torch.empty(num_bytes,
+                        dtype=torch.uint8,
+                        device='cpu'))
     task_log(
         tid,
         f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
     )

     io_parallel = args.io_parallel if args.io_parallel else 1
-    handle = AsyncIOBuilder().load().aio_handle(args.block_size,
-                                                args.queue_depth,
-                                                args.single_submit,
-                                                args.overlap_events,
-                                                io_parallel)
+    handle = get_accelerator().create_op_builder(AsyncIOBuilder).load().aio_handle(
+        args.block_size,
+        args.queue_depth,
+        args.single_submit,
+        args.overlap_events,
+        io_parallel)
     task_log(tid, f'created deepspeed aio handle')

     ctxt = {}
diff --git a/csrc/aio/py_test/validate_async_io.py b/csrc/aio/py_test/validate_async_io.py
index 4db25fe1b..7ce5c52d5 100644
--- a/csrc/aio/py_test/validate_async_io.py
+++ b/csrc/aio/py_test/validate_async_io.py
@@ -4,5 +4,6 @@ Licensed under the MIT license.

 Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
 """
-from deepspeed.ops.aio import AsyncIOBuilder
-assert AsyncIOBuilder().is_compatible()
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import AsyncIOBuilder
+assert get_accelerator().create_op_builder(AsyncIOBuilder).is_compatible()
diff --git a/deepspeed/accelerator b/deepspeed/accelerator
new file mode 120000
index 000000000..b61ffface
--- /dev/null
+++ b/deepspeed/accelerator
@@ -0,0 +1 @@
+../accelerator/
\ No newline at end of file
diff --git a/deepspeed/comm/comm.py b/deepspeed/comm/comm.py
index 729853745..dc7a38a21 100644
--- a/deepspeed/comm/comm.py
+++ b/deepspeed/comm/comm.py
@@ -31,6 +31,7 @@ import os

 from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT, default_pg_timeout
 from .constants import *
+from deepspeed.accelerator import get_accelerator


 class ReduceOp(Enum):
@@ -127,7 +128,7 @@ def timed_op(func):
         finally:
             if comms_logger.enabled:
                 # Need to make op blocking for accurate logging
-                torch.cuda.synchronize()
+                get_accelerator().synchronize()
                 # If we're using MPI, we can't simply sync the stream
                 if cdb.using_mpi:
                     cdb.barrier()
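timed_op now drains the device through the accelerator before reading the clock, since ops launch asynchronously. The same timing pattern in a sketch (launch_collective is a hypothetical stand-in for the wrapped op):

    import time
    from deepspeed.accelerator import get_accelerator

    start = time.time()
    launch_collective()               # hypothetical asynchronous device work
    get_accelerator().synchronize()   # block until the device is idle
    elapsed = time.time() - start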
@@ -595,7 +596,7 @@ def get_global_rank(group=None, group_rank=0):


 # Main DeepSpeed Comms. public API.
-def init_distributed(dist_backend="nccl",
+def init_distributed(dist_backend=None,
                      auto_mpi_discovery=True,
                      distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT,
                      verbose=True,
@@ -650,6 +651,8 @@ def init_distributed(dist_backend="nccl",
         utils.logger.info('Distributed backend already initialized')
     else:
         assert isinstance(timeout, timedelta)
+        if dist_backend == None:
+            dist_backend = get_accelerator().communication_backend_name()
         if int(os.getenv('RANK', '0')) == 0:
             utils.logger.info(
                 'Initializing TorchBackend in DeepSpeed with backend {}'.format(
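Defaulting dist_backend to None means the accelerator now chooses its native communication backend at runtime, so a caller can simply write (a sketch; on the CUDA accelerator this resolves to 'nccl'):

    import deepspeed

    deepspeed.init_distributed()  # backend resolved via communication_backend_name()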
         print(name, "." * (max_dots - len(name)), value)
diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py
index c5949a8c4..ca115bf10 100644
--- a/deepspeed/env_report.py
+++ b/deepspeed/env_report.py
@@ -2,8 +2,9 @@ import torch
 import deepspeed
 import subprocess
 import argparse
-from .ops.op_builder import ALL_OPS
+from .ops.op_builder.all_ops import ALL_OPS
 from .git_version_info import installed_ops, torch_info
+from deepspeed.accelerator import get_accelerator

 GREEN = '\033[92m'
 RED = '\033[91m'
@@ -79,31 +80,33 @@ def nvcc_version():

 def debug_report():
     max_dots = 33
-    hip_version = None
-    if hasattr(torch.version, 'hip'):
-        hip_version = torch.version.hip
-
     report = [
         ("torch install path",
          torch.__path__),
         ("torch version",
          torch.__version__),
-        ("torch cuda version",
-         torch.version.cuda),
-        ("torch hip version",
-         hip_version),
-        ("nvcc version",
-         (None if hip_version else nvcc_version())),
         ("deepspeed install path",
          deepspeed.__path__),
        ("deepspeed info",
         f"{deepspeed.__version__}, {deepspeed.__git_hash__}, {deepspeed.__git_branch__}"
-         ),
-        ("deepspeed wheel compiled w.",
-         f"torch {torch_info['version']}, " +
-         (f"hip {torch_info['hip_version']}"
-          if hip_version else f"cuda {torch_info['cuda_version']}")),
+         )
     ]
+
+    if get_accelerator().device_name() == 'cuda':
+        hip_version = getattr(torch.version, "hip", None)
+        report.extend([("torch cuda version",
+                        torch.version.cuda),
+                       ("torch hip version",
+                        hip_version),
+                       ("nvcc version",
+                        (None if hip_version else nvcc_version())),
+                       ("deepspeed wheel compiled w.",
+                        f"torch {torch_info['version']}, " +
+                        (f"hip {torch_info['hip_version']}"
+                         if hip_version else f"cuda {torch_info['cuda_version']}"))])
+    else:
+        report.extend([("deepspeed wheel compiled w.",
+                        f"torch {torch_info['version']} ")])
+
     print("DeepSpeed general environment info:")
     for name, value in report:
         print(name, "." * (max_dots - len(name)), value)
diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py
index 5cd6d9f2f..071d4289b 100644
--- a/deepspeed/git_version_info.py
+++ b/deepspeed/git_version_info.py
@@ -11,7 +11,7 @@ except ModuleNotFoundError:
     git_hash = '[none]'
     git_branch = '[none]'

-    from .ops.op_builder import ALL_OPS
+    from .ops.op_builder.all_ops import ALL_OPS
     installed_ops = dict.fromkeys(ALL_OPS.keys(), False)
     compatible_ops = dict.fromkeys(ALL_OPS.keys(), False)
     torch_info = {'version': "0.0", "cuda_version": "0.0", "hip_version": "0.0"}
diff --git a/deepspeed/ops/adagrad/cpu_adagrad.py b/deepspeed/ops/adagrad/cpu_adagrad.py
index 2527259b1..d218f91a1 100755
--- a/deepspeed/ops/adagrad/cpu_adagrad.py
+++ b/deepspeed/ops/adagrad/cpu_adagrad.py
@@ -3,7 +3,8 @@ Copyright 2020 The Microsoft DeepSpeed Team
 '''

 import torch
-from ..op_builder import CPUAdagradBuilder
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import CPUAdagradBuilder
 from deepspeed.utils.logging import should_log_le


@@ -24,7 +25,8 @@ class DeepSpeedCPUAdagrad(torch.optim.Optimizer):
         self.opt_id = DeepSpeedCPUAdagrad.optimizer_id
         DeepSpeedCPUAdagrad.optimizer_id = DeepSpeedCPUAdagrad.optimizer_id + 1
         self.fp32_optimizer_states = fp32_optimizer_states
-        self.ds_opt_adagrad = CPUAdagradBuilder().load()
+        self.ds_opt_adagrad = get_accelerator().create_op_builder(
+            CPUAdagradBuilder).load()

         self.ds_opt_adagrad.create_adagrad(self.opt_id,
                                            lr,
diff --git a/deepspeed/ops/adam/cpu_adam.py b/deepspeed/ops/adam/cpu_adam.py
index 911e4924d..3ba149ac4 100755
--- a/deepspeed/ops/adam/cpu_adam.py
+++ b/deepspeed/ops/adam/cpu_adam.py
@@ -4,9 +4,10 @@ Copyright 2020 The Microsoft DeepSpeed Team

 import torch
 from cpuinfo import get_cpu_info
-from ..op_builder import CPUAdamBuilder
 from deepspeed.utils import logger
 from deepspeed.utils.logging import should_log_le
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import CPUAdamBuilder


 class DeepSpeedCPUAdam(torch.optim.Optimizer):
@@ -91,7 +92,7 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer):
         DeepSpeedCPUAdam.optimizer_id = DeepSpeedCPUAdam.optimizer_id + 1
         self.adam_w_mode = adamw_mode
         self.fp32_optimizer_states = fp32_optimizer_states
-        self.ds_opt_adam = CPUAdamBuilder().load()
+        self.ds_opt_adam = get_accelerator().create_op_builder(CPUAdamBuilder).load()

         self.ds_opt_adam.create_adam(self.opt_id,
                                      lr,
diff --git a/deepspeed/ops/adam/fused_adam.py b/deepspeed/ops/adam/fused_adam.py
index 5a1a1ddca..15f777ba6 100644
--- a/deepspeed/ops/adam/fused_adam.py
+++ b/deepspeed/ops/adam/fused_adam.py
@@ -9,7 +9,8 @@ import torch
 from .multi_tensor_apply import MultiTensorApply

 multi_tensor_applier = MultiTensorApply(2048 * 32)
-from ..op_builder import FusedAdamBuilder
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import FusedAdamBuilder


 class FusedAdam(torch.optim.Optimizer):
@@ -69,9 +70,9 @@ class FusedAdam(torch.optim.Optimizer):
         self.adam_w_mode = 1 if adam_w_mode else 0
         self.set_grad_none = set_grad_none

-        fused_adam_cuda = FusedAdamBuilder().load()
+        fused_adam_cuda = get_accelerator().create_op_builder(FusedAdamBuilder).load()
         # Skip buffer
-        self._dummy_overflow_buf = torch.cuda.IntTensor([0])
+        self._dummy_overflow_buf = get_accelerator().IntTensor([0])
         self.multi_tensor_adam = fused_adam_cuda.multi_tensor_adam

     def zero_grad(self):
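The FusedAdam hunk also swaps the hard-coded torch.cuda.IntTensor overflow buffer for an accelerator-provided one. Side by side, as a sketch:

    from deepspeed.accelerator import get_accelerator

    # before: self._dummy_overflow_buf = torch.cuda.IntTensor([0])
    overflow_buf = get_accelerator().IntTensor([0])  # device chosen by the accelerator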
diff --git a/deepspeed/ops/lamb/fused_lamb.py b/deepspeed/ops/lamb/fused_lamb.py
index e9210cdda..dcc669718 100644
--- a/deepspeed/ops/lamb/fused_lamb.py
+++ b/deepspeed/ops/lamb/fused_lamb.py
@@ -6,7 +6,8 @@ This file is adapted from NVIDIA/apex/optimizer/fused_adam and implements the LA
 '''
 import types
 import torch
-from ..op_builder import FusedLambBuilder
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import FusedLambBuilder


 class FusedLamb(torch.optim.Optimizer):
@@ -48,7 +49,8 @@ class FusedLamb(torch.optim.Optimizer):
                  max_coeff=10.0,
                  min_coeff=0.01,
                  amsgrad=False):
-        self.fused_lamb_cuda = FusedLambBuilder().load()
+        self.fused_lamb_cuda = get_accelerator().create_op_builder(
+            FusedLambBuilder).load()

         if amsgrad:
             raise RuntimeError('FusedLamb does not support the AMSGrad variant.')
diff --git a/deepspeed/ops/quantizer/quantizer.py b/deepspeed/ops/quantizer/quantizer.py
index 6b25d02d8..c442747cd 100755
--- a/deepspeed/ops/quantizer/quantizer.py
+++ b/deepspeed/ops/quantizer/quantizer.py
@@ -3,7 +3,8 @@ Copyright 2020 The Microsoft DeepSpeed Team
 '''

 import torch
-from ..op_builder import QuantizerBuilder
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import QuantizerBuilder

 # Cuda modules will be imported if needed
 quantizer_cuda_module = None
@@ -13,7 +14,8 @@ def ds_quantizer(input, groups=1, bit_num=8, sr=False, asym=False):
     # Load cuda modules if needed
     global quantizer_cuda_module
     if quantizer_cuda_module is None:
-        quantizer_cuda_module = QuantizerBuilder().load()
+        quantizer_cuda_module = get_accelerator().create_op_builder(
+            QuantizerBuilder).load()
     if sr:
         if asym:
             quantize_func = quantizer_cuda_module.ds_sr_quantize_asym_fp16 if input.dtype == torch.half else quantizer_cuda_module.ds_sr_quantize_asym_fp32
diff --git a/deepspeed/ops/random_ltd/dropping_utils.py b/deepspeed/ops/random_ltd/dropping_utils.py
index 62422d1b3..6abc5f674 100644
--- a/deepspeed/ops/random_ltd/dropping_utils.py
+++ b/deepspeed/ops/random_ltd/dropping_utils.py
@@ -3,7 +3,8 @@ Copyright 2022 The Microsoft DeepSpeed Team
 """

 import torch
-from ..op_builder import RandomLTDBuilder
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import RandomLTDBuilder
 """
 Returns:
     sampled_indices: [layers, batch_size, reserved_length]
@@ -28,7 +29,7 @@ def gpt_sample_tokens(reserved_length: int,
                                  reserved_length).to(torch.int32)
     global random_ltd_module
     if random_ltd_module is None:
-        random_ltd_module = RandomLTDBuilder().load()
+        random_ltd_module = get_accelerator().create_op_builder(RandomLTDBuilder).load()
     sampled_indices = random_ltd_module.token_sort_(sampled_indices, seq_length)

     # Not certain the optimized kernel is actually better here, cause it kind of screws
@@ -64,7 +65,7 @@ def bert_sample_tokens(reserved_length: int,
                                  reserved_length).to(torch.int32)
     global random_ltd_module
     if random_ltd_module is None:
-        random_ltd_module = RandomLTDBuilder().load()
+        random_ltd_module = get_accelerator().create_op_builder(RandomLTDBuilder).load()
     sampled_indices = random_ltd_module.token_sort_(sampled_indices, seq_length)
     dtype = sampled_indices.dtype

@@ -89,7 +90,8 @@ class GatherTokens(torch.autograd.Function):
                 batch_first: bool):
         global random_ltd_module
         if random_ltd_module is None:
-            random_ltd_module = RandomLTDBuilder().load()
+            random_ltd_module = get_accelerator().create_op_builder(
+                RandomLTDBuilder).load()
         ctx.save_for_backward(activations, sorted_indices)
         ctx.batch_first = batch_first
         return activations, random_ltd_module.token_gather(activations,
                                                            sorted_indices,
                                                            batch_first)
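dropping_utils.py repeats one idiom throughout: cache the loaded op module in a module-level global so the potentially slow JIT build runs at most once per process. The idiom in isolation (get_random_ltd_module is an illustrative helper name):

    from deepspeed.accelerator import get_accelerator
    from deepspeed.ops.op_builder.builder_names import RandomLTDBuilder

    random_ltd_module = None

    def get_random_ltd_module():
        global random_ltd_module
        if random_ltd_module is None:  # the first call pays the build/load cost
            random_ltd_module = get_accelerator().create_op_builder(
                RandomLTDBuilder).load()
        return random_ltd_module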
@@ -100,7 +102,8 @@ class GatherTokens(torch.autograd.Function):
         g_gradients = g_gradients.contiguous()
         global random_ltd_module
         if random_ltd_module is None:
-            random_ltd_module = RandomLTDBuilder().load()
+            random_ltd_module = get_accelerator().create_op_builder(
+                RandomLTDBuilder).load()
         activations, sorted_indices = ctx.saved_tensors
         batch_first = ctx.batch_first

@@ -119,7 +122,8 @@ class ScatterTokens(torch.autograd.Function):
                 batch_first: bool):
         global random_ltd_module
         if random_ltd_module is None:
-            random_ltd_module = RandomLTDBuilder().load()
+            random_ltd_module = get_accelerator().create_op_builder(
+                RandomLTDBuilder).load()
         scatter_results = random_ltd_module.token_scatter_(all_activations.clone(),
                                                            layer_activations,
                                                            sorted_indices,
@@ -135,7 +139,8 @@ class ScatterTokens(torch.autograd.Function):
         out_gradients = out_gradients.contiguous()
         global random_ltd_module
         if random_ltd_module is None:
-            random_ltd_module = RandomLTDBuilder().load()
+            random_ltd_module = get_accelerator().create_op_builder(
+                RandomLTDBuilder).load()
         sorted_indices, = ctx.saved_tensors
         batch_first = ctx.batch_first
diff --git a/deepspeed/ops/sparse_attention/matmul.py b/deepspeed/ops/sparse_attention/matmul.py
index 986666bde..554b0b18c 100755
--- a/deepspeed/ops/sparse_attention/matmul.py
+++ b/deepspeed/ops/sparse_attention/matmul.py
@@ -6,6 +6,7 @@ import torch
 import triton
 import triton.language as tl
 import triton._C.libtriton as libtriton
+from deepspeed.accelerator import get_accelerator


 @triton.jit
@@ -948,7 +949,7 @@ class MatMul:
             raise ValueError(
                 f"Inputs must be on the same device; got {a.device} for tensor A "
                 f"and {b.device} for tensor B")
-        if not a.is_cuda:
+        if not get_accelerator().on_accelerator(a):
             raise ValueError("Only GPU devices are supported for now")

         # When autocast is enabled, torch.matmul autocasts to float16, so we do the same here
diff --git a/deepspeed/ops/transformer/inference/bias_add.py b/deepspeed/ops/transformer/inference/bias_add.py
index a6ad7ed88..8ecf1f03b 100644
--- a/deepspeed/ops/transformer/inference/bias_add.py
+++ b/deepspeed/ops/transformer/inference/bias_add.py
@@ -4,8 +4,8 @@ Copyright 2022 The Microsoft DeepSpeed Team

 from typing import Optional
 import torch
-
-from ... import op_builder
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import SpatialInferenceBuilder

 spatial_cuda_module = None

@@ -16,7 +16,8 @@ def nhwc_bias_add(activation: torch.Tensor,
                   other_bias: Optional[torch.Tensor] = None) -> torch.Tensor:
     global spatial_cuda_module
     if spatial_cuda_module is None:
-        spatial_cuda_module = op_builder.SpatialInferenceBuilder().load()
+        spatial_cuda_module = get_accelerator().create_op_builder(
+            SpatialInferenceBuilder).load()

     if other is None:
         return spatial_cuda_module.nhwc_bias_add(activation, bias)
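The sparse-attention change replaces the CUDA-only tensor.is_cuda test with an accelerator-neutral query. A sketch of the guard (assert_on_device is an illustrative name):

    import torch
    from deepspeed.accelerator import get_accelerator

    def assert_on_device(t: torch.Tensor):
        # true when the tensor lives on the active accelerator, whatever it is
        if not get_accelerator().on_accelerator(t):
            raise ValueError("Only GPU devices are supported for now")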
diff --git a/deepspeed/ops/transformer/inference/diffusers_attention.py b/deepspeed/ops/transformer/inference/diffusers_attention.py
index 45a206a0a..39229b007 100644
--- a/deepspeed/ops/transformer/inference/diffusers_attention.py
+++ b/deepspeed/ops/transformer/inference/diffusers_attention.py
@@ -4,10 +4,12 @@ Copyright 2022 The Microsoft DeepSpeed Team
 import math
 import torch
 from torch.autograd import Function
-from ... import op_builder
 import torch.nn as nn
 from packaging import version as pkg_version
 from deepspeed.utils.logging import log_dist
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import InferenceBuilder
+
 # Cuda modules will be imported if needed
 inference_cuda_module = None
 minus_inf = -10000.0
@@ -140,14 +142,15 @@ class DeepSpeedDiffusersAttention(nn.Module):
         self.config = config
         self.config.layer_id = DeepSpeedDiffusersAttention.layer_id
         DeepSpeedDiffusersAttention.layer_id += 1
-        device = torch.cuda.current_device() if config.bigscience_bloom else 'cpu'
+        device = get_accelerator().current_device_name(
+        ) if config.bigscience_bloom else 'cpu'
         qkv_size_per_partition = (self.config.hidden_size // self.config.mp_size) * 3

         data_type = torch.int8 if config.q_int8 else torch.half if config.fp16 else torch.float
         data_type_fp = torch.half if config.fp16 else torch.float
         global inference_cuda_module
         if inference_cuda_module is None:
-            builder = op_builder.InferenceBuilder()
+            builder = get_accelerator().create_op_builder(InferenceBuilder)
             inference_cuda_module = builder.load()

         if DeepSpeedDiffusersAttention.layer_id == 1:
diff --git a/deepspeed/ops/transformer/inference/diffusers_transformer_block.py b/deepspeed/ops/transformer/inference/diffusers_transformer_block.py
index daec2031c..df9dc38b3 100644
--- a/deepspeed/ops/transformer/inference/diffusers_transformer_block.py
+++ b/deepspeed/ops/transformer/inference/diffusers_transformer_block.py
@@ -4,12 +4,13 @@ Copyright 2022 The Microsoft DeepSpeed Team

 import torch
 import torch.nn as nn
-from ... import op_builder

 from deepspeed import module_inject
 from .diffusers_attention import DeepSpeedDiffusersAttention
 from .bias_add import nhwc_bias_add
 from .diffusers_2d_transformer import Diffusers2DTransformerConfig
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import InferenceBuilder, SpatialInferenceBuilder

 # Ops will be loaded on demand
 transformer_cuda_module = None
@@ -19,14 +20,16 @@ spatial_cuda_module = None
 def load_transformer_module():
     global transformer_cuda_module
     if transformer_cuda_module is None:
-        transformer_cuda_module = op_builder.InferenceBuilder().load()
+        transformer_cuda_module = get_accelerator().create_op_builder(
+            InferenceBuilder).load()
     return transformer_cuda_module


 def load_spatial_module():
     global spatial_cuda_module
     if spatial_cuda_module is None:
-        spatial_cuda_module = op_builder.SpatialInferenceBuilder().load()
+        spatial_cuda_module = get_accelerator().create_op_builder(
+            SpatialInferenceBuilder).load()
     return spatial_cuda_module
diff --git a/deepspeed/ops/transformer/inference/ds_attention.py b/deepspeed/ops/transformer/inference/ds_attention.py
index d80182db5..78fe5dd9d 100644
--- a/deepspeed/ops/transformer/inference/ds_attention.py
+++ b/deepspeed/ops/transformer/inference/ds_attention.py
@@ -6,6 +6,7 @@ import math
 import torch
 import torch.nn as nn
 from deepspeed import comm as dist
+from deepspeed.accelerator import get_accelerator
 from .op_binding import LinearOp, VectorMatMulOp, SoftmaxContextOp, QKVGemmOp, SoftmaxOp

 minus_inf = -10000.0
@@ -27,7 +28,8 @@ class DeepSpeedSelfAttention(nn.Module):
         data_type_fp = torch.half if config.fp16 else torch.float
         self.config.layer_id = DeepSpeedSelfAttention.num_layers
         DeepSpeedSelfAttention.num_layers = DeepSpeedSelfAttention.num_layers + 1
-        device = torch.cuda.current_device() #if config.bigscience_bloom else 'cpu'
+        device = get_accelerator().current_device_name(
+        ) #if config.bigscience_bloom else 'cpu'
         qkv_size_per_partition = (self.config.hidden_size // self.config.mp_size) * 3
         self.attn_qkvw = nn.Parameter(torch.empty(self.config.hidden_size,
                                                   qkv_size_per_partition,
@@ -199,7 +201,7 @@ class BloomSelfAttention(DeepSpeedSelfAttention):
             input_mask = torch.empty(1)

         mixed_x_layer = qkv_out
-        alibi = alibi.to(torch.cuda.current_device())
+        alibi = alibi.to(get_accelerator().current_device_name())
         head_dim = self.hidden_size_per_partition // self.num_attention_heads_per_partition
         new_tensor_shape = mixed_x_layer.size()[:-1] + (
             self.num_attention_heads_per_partition,
diff --git a/deepspeed/ops/transformer/inference/ds_mlp.py b/deepspeed/ops/transformer/inference/ds_mlp.py
index 277ba1818..049559d6a 100644
--- a/deepspeed/ops/transformer/inference/ds_mlp.py
+++ b/deepspeed/ops/transformer/inference/ds_mlp.py
@@ -6,9 +6,10 @@ import torch
 from torch.autograd import Function
 from deepspeed.utils.types import ActivationFuncType
 from deepspeed import comm as dist
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import InferenceBuilder
 import torch.nn as nn
 import math
-from ... import op_builder

 inference_cuda_module = None

@@ -99,7 +100,8 @@ class DeepSpeedMLP(nn.Module):
         self.config = config
         data_type = torch.int8 if config.q_int8 else torch.half if config.fp16 else torch.float
         data_type_fp = torch.half if config.fp16 else torch.float
-        device = torch.cuda.current_device() #if config.bigscience_bloom else 'cpu'
+        device = get_accelerator().current_device_name(
+        ) #if config.bigscience_bloom else 'cpu'
         self.attn_nw = nn.Parameter(torch.empty(self.config.hidden_size,
                                                 dtype=data_type_fp,
                                                 device=device),
@@ -136,7 +138,7 @@ class DeepSpeedMLP(nn.Module):
         # load the cuda module
         global inference_cuda_module
         if inference_cuda_module is None:
-            builder = op_builder.InferenceBuilder()
+            builder = get_accelerator().create_op_builder(InferenceBuilder)
             inference_cuda_module = builder.load()

         self.mp_group = mp_group
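ds_attention.py and ds_mlp.py now place parameters by asking the accelerator for the current device string instead of calling torch.cuda.current_device(). The placement pattern in a sketch (sizes and dtype here are illustrative):

    import torch
    import torch.nn as nn
    from deepspeed.accelerator import get_accelerator

    device = get_accelerator().current_device_name()  # e.g. 'cuda:0'
    weight = nn.Parameter(torch.empty(1024, 1024, dtype=torch.half, device=device))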
diff --git a/deepspeed/ops/transformer/inference/moe_inference.py b/deepspeed/ops/transformer/inference/moe_inference.py
index cb8a0c023..5706295da 100644
--- a/deepspeed/ops/transformer/inference/moe_inference.py
+++ b/deepspeed/ops/transformer/inference/moe_inference.py
@@ -5,7 +5,6 @@ import json
 import math
 import torch
 from torch.autograd import Function
-from ... import op_builder
 #from ...inference.engine import inference_cuda_module, specialized_mode
 # Cuda modules will be imported if needed
 inference_cuda_module = None
@@ -15,6 +14,8 @@ from .ds_attention import DeepSpeedSelfAttention
 from .config import DeepSpeedInferenceConfig
 from ....moe.sharded_moe import TopKGate
 from deepspeed import comm as dist
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import InferenceBuilder


 class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig):
@@ -241,15 +242,14 @@ class DeepSpeedMoEInference(nn.Module):
         global specialized_mode
         if inference_cuda_module is None:
             specialized_mode = False
-            if hasattr(op_builder, 'InferenceSpecializedBuilder'):
-                builder = op_builder.InferenceSpecializedBuilder()
-                if builder.is_compatible():
-                    inference_cuda_module = builder.load()
-                    specialized_mode = True
-                else:
-                    inference_cuda_module = op_builder.InferenceBuilder().load()
+            # InferenceSpecializedBuilder is not among the builders DeepSpeed provides yet, so we look it up by its builder name string
+            builder = get_accelerator().create_op_builder("InferenceSpecializedBuilder")
+            if builder != None and builder.is_compatible():
+                inference_cuda_module = builder.load()
+                specialized_mode = True
             else:
-                inference_cuda_module = op_builder.InferenceBuilder().load()
+                inference_cuda_module = get_accelerator().create_op_builder(
+                    InferenceBuilder).load()
         self.config.specialized_mode = specialized_mode
         DeepSpeedMoEInference.layer_id += 1
diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py
index 7963d1177..ba4ad48f3 100755
--- a/deepspeed/ops/transformer/transformer.py
+++ b/deepspeed/ops/transformer/transformer.py
@@ -6,8 +6,8 @@ import math
 import torch
 from torch import nn
 from torch.autograd import Function
-
-from ..op_builder import TransformerBuilder, StochasticTransformerBuilder
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder.builder_names import TransformerBuilder, StochasticTransformerBuilder

 # Cuda modules will be imported if needed
 transformer_cuda_module = None
@@ -481,7 +481,7 @@ class DeepSpeedTransformerLayer(nn.Module):
             print("DeepSpeed Transformer config is ", self.config.__dict__)

         if self.config.local_rank >= 0:
-            torch.cuda.set_device(self.config.local_rank)
+            get_accelerator().set_device(self.config.local_rank)

         if initial_weights is None and initial_biases is None:
             self.attn_qkvw = nn.Parameter(
@@ -531,9 +531,11 @@ class DeepSpeedTransformerLayer(nn.Module):
         # Load cuda modules if needed
         global transformer_cuda_module, stochastic_transformer_cuda_module
         if transformer_cuda_module is None and not self.config.stochastic_mode:
-            transformer_cuda_module = TransformerBuilder().load()
+            transformer_cuda_module = get_accelerator().create_op_builder(
+                TransformerBuilder).load()
         if stochastic_transformer_cuda_module is None and self.config.stochastic_mode:
-            stochastic_transformer_cuda_module = StochasticTransformerBuilder().load()
+            stochastic_transformer_cuda_module = get_accelerator().create_op_builder(
+                StochasticTransformerBuilder).load()

         # create the layer in cuda kernels.
         cuda_module = stochastic_transformer_cuda_module if self.config.stochastic_mode else transformer_cuda_module
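Because create_op_builder() returns None for names it does not know, the MoE code can probe for the optional specialized builder by its name string and fall back cleanly. The probe in a sketch:

    from deepspeed.accelerator import get_accelerator
    from deepspeed.ops.op_builder.builder_names import InferenceBuilder

    builder = get_accelerator().create_op_builder("InferenceSpecializedBuilder")
    if builder is not None and builder.is_compatible():
        module = builder.load()  # specialized kernels, when available
    else:
        module = get_accelerator().create_op_builder(InferenceBuilder).load()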
diff --git a/op_builder/__init__.py b/op_builder/__init__.py
old mode 100755
new mode 100644
index 6a138e5c9..5c0db6ad8
--- a/op_builder/__init__.py
+++ b/op_builder/__init__.py
@@ -15,23 +15,3 @@ from .transformer_inference import InferenceBuilder
 from .quantizer import QuantizerBuilder
 from .spatial_inference import SpatialInferenceBuilder
 from .builder import get_default_compute_capabilities, OpBuilder
-
-# TODO: This will be removed eventurally when all files containing reference to ALL_OPS redirected to op_builder.all_ops
-# TODO: infer this list instead of hard coded
-# List of all available ops
-__op_builders__ = [
-    CPUAdamBuilder(),
-    CPUAdagradBuilder(),
-    FusedAdamBuilder(),
-    FusedLambBuilder(),
-    SparseAttnBuilder(),
-    TransformerBuilder(),
-    StochasticTransformerBuilder(),
-    AsyncIOBuilder(),
-    UtilsBuilder(),
-    QuantizerBuilder(),
-    InferenceBuilder(),
-    SpatialInferenceBuilder(),
-    RandomLTDBuilder()
-]
-ALL_OPS = {op.name: op for op in __op_builders__}
diff --git a/op_builder/all_ops.py b/op_builder/all_ops.py
index 9929e5262..7ca5039ab 100644
--- a/op_builder/all_ops.py
+++ b/op_builder/all_ops.py
@@ -4,7 +4,11 @@ Copyright 2020 The Microsoft DeepSpeed Team
 import os
 import pkgutil
 import importlib
-from deepspeed.accelerator import get_accelerator
+try:
+    # at installation time the top-level accelerator package is visible; otherwise fall back to deepspeed.accelerator
+    from accelerator import get_accelerator
+except ImportError:
+    from deepspeed.accelerator import get_accelerator

 # List of all available ops
diff --git a/op_builder/builder_names.py b/op_builder/builder_names.py
index 62dd5a9f0..37100b7ce 100644
--- a/op_builder/builder_names.py
+++ b/op_builder/builder_names.py
@@ -5,7 +5,12 @@ import importlib

 # List of all available op builders from deepspeed op_builder
-op_builder_dir = "deepspeed.ops.op_builder"
+try:
+    import op_builder  # noqa: F401
+    op_builder_dir = "op_builder"
+except ImportError:
+    op_builder_dir = "deepspeed.ops.op_builder"
+
 op_builder_module = importlib.import_module(op_builder_dir)

 __op_builders__ = []
diff --git a/setup.py b/setup.py
index d1ce425cd..ae9ece39e 100755
--- a/setup.py
+++ b/setup.py
@@ -25,13 +25,13 @@ import time
 torch_available = True
 try:
     import torch
-    from torch.utils.cpp_extension import BuildExtension
 except ImportError:
     torch_available = False
     print('[WARNING] Unable to import torch, pre-compiling ops will be disabled. ' \
         'Please visit https://pytorch.org/ to see how to properly install torch on your system.')

-from op_builder import ALL_OPS, get_default_compute_capabilities, OpBuilder
+from op_builder import get_default_compute_capabilities, OpBuilder
+from op_builder.all_ops import ALL_OPS
 from op_builder.builder import installed_cuda_version

 # fetch rocm state
@@ -91,7 +91,9 @@ cmdclass = {}

 # For any pre-installed ops force disable ninja
 if torch_available:
-    cmdclass['build_ext'] = BuildExtension.with_options(use_ninja=False)
+    from accelerator import get_accelerator
+    cmdclass['build_ext'] = get_accelerator().build_extension().with_options(
+        use_ninja=False)

 if torch_available:
     TORCH_MAJOR = torch.__version__.split('.')[0]
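all_ops.py, builder_names.py and op_builder_dir() above all use the same two-level import: at installation time the repository's top-level packages are importable directly, while at runtime only the copies installed under deepspeed.* exist. The trick in isolation:

    try:
        # building from a source checkout (setup.py time)
        from accelerator import get_accelerator
    except ImportError:
        # installed package at runtime
        from deepspeed.accelerator import get_accelerator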
@@ -195,6 +197,7 @@ if sys.platform == "win32":
     # It needs Administrator privilege to create symlinks on Windows.
     create_dir_symlink('..\\..\\csrc', '.\\deepspeed\\ops\\csrc')
     create_dir_symlink('..\\..\\op_builder', '.\\deepspeed\\ops\\op_builder')
+    create_dir_symlink('..\\accelerator', '.\\deepspeed\\accelerator')
     egg_info.manifest_maker.template = 'MANIFEST_win.in'

 # Parse the DeepSpeed version string from version.txt
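A quick way to exercise the pieces this patch wires together, as a sketch for a machine with DeepSpeed and CUDA available:

    from deepspeed.accelerator import get_accelerator
    from deepspeed.ops.op_builder.builder_names import UtilsBuilder

    acc = get_accelerator()
    print(acc.device_name())  # 'cuda' -- the only accelerator implemented in step 2
    print(acc.create_op_builder(UtilsBuilder).is_compatible())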