Abstract accelerator (step 2) (#2560)

* Abstract accelerator (step 2)

* more flexible op_builder path for both installation and runtime

* add SpatialInferenceBuilder to cuda_accelerator.py

* use reflection to make cuda_accelerator adapt to CUDA op builder change automatically

* clean up deepspeed/__init__.py

* add comments in cuda_accelerator for the no-torch path

* Update deepspeed/env_report.py

Change env_report.py according to the suggestion

Co-authored-by: Michael Wyatt <mrwyattii@gmail.com>

* narrow the scope of try...except for better code clarity

* Port deepspeed/ops/random_ltd/dropping_utils.py

* move accelerator to the top-level directory and create a symlink under deepspeed

Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Co-authored-by: Michael Wyatt <mrwyattii@gmail.com>
Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
Author: Ma, Guokai
Date: 2023-01-07 12:40:58 +08:00
Committed by: GitHub
Parent: 95d9a1b6c3
Commit: 9548d48f48
32 changed files with 206 additions and 158 deletions
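
In short, this step moves call sites from instantiating concrete builder classes to requesting a builder from the active accelerator by name. A minimal sketch of the old vs. new pattern, distilled from the diffs below:

    # old: hard dependency on a CUDA-specific builder class
    # from deepspeed.ops.aio import AsyncIOBuilder
    # aio_ops = AsyncIOBuilder().load()

    # new: resolved through the active accelerator
    from deepspeed.accelerator import get_accelerator
    from deepspeed.ops.op_builder.builder_names import AsyncIOBuilder

    aio_ops = get_accelerator().create_op_builder(AsyncIOBuilder).load()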

View File

@ -4,3 +4,4 @@ recursive-include deepspeed *.cpp *.h *.cu *.hip *.tr *.cuh *.cc *.json
recursive-include csrc *.cpp *.h *.cu *.tr *.cuh *.cc
recursive-include op_builder *.py
recursive-include benchmarks *.py
recursive-include accelerator *.py

View File

@ -6,3 +6,4 @@ recursive-include deepspeed *.tr
recursive-exclude deepspeed/ops/csrc *.cpp *.h *.cu *.cuh *.cc
prune csrc
prune op_builder
prune accelerator

View File

@ -1,5 +1,14 @@
from deepspeed.accelerator.abstract_accelerator import DeepSpeedAccelerator
import torch.cuda
import os
import pkgutil
import importlib
from .abstract_accelerator import DeepSpeedAccelerator
# During the setup stage torch may not be installed; passing on a missing
# torch import allows the op-builder-related APIs to still be executed.
try:
import torch.cuda
except ImportError:
pass
class CUDA_Accelerator(DeepSpeedAccelerator):
@ -7,6 +16,25 @@ class CUDA_Accelerator(DeepSpeedAccelerator):
self._name = 'cuda'
self._communication_backend_name = 'nccl'
# begin initialize for create_op_builder()
# put all valid class name <--> class type mapping into class_dict
op_builder_dir = self.op_builder_dir()
op_builder_module = importlib.import_module(op_builder_dir)
for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(op_builder_module.__file__)]):
# avoid self references
if module_name != 'all_ops' and module_name != 'builder' and module_name != 'builder_names':
module = importlib.import_module("{}.{}".format(
op_builder_dir,
module_name))
for member_name in module.__dir__():
if member_name.endswith(
'Builder'
) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "TorchCPUOpBuilder": # avoid abstract classes
if member_name not in self.class_dict:
self.class_dict[member_name] = getattr(module, member_name)
# end initialize for create_op_builder()
# Device APIs
def device_name(self, device_index=None):
if device_index is None:
@ -194,44 +222,21 @@ class CUDA_Accelerator(DeepSpeedAccelerator):
return False
def op_builder_dir(self):
return "deepspeed.ops.op_builder"
try:
# at installation time the top-level op_builder package is visible; otherwise return deepspeed.ops.op_builder
import op_builder # noqa: F401
return "op_builder"
except ImportError:
return "deepspeed.ops.op_builder"
# dict that holds class name <--> class type mapping, e.g.
# 'AsyncIOBuilder': <class 'op_builder.async_io.AsyncIOBuilder'>
# this dict will be filled at the init stage
class_dict = {}
def create_op_builder(self, class_name):
from deepspeed.ops.op_builder import AsyncIOBuilder, CPUAdagradBuilder, CPUAdamBuilder, FusedAdamBuilder, FusedLambBuilder, QuantizerBuilder, SparseAttnBuilder, StochasticTransformerBuilder, TransformerBuilder, InferenceBuilder, UtilsBuilder
from deepspeed.ops.op_builder.builder_names import AsyncIOBuilder as AsyncIOBuilderName
from deepspeed.ops.op_builder.builder_names import CPUAdagradBuilder as CPUAdagradBuilderName
from deepspeed.ops.op_builder.builder_names import CPUAdamBuilder as CPUAdamBuilderName
from deepspeed.ops.op_builder.builder_names import FusedAdamBuilder as FusedAdamBuilderName
from deepspeed.ops.op_builder.builder_names import FusedLambBuilder as FusedLambBuilderName
from deepspeed.ops.op_builder.builder_names import QuantizerBuilder as QuantizerBuilderName
from deepspeed.ops.op_builder.builder_names import SparseAttnBuilder as SparseAttnBuilderName
from deepspeed.ops.op_builder.builder_names import StochasticTransformerBuilder as StochasticTransformerBuilderName
from deepspeed.ops.op_builder.builder_names import TransformerBuilder as TransformerBuilderName
from deepspeed.ops.op_builder.builder_names import InferenceBuilder as InferenceBuilderName
from deepspeed.ops.op_builder.builder_names import UtilsBuilder as UtilsBuilderName
if class_name == AsyncIOBuilderName:
return AsyncIOBuilder()
elif class_name == CPUAdagradBuilderName:
return CPUAdagradBuilder()
elif class_name == CPUAdamBuilderName:
return CPUAdamBuilder()
elif class_name == FusedAdamBuilderName:
return FusedAdamBuilder()
elif class_name == FusedLambBuilderName:
return FusedLambBuilder()
elif class_name == QuantizerBuilderName:
return QuantizerBuilder()
elif class_name == SparseAttnBuilderName:
return SparseAttnBuilder()
elif class_name == StochasticTransformerBuilderName:
return StochasticTransformerBuilder()
elif class_name == TransformerBuilderName:
return TransformerBuilder()
elif class_name == InferenceBuilderName:
return InferenceBuilder()
elif class_name == UtilsBuilderName:
return UtilsBuilder()
if class_name in self.class_dict:
return self.class_dict[class_name]()
else:
return None
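
The reflection scan in __init__ above means a new CUDA op builder registers itself just by following the naming convention: drop a module under op_builder/ whose class name ends in 'Builder'. A hypothetical example (module name, class, and source paths are illustrative only, not part of this PR):

    # op_builder/my_custom.py (hypothetical)
    from .builder import CUDAOpBuilder

    class MyCustomBuilder(CUDAOpBuilder):
        BUILD_VAR = "DS_BUILD_MY_CUSTOM"
        NAME = "my_custom"

        def absolute_name(self):
            return f'deepspeed.ops.{self.NAME}_op'

        def sources(self):
            return ['csrc/my_custom/my_custom_kernel.cu']

CUDA_Accelerator would then map 'MyCustomBuilder' to this class in class_dict with no further registration code.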

View File

@ -24,7 +24,7 @@ def get_accelerator():
_validate_accelerator(ds_accelerator)
return ds_accelerator
from deepspeed.accelerator.cuda_accelerator import CUDA_Accelerator
from .cuda_accelerator import CUDA_Accelerator
ds_accelerator = CUDA_Accelerator()
_validate_accelerator(ds_accelerator)
return ds_accelerator
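
For reference, the accessor returns a process-wide accelerator object whose basic APIs look like this on CUDA:

    from deepspeed.accelerator import get_accelerator

    acc = get_accelerator()                  # CUDA_Accelerator by default
    acc.device_name()                        # 'cuda'
    acc.device_name(0)                       # 'cuda:0'
    acc.communication_backend_name()         # 'nccl'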

View File

@ -15,6 +15,8 @@ import shutil
from test_ds_aio_utils import refine_integer_value
from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \
READ_IO_DIR, WRITE_IO_DIR, READ_LOG_DIR, WRITE_LOG_DIR
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import AsyncIOBuilder
OTHER_OPTIONS = '--handle'
PERF_SCRIPT = 'test_ds_aio.py'
@ -277,8 +279,7 @@ def script_path():
def async_io_setup():
from deepspeed.ops.aio import AsyncIOBuilder
return AsyncIOBuilder().is_compatible()
return get_accelerator().create_op_builder(AsyncIOBuilder).is_compatible()
def get_block_size_and_count(io_bytes):

View File

@ -8,9 +8,10 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
import torch
import os
import time
from deepspeed.ops.aio import AsyncIOBuilder
from multiprocessing import Pool, Barrier
from test_ds_aio_utils import report_results, task_log, task_barrier
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import AsyncIOBuilder
def pre_basic(args, tid, read_op):
@ -19,7 +20,10 @@ def pre_basic(args, tid, read_op):
file = args.read_file if read_op else f'{args.write_file}.{tid}'
task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
buffer = get_accelerator().pin_memory(
torch.empty(num_bytes,
dtype=torch.uint8,
device='cpu'))
task_log(
tid,
f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
@ -56,13 +60,14 @@ def post_basic(pool_params):
def main_basic_read(pool_params):
args, tid, ctxt = pool_params
start_time = time.time()
AsyncIOBuilder().load().aio_read(ctxt['buffer'],
ctxt['file'],
args.block_size,
args.queue_depth,
args.single_submit,
args.overlap_events,
args.validate)
get_accelerator().create_op_builder(AsyncIOBuilder).load().aio_read(
ctxt['buffer'],
ctxt['file'],
args.block_size,
args.queue_depth,
args.single_submit,
args.overlap_events,
args.validate)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
@ -72,13 +77,14 @@ def main_basic_read(pool_params):
def main_basic_write(pool_params):
args, tid, ctxt = pool_params
start_time = time.time()
AsyncIOBuilder().load().aio_write(ctxt['buffer'],
ctxt['file'],
args.block_size,
args.queue_depth,
args.single_submit,
args.overlap_events,
args.validate)
get_accelerator().create_op_builder(AsyncIOBuilder).load().aio_write(
ctxt['buffer'],
ctxt['file'],
args.block_size,
args.queue_depth,
args.single_submit,
args.overlap_events,
args.validate)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
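
The pin_memory change is the same device-neutral pattern in isolation: instead of calling tensor.pin_memory() (a CUDA notion), the accelerator decides how to pin. Roughly:

    import torch
    from deepspeed.accelerator import get_accelerator

    # portable equivalent of torch.empty(...).pin_memory()
    buffer = get_accelerator().pin_memory(
        torch.empty(1 << 20, dtype=torch.uint8, device='cpu'))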

View File

@ -9,8 +9,9 @@ import torch
import os
import time
from multiprocessing import Pool, Barrier
from deepspeed.ops.aio import AsyncIOBuilder
from test_ds_aio_utils import report_results, task_log, task_barrier
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import AsyncIOBuilder
def pre_handle(args, tid, read_op):
@ -20,20 +21,26 @@ def pre_handle(args, tid, read_op):
task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
if args.gpu:
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda')
buffer = torch.empty(num_bytes,
dtype=torch.uint8,
device=get_accelerator().device_name())
else:
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
buffer = get_accelerator().pin_memory(
torch.empty(num_bytes,
dtype=torch.uint8,
device='cpu'))
task_log(
tid,
f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
)
io_parallel = args.io_parallel if args.io_parallel else 1
handle = AsyncIOBuilder().load().aio_handle(args.block_size,
args.queue_depth,
args.single_submit,
args.overlap_events,
io_parallel)
handle = get_accelerator().create_op_builder(AsyncIOBuilder).load().aio_handle(
args.block_size,
args.queue_depth,
args.single_submit,
args.overlap_events,
io_parallel)
task_log(tid, f'created deepspeed aio handle')
ctxt = {}

View File

@ -4,5 +4,6 @@ Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
from deepspeed.ops.aio import AsyncIOBuilder
assert AsyncIOBuilder().is_compatible()
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import AsyncIOBuilder
assert get_accelerator().create_op_builder(AsyncIOBuilder).is_compatible()

deepspeed/accelerator Symbolic link
View File

@ -0,0 +1 @@
../accelerator/

View File

@ -31,6 +31,7 @@ import os
from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT, default_pg_timeout
from .constants import *
from deepspeed.accelerator import get_accelerator
class ReduceOp(Enum):
@ -127,7 +128,7 @@ def timed_op(func):
finally:
if comms_logger.enabled:
# Need to make op blocking for accurate logging
torch.cuda.synchronize()
get_accelerator().synchronize()
# If we're using MPI, we can't simply sync the stream
if cdb.using_mpi:
cdb.barrier()
@ -595,7 +596,7 @@ def get_global_rank(group=None, group_rank=0):
# Main DeepSpeed Comms. public API.
def init_distributed(dist_backend="nccl",
def init_distributed(dist_backend=None,
auto_mpi_discovery=True,
distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT,
verbose=True,
@ -650,6 +651,8 @@ def init_distributed(dist_backend="nccl",
utils.logger.info('Distributed backend already initialized')
else:
assert isinstance(timeout, timedelta)
if dist_backend is None:
dist_backend = get_accelerator().communication_backend_name()
if int(os.getenv('RANK', '0')) == 0:
utils.logger.info(
'Initializing TorchBackend in DeepSpeed with backend {}'.format(
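
With dist_backend now defaulting to None, callers can omit the backend entirely and let the accelerator pick it; for example:

    import deepspeed

    # backend resolves to get_accelerator().communication_backend_name(),
    # i.e. 'nccl' when running on the CUDA accelerator
    deepspeed.init_distributed()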

View File

@ -2,8 +2,9 @@ import torch
import deepspeed
import subprocess
import argparse
from .ops.op_builder import ALL_OPS
from .ops.op_builder.all_ops import ALL_OPS
from .git_version_info import installed_ops, torch_info
from deepspeed.accelerator import get_accelerator
GREEN = '\033[92m'
RED = '\033[91m'
@ -79,31 +80,33 @@ def nvcc_version():
def debug_report():
max_dots = 33
hip_version = None
if hasattr(torch.version, 'hip'):
hip_version = torch.version.hip
report = [
("torch install path",
torch.__path__),
("torch version",
torch.__version__),
("torch cuda version",
torch.version.cuda),
("torch hip version",
hip_version),
("nvcc version",
(None if hip_version else nvcc_version())),
("deepspeed install path",
deepspeed.__path__),
("deepspeed info",
f"{deepspeed.__version__}, {deepspeed.__git_hash__}, {deepspeed.__git_branch__}"
),
("deepspeed wheel compiled w.",
f"torch {torch_info['version']}, " +
(f"hip {torch_info['hip_version']}"
if hip_version else f"cuda {torch_info['cuda_version']}")),
)
]
if get_accelerator().device_name() == 'cuda':
hip_version = getattr(torch.version, "hip", None)
report.extend([("torch cuda version",
torch.version.cuda),
("torch hip version",
hip_version),
("nvcc version",
(None if hip_version else nvcc_version())),
("deepspeed wheel compiled w.",
f"torch {torch_info['version']}, " +
(f"hip {torch_info['hip_version']}"
if hip_version else f"cuda {torch_info['cuda_version']}"))])
else:
report.extend([("deepspeed wheel compiled w.",
f"torch {torch_info['version']} ")])
print("DeepSpeed general environment info:")
for name, value in report:
print(name, "." * (max_dots - len(name)), value)
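
The report is now device-conditional: the CUDA/HIP rows print only when the active accelerator is 'cuda'. A quick way to exercise it, using the debug_report function shown above:

    from deepspeed.env_report import debug_report

    debug_report()  # prints the 'DeepSpeed general environment info' table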

View File

@ -11,7 +11,7 @@ except ModuleNotFoundError:
git_hash = '[none]'
git_branch = '[none]'
from .ops.op_builder import ALL_OPS
from .ops.op_builder.all_ops import ALL_OPS
installed_ops = dict.fromkeys(ALL_OPS.keys(), False)
compatible_ops = dict.fromkeys(ALL_OPS.keys(), False)
torch_info = {'version': "0.0", "cuda_version": "0.0", "hip_version": "0.0"}

View File

@ -3,7 +3,8 @@ Copyright 2020 The Microsoft DeepSpeed Team
'''
import torch
from ..op_builder import CPUAdagradBuilder
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import CPUAdagradBuilder
from deepspeed.utils.logging import should_log_le
@ -24,7 +25,8 @@ class DeepSpeedCPUAdagrad(torch.optim.Optimizer):
self.opt_id = DeepSpeedCPUAdagrad.optimizer_id
DeepSpeedCPUAdagrad.optimizer_id = DeepSpeedCPUAdagrad.optimizer_id + 1
self.fp32_optimizer_states = fp32_optimizer_states
self.ds_opt_adagrad = CPUAdagradBuilder().load()
self.ds_opt_adagrad = get_accelerator().create_op_builder(
CPUAdagradBuilder).load()
self.ds_opt_adagrad.create_adagrad(self.opt_id,
lr,

View File

@ -4,9 +4,10 @@ Copyright 2020 The Microsoft DeepSpeed Team
import torch
from cpuinfo import get_cpu_info
from ..op_builder import CPUAdamBuilder
from deepspeed.utils import logger
from deepspeed.utils.logging import should_log_le
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import CPUAdamBuilder
class DeepSpeedCPUAdam(torch.optim.Optimizer):
@ -91,7 +92,7 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer):
DeepSpeedCPUAdam.optimizer_id = DeepSpeedCPUAdam.optimizer_id + 1
self.adam_w_mode = adamw_mode
self.fp32_optimizer_states = fp32_optimizer_states
self.ds_opt_adam = CPUAdamBuilder().load()
self.ds_opt_adam = get_accelerator().create_op_builder(CPUAdamBuilder).load()
self.ds_opt_adam.create_adam(self.opt_id,
lr,
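
Optimizer usage is unchanged by this port; only the kernel-loading path moved behind the accelerator. For example:

    import torch
    from deepspeed.ops.adam import DeepSpeedCPUAdam

    model = torch.nn.Linear(16, 16)
    # constructor loads CPUAdamBuilder through get_accelerator() internally
    optimizer = DeepSpeedCPUAdam(model.parameters(), lr=1e-3)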

View File

@ -9,7 +9,8 @@ import torch
from .multi_tensor_apply import MultiTensorApply
multi_tensor_applier = MultiTensorApply(2048 * 32)
from ..op_builder import FusedAdamBuilder
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import FusedAdamBuilder
class FusedAdam(torch.optim.Optimizer):
@ -69,9 +70,9 @@ class FusedAdam(torch.optim.Optimizer):
self.adam_w_mode = 1 if adam_w_mode else 0
self.set_grad_none = set_grad_none
fused_adam_cuda = FusedAdamBuilder().load()
fused_adam_cuda = get_accelerator().create_op_builder(FusedAdamBuilder).load()
# Skip buffer
self._dummy_overflow_buf = torch.cuda.IntTensor([0])
self._dummy_overflow_buf = get_accelerator().IntTensor([0])
self.multi_tensor_adam = fused_adam_cuda.multi_tensor_adam
def zero_grad(self):
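
The overflow-buffer swap applies the same idea to tensor construction, replacing torch.cuda.IntTensor with an accelerator call:

    from deepspeed.accelerator import get_accelerator

    # device-resident int32 tensor without naming torch.cuda explicitly
    overflow_buf = get_accelerator().IntTensor([0])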

View File

@ -6,7 +6,8 @@ This file is adapted from NVIDIA/apex/optimizer/fused_adam and implements the LA
'''
import types
import torch
from ..op_builder import FusedLambBuilder
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import FusedLambBuilder
class FusedLamb(torch.optim.Optimizer):
@ -48,7 +49,8 @@ class FusedLamb(torch.optim.Optimizer):
max_coeff=10.0,
min_coeff=0.01,
amsgrad=False):
self.fused_lamb_cuda = FusedLambBuilder().load()
self.fused_lamb_cuda = get_accelerator().create_op_builder(
FusedLambBuilder).load()
if amsgrad:
raise RuntimeError('FusedLamb does not support the AMSGrad variant.')

View File

@ -3,7 +3,8 @@ Copyright 2020 The Microsoft DeepSpeed Team
'''
import torch
from ..op_builder import QuantizerBuilder
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import QuantizerBuilder
# Cuda modules will be imported if needed
quantizer_cuda_module = None
@ -13,7 +14,8 @@ def ds_quantizer(input, groups=1, bit_num=8, sr=False, asym=False):
# Load cuda modules if needed
global quantizer_cuda_module
if quantizer_cuda_module is None:
quantizer_cuda_module = QuantizerBuilder().load()
quantizer_cuda_module = get_accelerator().create_op_builder(
QuantizerBuilder).load()
if sr:
if asym:
quantize_func = quantizer_cuda_module.ds_sr_quantize_asym_fp16 if input.dtype == torch.half else quantizer_cuda_module.ds_sr_quantize_asym_fp32

View File

@ -3,7 +3,8 @@ Copyright 2022 The Microsoft DeepSpeed Team
"""
import torch
from ..op_builder import RandomLTDBuilder
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import RandomLTDBuilder
"""
Returns:
sampled_indices: [layers, batch_size, reserved_length]
@ -28,7 +29,7 @@ def gpt_sample_tokens(reserved_length: int,
reserved_length).to(torch.int32)
global random_ltd_module
if random_ltd_module is None:
random_ltd_module = RandomLTDBuilder().load()
random_ltd_module = get_accelerator().create_op_builder(RandomLTDBuilder).load()
sampled_indices = random_ltd_module.token_sort_(sampled_indices, seq_length)
# Not certain the optimized kernel is actually better here, cause it kind of screws
@ -64,7 +65,7 @@ def bert_sample_tokens(reserved_length: int,
reserved_length).to(torch.int32)
global random_ltd_module
if random_ltd_module is None:
random_ltd_module = RandomLTDBuilder().load()
random_ltd_module = get_accelerator().create_op_builder(RandomLTDBuilder).load()
sampled_indices = random_ltd_module.token_sort_(sampled_indices, seq_length)
dtype = sampled_indices.dtype
@ -89,7 +90,8 @@ class GatherTokens(torch.autograd.Function):
batch_first: bool):
global random_ltd_module
if random_ltd_module is None:
random_ltd_module = RandomLTDBuilder().load()
random_ltd_module = get_accelerator().create_op_builder(
RandomLTDBuilder).load()
ctx.save_for_backward(activations, sorted_indices)
ctx.batch_first = batch_first
return activations, random_ltd_module.token_gather(activations, sorted_indices, batch_first)
@ -100,7 +102,8 @@ class GatherTokens(torch.autograd.Function):
g_gradients = g_gradients.contiguous()
global random_ltd_module
if random_ltd_module is None:
random_ltd_module = RandomLTDBuilder().load()
random_ltd_module = get_accelerator().create_op_builder(
RandomLTDBuilder).load()
activations, sorted_indices = ctx.saved_tensors
batch_first = ctx.batch_first
@ -119,7 +122,8 @@ class ScatterTokens(torch.autograd.Function):
batch_first: bool):
global random_ltd_module
if random_ltd_module is None:
random_ltd_module = RandomLTDBuilder().load()
random_ltd_module = get_accelerator().create_op_builder(
RandomLTDBuilder).load()
scatter_results = random_ltd_module.token_scatter_(all_activations.clone(),
layer_activations,
sorted_indices,
@ -135,7 +139,8 @@ class ScatterTokens(torch.autograd.Function):
out_gradients = out_gradients.contiguous()
global random_ltd_module
if random_ltd_module is None:
random_ltd_module = RandomLTDBuilder().load()
random_ltd_module = get_accelerator().create_op_builder(
RandomLTDBuilder).load()
sorted_indices, = ctx.saved_tensors
batch_first = ctx.batch_first

View File

@ -6,6 +6,7 @@ import torch
import triton
import triton.language as tl
import triton._C.libtriton as libtriton
from deepspeed.accelerator import get_accelerator
@triton.jit
@ -948,7 +949,7 @@ class MatMul:
raise ValueError(
f"Inputs must be on the same device; got {a.device} for tensor A "
f"and {b.device} for tensor B")
if not a.is_cuda:
if not get_accelerator().on_accelerator(a):
raise ValueError("Only GPU devices are supported for now")
# When autocast is enabled, torch.matmul autocasts to float16, so we do the same here
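
on_accelerator generalizes the tensor.is_cuda check; a small sanity sketch:

    import torch
    from deepspeed.accelerator import get_accelerator

    t = torch.empty(4, 4, device=get_accelerator().device_name())
    assert get_accelerator().on_accelerator(t)  # device-neutral is_cuda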

View File

@ -4,8 +4,8 @@ Copyright 2022 The Microsoft DeepSpeed Team
from typing import Optional
import torch
from ... import op_builder
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import SpatialInferenceBuilder
spatial_cuda_module = None
@ -16,7 +16,8 @@ def nhwc_bias_add(activation: torch.Tensor,
other_bias: Optional[torch.Tensor] = None) -> torch.Tensor:
global spatial_cuda_module
if spatial_cuda_module is None:
spatial_cuda_module = op_builder.SpatialInferenceBuilder().load()
spatial_cuda_module = get_accelerator().create_op_builder(
SpatialInferenceBuilder).load()
if other is None:
return spatial_cuda_module.nhwc_bias_add(activation, bias)

View File

@ -4,10 +4,12 @@ Copyright 2022 The Microsoft DeepSpeed Team
import math
import torch
from torch.autograd import Function
from ... import op_builder
import torch.nn as nn
from packaging import version as pkg_version
from deepspeed.utils.logging import log_dist
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import InferenceBuilder
# Cuda modules will be imported if needed
inference_cuda_module = None
minus_inf = -10000.0
@ -140,14 +142,15 @@ class DeepSpeedDiffusersAttention(nn.Module):
self.config = config
self.config.layer_id = DeepSpeedDiffusersAttention.layer_id
DeepSpeedDiffusersAttention.layer_id += 1
device = torch.cuda.current_device() if config.bigscience_bloom else 'cpu'
device = get_accelerator().current_device_name(
) if config.bigscience_bloom else 'cpu'
qkv_size_per_partition = (self.config.hidden_size // self.config.mp_size) * 3
data_type = torch.int8 if config.q_int8 else torch.half if config.fp16 else torch.float
data_type_fp = torch.half if config.fp16 else torch.float
global inference_cuda_module
if inference_cuda_module is None:
builder = op_builder.InferenceBuilder()
builder = get_accelerator().create_op_builder(InferenceBuilder)
inference_cuda_module = builder.load()
if DeepSpeedDiffusersAttention.layer_id == 1:

View File

@ -4,12 +4,13 @@ Copyright 2022 The Microsoft DeepSpeed Team
import torch
import torch.nn as nn
from ... import op_builder
from deepspeed import module_inject
from .diffusers_attention import DeepSpeedDiffusersAttention
from .bias_add import nhwc_bias_add
from .diffusers_2d_transformer import Diffusers2DTransformerConfig
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import InferenceBuilder, SpatialInferenceBuilder
# Ops will be loaded on demand
transformer_cuda_module = None
@ -19,14 +20,16 @@ spatial_cuda_module = None
def load_transformer_module():
global transformer_cuda_module
if transformer_cuda_module is None:
transformer_cuda_module = op_builder.InferenceBuilder().load()
transformer_cuda_module = get_accelerator().create_op_builder(
InferenceBuilder).load()
return transformer_cuda_module
def load_spatial_module():
global spatial_cuda_module
if spatial_cuda_module is None:
spatial_cuda_module = op_builder.SpatialInferenceBuilder().load()
spatial_cuda_module = get_accelerator().create_op_builder(
SpatialInferenceBuilder).load()
return spatial_cuda_module

View File

@ -6,6 +6,7 @@ import math
import torch
import torch.nn as nn
from deepspeed import comm as dist
from deepspeed.accelerator import get_accelerator
from .op_binding import LinearOp, VectorMatMulOp, SoftmaxContextOp, QKVGemmOp, SoftmaxOp
minus_inf = -10000.0
@ -27,7 +28,8 @@ class DeepSpeedSelfAttention(nn.Module):
data_type_fp = torch.half if config.fp16 else torch.float
self.config.layer_id = DeepSpeedSelfAttention.num_layers
DeepSpeedSelfAttention.num_layers = DeepSpeedSelfAttention.num_layers + 1
device = torch.cuda.current_device() #if config.bigscience_bloom else 'cpu'
device = get_accelerator().current_device_name(
) #if config.bigscience_bloom else 'cpu'
qkv_size_per_partition = (self.config.hidden_size // self.config.mp_size) * 3
self.attn_qkvw = nn.Parameter(torch.empty(self.config.hidden_size,
qkv_size_per_partition,
@ -199,7 +201,7 @@ class BloomSelfAttention(DeepSpeedSelfAttention):
input_mask = torch.empty(1)
mixed_x_layer = qkv_out
alibi = alibi.to(torch.cuda.current_device())
alibi = alibi.to(get_accelerator().current_device_name())
head_dim = self.hidden_size_per_partition // self.num_attention_heads_per_partition
new_tensor_shape = mixed_x_layer.size()[:-1] + (
self.num_attention_heads_per_partition,

View File

@ -6,9 +6,10 @@ import torch
from torch.autograd import Function
from deepspeed.utils.types import ActivationFuncType
from deepspeed import comm as dist
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import InferenceBuilder
import torch.nn as nn
import math
from ... import op_builder
inference_cuda_module = None
@ -99,7 +100,8 @@ class DeepSpeedMLP(nn.Module):
self.config = config
data_type = torch.int8 if config.q_int8 else torch.half if config.fp16 else torch.float
data_type_fp = torch.half if config.fp16 else torch.float
device = torch.cuda.current_device() #if config.bigscience_bloom else 'cpu'
device = get_accelerator().current_device_name(
) #if config.bigscience_bloom else 'cpu'
self.attn_nw = nn.Parameter(torch.empty(self.config.hidden_size,
dtype=data_type_fp,
device=device),
@ -136,7 +138,7 @@ class DeepSpeedMLP(nn.Module):
# load the cuda module
global inference_cuda_module
if inference_cuda_module is None:
builder = op_builder.InferenceBuilder()
builder = get_accelerator().create_op_builder(InferenceBuilder)
inference_cuda_module = builder.load()
self.mp_group = mp_group

View File

@ -5,7 +5,6 @@ import json
import math
import torch
from torch.autograd import Function
from ... import op_builder
#from ...inference.engine import inference_cuda_module, specialized_mode
# Cuda modules will be imported if needed
inference_cuda_module = None
@ -15,6 +14,8 @@ from .ds_attention import DeepSpeedSelfAttention
from .config import DeepSpeedInferenceConfig
from ....moe.sharded_moe import TopKGate
from deepspeed import comm as dist
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import InferenceBuilder
class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig):
@ -241,15 +242,14 @@ class DeepSpeedMoEInference(nn.Module):
global specialized_mode
if inference_cuda_module is None:
specialized_mode = False
if hasattr(op_builder, 'InferenceSpecializedBuilder'):
builder = op_builder.InferenceSpecializedBuilder()
if builder.is_compatible():
inference_cuda_module = builder.load()
specialized_mode = True
else:
inference_cuda_module = op_builder.InferenceBuilder().load()
# InferenceSpecializedBuilder is not among the DeepSpeed-provided builders yet, so we refer to it by its name string
builder = get_accelerator().create_op_builder("InferenceSpecializedBuilder")
if builder is not None and builder.is_compatible():
inference_cuda_module = builder.load()
specialized_mode = True
else:
inference_cuda_module = op_builder.InferenceBuilder().load()
inference_cuda_module = get_accelerator().create_op_builder(
InferenceBuilder).load()
self.config.specialized_mode = specialized_mode
DeepSpeedMoEInference.layer_id += 1

View File

@ -6,8 +6,8 @@ import math
import torch
from torch import nn
from torch.autograd import Function
from ..op_builder import TransformerBuilder, StochasticTransformerBuilder
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder.builder_names import TransformerBuilder, StochasticTransformerBuilder
# Cuda modules will be imported if needed
transformer_cuda_module = None
@ -481,7 +481,7 @@ class DeepSpeedTransformerLayer(nn.Module):
print("DeepSpeed Transformer config is ", self.config.__dict__)
if self.config.local_rank >= 0:
torch.cuda.set_device(self.config.local_rank)
get_accelerator().set_device(self.config.local_rank)
if initial_weights is None and initial_biases is None:
self.attn_qkvw = nn.Parameter(
@ -531,9 +531,11 @@ class DeepSpeedTransformerLayer(nn.Module):
# Load cuda modules if needed
global transformer_cuda_module, stochastic_transformer_cuda_module
if transformer_cuda_module is None and not self.config.stochastic_mode:
transformer_cuda_module = TransformerBuilder().load()
transformer_cuda_module = get_accelerator().create_op_builder(
TransformerBuilder).load()
if stochastic_transformer_cuda_module is None and self.config.stochastic_mode:
stochastic_transformer_cuda_module = StochasticTransformerBuilder().load()
stochastic_transformer_cuda_module = get_accelerator().create_op_builder(
StochasticTransformerBuilder).load()
# create the layer in cuda kernels.
cuda_module = stochastic_transformer_cuda_module if self.config.stochastic_mode else transformer_cuda_module

op_builder/__init__.py Executable file → Normal file
View File

@ -15,23 +15,3 @@ from .transformer_inference import InferenceBuilder
from .quantizer import QuantizerBuilder
from .spatial_inference import SpatialInferenceBuilder
from .builder import get_default_compute_capabilities, OpBuilder
# TODO: This will be removed eventually once all files referencing ALL_OPS are redirected to op_builder.all_ops
# TODO: infer this list instead of hard-coding it
# List of all available ops
__op_builders__ = [
CPUAdamBuilder(),
CPUAdagradBuilder(),
FusedAdamBuilder(),
FusedLambBuilder(),
SparseAttnBuilder(),
TransformerBuilder(),
StochasticTransformerBuilder(),
AsyncIOBuilder(),
UtilsBuilder(),
QuantizerBuilder(),
InferenceBuilder(),
SpatialInferenceBuilder(),
RandomLTDBuilder()
]
ALL_OPS = {op.name: op for op in __op_builders__}
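
Consumers of the op inventory now import it from the dedicated module, which is resolvable both at installation time and from an installed package:

    from deepspeed.ops.op_builder.all_ops import ALL_OPS

    print(sorted(ALL_OPS.keys()))  # op names such as 'cpu_adam', 'fused_adam', ...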

View File

@ -4,7 +4,11 @@ Copyright 2020 The Microsoft DeepSpeed Team
import os
import pkgutil
import importlib
from deepspeed.accelerator import get_accelerator
try:
# at installation time the top-level accelerator package is visible; otherwise import from deepspeed.accelerator
from accelerator import get_accelerator
except ImportError:
from deepspeed.accelerator import get_accelerator
# List of all available ops

View File

@ -5,7 +5,12 @@ import importlib
# List of all available op builders from deepspeed op_builder
op_builder_dir = "deepspeed.ops.op_builder"
try:
import op_builder # noqa: F401
op_builder_dir = "op_builder"
except ImportError:
op_builder_dir = "deepspeed.ops.op_builder"
op_builder_module = importlib.import_module(op_builder_dir)
__op_builders__ = []

View File

@ -25,13 +25,13 @@ import time
torch_available = True
try:
import torch
from torch.utils.cpp_extension import BuildExtension
except ImportError:
torch_available = False
print('[WARNING] Unable to import torch, pre-compiling ops will be disabled. ' \
'Please visit https://pytorch.org/ to see how to properly install torch on your system.')
from op_builder import ALL_OPS, get_default_compute_capabilities, OpBuilder
from op_builder import get_default_compute_capabilities, OpBuilder
from op_builder.all_ops import ALL_OPS
from op_builder.builder import installed_cuda_version
# fetch rocm state
@ -91,7 +91,9 @@ cmdclass = {}
# For any pre-installed ops force disable ninja
if torch_available:
cmdclass['build_ext'] = BuildExtension.with_options(use_ninja=False)
from accelerator import get_accelerator
cmdclass['build_ext'] = get_accelerator().build_extension().with_options(
use_ninja=False)
if torch_available:
TORCH_MAJOR = torch.__version__.split('.')[0]
@ -195,6 +197,7 @@ if sys.platform == "win32":
# It needs Administrator privilege to create symlinks on Windows.
create_dir_symlink('..\\..\\csrc', '.\\deepspeed\\ops\\csrc')
create_dir_symlink('..\\..\\op_builder', '.\\deepspeed\\ops\\op_builder')
create_dir_symlink('..\\accelerator', '.\\deepspeed\\accelerator')
egg_info.manifest_maker.template = 'MANIFEST_win.in'
# Parse the DeepSpeed version string from version.txt
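
setup.py now obtains the build-extension class from the accelerator instead of importing torch's BuildExtension directly. On the CUDA accelerator this presumably reduces to returning torch's class; a sketch of the assumed hook:

    # cuda_accelerator.py (assumed implementation)
    def build_extension(self):
        from torch.utils.cpp_extension import BuildExtension
        return BuildExtension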