From fc2f378ece3055fddd13cdf309faa9ab66a1b302 Mon Sep 17 00:00:00 2001 From: Alex Hedges Date: Tue, 30 Nov 2021 22:12:29 -0500 Subject: [PATCH] Improve pre-commit hooks (#1602) Co-authored-by: Jeff Rasley --- .pre-commit-config.yaml | 31 ++++++++++------ .../config_templates/template_zero3.json | 2 +- deepspeed/module_inject/replace_module.py | 12 +++--- deepspeed/moe/utils.py | 10 ++--- deepspeed/ops/adam/fused_adam.py | 1 + .../sparse_attention/sparse_self_attention.py | 12 +++--- deepspeed/runtime/config.py | 37 +++++++++---------- deepspeed/runtime/engine.py | 12 +++--- deepspeed/runtime/utils.py | 4 +- deepspeed/runtime/zero/config.py | 5 +-- .../runtime/zero/partition_parameters.py | 9 ++--- deepspeed/runtime/zero/stage3.py | 4 +- .../2020-05-28-fastest-bert-training.md | 10 ++--- docs/_posts/2021-03-08-zero3-offload.md | 6 +-- docs/_posts/2021-05-05-MoQ.md | 2 +- ...021-05-05-inference-kernel-optimization.md | 6 +-- docs/_tutorials/megatron.md | 2 +- docs/code-docs/source/conf.py | 1 + requirements/requirements-autotuning-ml.txt | 2 +- requirements/requirements-dev.txt | 12 +++--- requirements/requirements-readthedocs.txt | 4 +- requirements/requirements.txt | 8 ++-- .../BingBertSquad_run_func_test.py | 1 - .../BingBertSquad_test_common.py | 1 - tests/model/BingBertSquad/__init__.py | 1 - tests/model/Megatron_GPT2/__init__.py | 1 - .../Megatron_GPT2/run_checkpoint_test.py | 1 - tests/model/Megatron_GPT2/run_func_test.py | 1 - .../model/Megatron_GPT2/run_perf_baseline.py | 1 - tests/model/Megatron_GPT2/run_perf_test.py | 1 - tests/model/Megatron_GPT2/test_common.py | 1 - tests/model/run_sanity_check.py | 1 - tests/unit/test_activation_checkpointing.py | 1 + tests/unit/test_checkpointing.py | 1 + tests/unit/test_onebit.py | 1 + tests/unit/test_pipe.py | 1 + tests/unit/test_pipe_module.py | 1 + 37 files changed, 103 insertions(+), 104 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 21be46d62..61fd67b0b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,27 +1,34 @@ - repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v1.2.3 +- repo: meta hooks: - - id: trailing-whitespace - exclude: "DeepSpeedExamples/" + - id: check-hooks-apply + - id: check-useless-excludes + +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + - id: check-case-conflict + - id: check-json + - id: check-symlinks - id: check-yaml - exclude: "DeepSpeedExamples/" + - id: destroyed-symlinks - id: end-of-file-fixer - exclude: "DeepSpeedExamples/" - exclude: "docs/CNAME" + exclude: docs/CNAME + - id: fix-byte-order-marker + - id: fix-encoding-pragma + args: [--remove] - id: mixed-line-ending - exclude: "DeepSpeedExamples/" args: [--fix=lf] + - id: requirements-txt-fixer + - id: trailing-whitespace - repo: https://github.com/pre-commit/mirrors-yapf - rev: v0.29.0 + rev: v0.31.0 hooks: - id: yapf - exclude: "examples/" - repo: https://gitlab.com/daverona/pre-commit-cpp - rev: 0.6.0 + rev: 0.8.0 hooks: - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available args: [] diff --git a/deepspeed/autotuning/config_templates/template_zero3.json b/deepspeed/autotuning/config_templates/template_zero3.json index 49a1c7788..e00f47f65 100644 --- a/deepspeed/autotuning/config_templates/template_zero3.json +++ b/deepspeed/autotuning/config_templates/template_zero3.json @@ -12,6 +12,6 @@ "stage3_prefetch_bucket_size": 5e8, "stage3_param_persistence_threshold": 1e6, 
"stage3_gather_fp16_weights_on_model_save": false, - "sub_group_size": 1e12, + "sub_group_size": 1e12 } } diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 9e60e3583..b6e7f7c79 100755 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -450,12 +450,12 @@ def replace_transformer_layer(orig_layer_impl, else: # copy relevant state from child -> new module if replace_with_kernel_inject: - new_module = replace_with_policy( - child, - _policy, - inference=True, - preln=(_policy is not HFBertLayerPolicy), - layer_id=layer_id) + new_module = replace_with_policy(child, + _policy, + inference=True, + preln=(_policy + is not HFBertLayerPolicy), + layer_id=layer_id) else: new_module = replace_wo_policy(child, _policy) diff --git a/deepspeed/moe/utils.py b/deepspeed/moe/utils.py index 3c40630d3..91ba621b6 100644 --- a/deepspeed/moe/utils.py +++ b/deepspeed/moe/utils.py @@ -9,9 +9,8 @@ def is_moe_param(param: torch.Tensor) -> bool: def split_params_into_shared_and_expert_params( - params: List[torch.nn.Parameter] -) -> Tuple[torch.nn.Parameter, - torch.nn.Parameter]: + params: List[torch.nn.Parameter]) -> Tuple[torch.nn.Parameter, + torch.nn.Parameter]: shared_params, expert_params = [], [] for p in params: if is_moe_param(p): @@ -22,9 +21,8 @@ def split_params_into_shared_and_expert_params( def split_params_grads_into_shared_and_expert_params( - group: List[torch.nn.Parameter] -) -> Tuple[torch.nn.Parameter, - torch.nn.Parameter]: + group: List[torch.nn.Parameter]) -> Tuple[torch.nn.Parameter, + torch.nn.Parameter]: """Split grad of parameters into grads of non-expert params and grads of expert params. This is useful while computing grad-norms for clipping and overflow detection diff --git a/deepspeed/ops/adam/fused_adam.py b/deepspeed/ops/adam/fused_adam.py index ae7c5fac8..80e5bf565 100644 --- a/deepspeed/ops/adam/fused_adam.py +++ b/deepspeed/ops/adam/fused_adam.py @@ -8,6 +8,7 @@ This file is adapted from fused adam in NVIDIA/apex, commit a109f85 import torch import importlib from .multi_tensor_apply import MultiTensorApply + multi_tensor_applier = MultiTensorApply(2048 * 32) from ..op_builder import FusedAdamBuilder diff --git a/deepspeed/ops/sparse_attention/sparse_self_attention.py b/deepspeed/ops/sparse_attention/sparse_self_attention.py index bef6b9583..5ccfa1319 100644 --- a/deepspeed/ops/sparse_attention/sparse_self_attention.py +++ b/deepspeed/ops/sparse_attention/sparse_self_attention.py @@ -18,12 +18,12 @@ class SparseSelfAttention(nn.Module): For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial. """ def __init__( - self, - # SparsityConfig parameters needs to be set accordingly - sparsity_config=SparsityConfig(num_heads=4), - key_padding_mask_mode='add', - attn_mask_mode='mul', - max_seq_length=2048): + self, + # SparsityConfig parameters needs to be set accordingly + sparsity_config=SparsityConfig(num_heads=4), + key_padding_mask_mode='add', + attn_mask_mode='mul', + max_seq_length=2048): """Initialize the sparse self attention layer. Arguments: sparsity_config: optional: this parameter determines sparsity pattern configuration; it is based on SparsityConfig class. 
diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 013c1f77f..5cb6deb97 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -284,32 +284,31 @@ def get_quantize_training(param_dict): if QUANTIZE_SCHEDULE in param_dict[QUANTIZE_TRAINING].keys() and SCHEDULE_OFFSET in param_dict[QUANTIZE_TRAINING][QUANTIZE_SCHEDULE].keys() else QUANTIZE_OFFSET_DEFAULT), - (param_dict[QUANTIZE_TRAINING][QUANTIZE_GROUPS] - if QUANTIZE_GROUPS in param_dict[QUANTIZE_TRAINING].keys() else - QUANTIZE_GROUPS_DEFAULT), + (param_dict[QUANTIZE_TRAINING][QUANTIZE_GROUPS] if QUANTIZE_GROUPS + in param_dict[QUANTIZE_TRAINING].keys() else QUANTIZE_GROUPS_DEFAULT), (param_dict[QUANTIZE_TRAINING][FP16_MIXED_QUANTIZE] [FP16_MIXED_QUANTIZE_ENABLED] if FP16_MIXED_QUANTIZE in param_dict[QUANTIZE_TRAINING].keys() - and FP16_MIXED_QUANTIZE_ENABLED in param_dict[QUANTIZE_TRAINING] - [FP16_MIXED_QUANTIZE].keys() else FP16_MIXED_QUANTIZE_ENABLED_DEFAULT), + and FP16_MIXED_QUANTIZE_ENABLED + in param_dict[QUANTIZE_TRAINING][FP16_MIXED_QUANTIZE].keys() else + FP16_MIXED_QUANTIZE_ENABLED_DEFAULT), (param_dict[QUANTIZE_TRAINING][FP16_MIXED_QUANTIZE][QUANTIZE_CHANGE_RATIO] if FP16_MIXED_QUANTIZE in param_dict[QUANTIZE_TRAINING].keys() - and QUANTIZE_CHANGE_RATIO in param_dict[QUANTIZE_TRAINING] - [FP16_MIXED_QUANTIZE].keys() else QUANTIZE_CHANGE_RATIO_DEFAULT), + and QUANTIZE_CHANGE_RATIO + in param_dict[QUANTIZE_TRAINING][FP16_MIXED_QUANTIZE].keys() else + QUANTIZE_CHANGE_RATIO_DEFAULT), (1 if QUANTIZE_ALGO in param_dict[QUANTIZE_TRAINING] and QUANTIZE_TYPE in param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO].keys() - and param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO][QUANTIZE_TYPE] == - QUANTIZE_ASYMMETRIC else QUANTIZE_TYPE_DEFAULT), - (1 if QUANTIZE_ALGO in param_dict[QUANTIZE_TRAINING] and - QUANTIZE_ROUNDING in param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO].keys() - and param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO][QUANTIZE_ROUNDING] == - STOCHASTIC_ROUNDING else QUANTIZE_ROUNDING_DEFAULT), - (param_dict[QUANTIZE_TRAINING][QUANTIZE_VERBOSE] - if QUANTIZE_VERBOSE in param_dict[QUANTIZE_TRAINING].keys() else - QUANTIZE_VERBOSE_DEFAULT), - (param_dict[QUANTIZE_TRAINING][QUANTIZER_KERNEL] - if QUANTIZER_KERNEL in param_dict[QUANTIZE_TRAINING].keys() else - QUANTIZER_KERNEL_DEFAULT), + and param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO][QUANTIZE_TYPE] + == QUANTIZE_ASYMMETRIC else QUANTIZE_TYPE_DEFAULT), + (1 if QUANTIZE_ALGO in param_dict[QUANTIZE_TRAINING] and QUANTIZE_ROUNDING + in param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO].keys() + and param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO][QUANTIZE_ROUNDING] + == STOCHASTIC_ROUNDING else QUANTIZE_ROUNDING_DEFAULT), + (param_dict[QUANTIZE_TRAINING][QUANTIZE_VERBOSE] if QUANTIZE_VERBOSE + in param_dict[QUANTIZE_TRAINING].keys() else QUANTIZE_VERBOSE_DEFAULT), + (param_dict[QUANTIZE_TRAINING][QUANTIZER_KERNEL] if QUANTIZER_KERNEL + in param_dict[QUANTIZE_TRAINING].keys() else QUANTIZER_KERNEL_DEFAULT), ) else: return ( diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 3f1438a35..7aabe7926 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -511,10 +511,10 @@ class DeepSpeedEngine(Module): return self._config.tensorboard_job_name def get_summary_writer( - self, - name="DeepSpeedJobName", - base=os.path.join(os.path.expanduser("~"), - "tensorboard"), + self, + name="DeepSpeedJobName", + base=os.path.join(os.path.expanduser("~"), + "tensorboard"), ): if self.tensorboard_output_path(): base_dir = 
self.tensorboard_output_path() @@ -1570,8 +1570,8 @@ class DeepSpeedEngine(Module): else: see_memory_usage("Engine before forward", force=self.memory_breakdown()) - flops_profiler_active = (self.flops_profiler_enabled() and - self.global_steps == self.flops_profiler_profile_step() + flops_profiler_active = (self.flops_profiler_enabled() and self.global_steps + == self.flops_profiler_profile_step() and self.global_rank == 0) if flops_profiler_active: diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index a45d0c256..4bd5a5372 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -354,8 +354,8 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None): total_norm = 0 for p in parameters: if mpu is not None: - if (mpu.get_model_parallel_rank() == 0 - ) or is_model_parallel_parameter(p): + if (mpu.get_model_parallel_rank() + == 0) or is_model_parallel_parameter(p): param_norm = p.grad.data.norm(norm_type) total_norm += param_norm.item()**norm_type else: diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index a48dd4e62..782d4d9e3 100755 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -104,9 +104,8 @@ class DeepSpeedZeroConfig(DeepSpeedConfigObject): self.overlap_comm = get_scalar_param( zero_config_dict, ZERO_OPTIMIZATION_OVERLAP_COMM, - ZERO3_OPTIMIZATION_OVERLAP_COMM_DEFAULT - if self.stage == ZERO_OPTIMIZATION_WEIGHTS else - ZERO_OPTIMIZATION_OVERLAP_COMM_DEFAULT) + ZERO3_OPTIMIZATION_OVERLAP_COMM_DEFAULT if self.stage + == ZERO_OPTIMIZATION_WEIGHTS else ZERO_OPTIMIZATION_OVERLAP_COMM_DEFAULT) self.allgather_partitions = get_scalar_param( zero_config_dict, diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 665ff0704..4efa99c88 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -488,8 +488,8 @@ class Init(InsertPostInitMethodToModuleSubClasses): # Remote device is the device where parameter partiitons are stored # It can be same as local_device or it could be CPU or NVMe. 
self.remote_device = self.local_device if remote_device is None else remote_device - self.pin_memory = pin_memory if ( - self.remote_device == OFFLOAD_CPU_DEVICE) else False + self.pin_memory = pin_memory if (self.remote_device + == OFFLOAD_CPU_DEVICE) else False # Enable fp16 param swapping to NVMe if self.remote_device == OFFLOAD_NVME_DEVICE: @@ -783,9 +783,8 @@ class Init(InsertPostInitMethodToModuleSubClasses): partitioned_tensor = torch.empty( partition_size, dtype=param.dtype, - device=OFFLOAD_CPU_DEVICE - if self.remote_device == OFFLOAD_NVME_DEVICE else - self.remote_device) + device=OFFLOAD_CPU_DEVICE if self.remote_device + == OFFLOAD_NVME_DEVICE else self.remote_device) if self.pin_memory: partitioned_tensor = partitioned_tensor.pin_memory() diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index ae2b4e14f..4c1efb028 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -2733,8 +2733,8 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object): self.optimizer_swapper.swap_out_optimizer_state( parameter=self.fp32_partitioned_groups_flat[sub_group_id], - async_swap=self.next_swappable_fp32_partitioned_groups[sub_group_id] is - not None) + async_swap=self.next_swappable_fp32_partitioned_groups[sub_group_id] + is not None) self.stop_timers([OPTIMIZER_SWAP_OUT_STATE]) see_memory_usage( diff --git a/docs/_posts/2020-05-28-fastest-bert-training.md b/docs/_posts/2020-05-28-fastest-bert-training.md index 45ca0618e..923319449 100644 --- a/docs/_posts/2020-05-28-fastest-bert-training.md +++ b/docs/_posts/2020-05-28-fastest-bert-training.md @@ -16,11 +16,11 @@ example, DeepSpeed can attain a staggering 64 teraflops of single GPU performance on a NVIDIA V100 GPU which is over 50% of the hardware peak. In this blog post, we will discuss four technological improvements that enable -DeepSpeed to achieve this record-breaking BERT training time. +DeepSpeed to achieve this record-breaking BERT training time. 1. Highly optimized transformer kernels to improve compute efficiency 2. Overlapping I/O with computation through asynchronous prefetching queue -3. Sparse output processing to eliminate wasteful computation +3. Sparse output processing to eliminate wasteful computation 4. Layer-norm reordering for training stability and faster convergence These optimizations not only benefit BERT; they are also applicable to many @@ -143,7 +143,7 @@ transferring data to and from global memory and overhead from kernel launching. Existing compiler-based approaches perform fine-grained fusion (e.g., fusion of element-wise operations), leading to missed fusion opportunities. In contrast, we fully exploit both fine-grain and coarse-grained fusion, tailored for -Transformer blocks. +Transformer blocks. **QKV and various fusions.** We merge the three Query (Q), Key (K), and Value (V) weight matrices to dispatch a larger QKV GEMM to expose more parallelism and @@ -160,7 +160,7 @@ order to rearrange the data in a way that we can put the data in consecutive parts of memory. Even though we produce an uncoalesced pattern when accessing shared memory, we reduce the cost of uncoalesced access to main memory to better exploit memory bandwidth, resulting in 3% to 5% performance improvement -in the end-to-end training. +in the end-to-end training. ![QKV-Fusion](/assets/images/qkv_fusion.png){: .align-center} @@ -280,7 +280,7 @@ a modification described by several recent works for neural machine translation. 
The Pre-LayerNorm results in several useful characteristics such as avoiding vanishing gradient, stable optimization, and performance gain. It allows us to train at aggregated batch size of 64K with increased learning rate -and faster convergence. +and faster convergence. To try out these optimizations and training recipe, please check out our [BERT diff --git a/docs/_posts/2021-03-08-zero3-offload.md b/docs/_posts/2021-03-08-zero3-offload.md index 3fba666ea..6c4f5ab2e 100644 --- a/docs/_posts/2021-03-08-zero3-offload.md +++ b/docs/_posts/2021-03-08-zero3-offload.md @@ -12,7 +12,7 @@ Today we are announcing the release of ZeRO-3 Offload, a highly efficient and ea * Extremely Easy to use: * Scale to over a trillion parameters without the need to combine multiple parallelism techniques in complicated ways. * For existing DeepSpeed users, turn on ZeRO-3 Offload with just a few flags in DeepSpeed Config file. -* High-performance per-GPU throughput and super-linear scalability across GPUs for distributed training. +* High-performance per-GPU throughput and super-linear scalability across GPUs for distributed training. * With 1 Trillion parameters, ZeRO-3 Offload sustains 25 PetaFlops in compute performance on 512 NVIDIA V100 GPUs, achieving 49 TFlops/GPU. * Up to 2x improvement in throughput compared to ZeRO- 2 Offload on single GPU @@ -64,7 +64,7 @@ i) With ground-breaking memory efficiency, ZeRO-3 and ZeRO-3 Offload are the onl ii) ZeRO-3 Offload requires virtually no model refactoring from model scientists, liberating data scientists to scale up complex models to hundreds of billions to trillions of parameters.

Excellent training efficiency

-High-performance per-GPU throughput on multiple nodes: ZeRO-3 Offload offers excellent training efficiency for multi-billion and trillion parameter models on multiple nodes. It achieves a sustained throughput of up to 50 Tflops per GPU running on 32 DGX2 nodes comprising 512 NVIDIA V100 GPUs (see Figure 2). In comparison, the standard data parallel training with PyTorch can only achieve 30 TFlops per GPU for a 1.2B parameter model, the largest model that can be trained using data parallelism alone. +High-performance per-GPU throughput on multiple nodes: ZeRO-3 Offload offers excellent training efficiency for multi-billion and trillion parameter models on multiple nodes. It achieves a sustained throughput of up to 50 Tflops per GPU running on 32 DGX2 nodes comprising 512 NVIDIA V100 GPUs (see Figure 2). In comparison, the standard data parallel training with PyTorch can only achieve 30 TFlops per GPU for a 1.2B parameter model, the largest model that can be trained using data parallelism alone. @@ -74,7 +74,7 @@ Figure 2. ZeRO-3 Offload: Multi-billion and trillion parameter model throughput ZeRO-3 Offload obtains high efficiency despite the 50% communication overhead of ZeRO Stage 3 compared to standard data parallel training for a fixed batch size. This is made possible through a communication overlap centric design and implementation, which allows ZeRO-3 Offload to hide nearly all of the communication volume with computation, while taking advantage of a larger batch size for improved efficiency resulting from better GPU memory efficiency. -Efficient multi-billion parameter model training on a single GPU: ZeRO-3 Offload further democratizes AI by enabling efficient training of multi-billion parameter models on a single GPU. For single GPU training, ZeRO-3 Offload provides benefits over ZeRO-2 Offload along two dimensions. First, ZeRO-3 Offload increases the size of models trainable on a single V100 from 13B to 40B. Second, for ZeRO-3 Offload provides speedups (e.g., 2.3X for 13B) compared to ZeRO-2 Offload for model sizes trainable by both solutions. These results are summarized in Figure 3. +Efficient multi-billion parameter model training on a single GPU: ZeRO-3 Offload further democratizes AI by enabling efficient training of multi-billion parameter models on a single GPU. For single GPU training, ZeRO-3 Offload provides benefits over ZeRO-2 Offload along two dimensions. First, ZeRO-3 Offload increases the size of models trainable on a single V100 from 13B to 40B. Second, for ZeRO-3 Offload provides speedups (e.g., 2.3X for 13B) compared to ZeRO-2 Offload for model sizes trainable by both solutions. These results are summarized in Figure 3. diff --git a/docs/_posts/2021-05-05-MoQ.md b/docs/_posts/2021-05-05-MoQ.md index a0ea351ca..95dd8c7d1 100644 --- a/docs/_posts/2021-05-05-MoQ.md +++ b/docs/_posts/2021-05-05-MoQ.md @@ -53,7 +53,7 @@ For enabling quantization through Deepspeed, we only need to pass the scheduling ## Improving quantization accuracy. -To show how our quantization scheme preserves accuracy, we have experimented MoQ on several tasks and networks: GLUE tasks on Bert-Base and SQuAD on Bert-Large. Table 1 shows the accuracy results for the baseline without quantization (w/o Quant), basic quantization without using any scheduling during training (Basic Quant), and our MoQ scheme. Without using any scheduling, the accuracy for 8-bit quantization is often inferior to the baseline, and in this workload, it suffers from a drop of 1.02 point in accuracy (ACC). 
In contrast, MoQ powers 8-bit quantization to obtain comparable accuracy as the FP16 baseline, even with a slightly higher ACC, demonstrating the effectiveness of our quantization approach. +To show how our quantization scheme preserves accuracy, we have experimented MoQ on several tasks and networks: GLUE tasks on Bert-Base and SQuAD on Bert-Large. Table 1 shows the accuracy results for the baseline without quantization (w/o Quant), basic quantization without using any scheduling during training (Basic Quant), and our MoQ scheme. Without using any scheduling, the accuracy for 8-bit quantization is often inferior to the baseline, and in this workload, it suffers from a drop of 1.02 point in accuracy (ACC). In contrast, MoQ powers 8-bit quantization to obtain comparable accuracy as the FP16 baseline, even with a slightly higher ACC, demonstrating the effectiveness of our quantization approach. |Task |STSB |MRPC |COLA |WNLI |SST2 |RTE |QNLI |QQP |MNLI |SQuAD|ACC+ | |-------------|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----| diff --git a/docs/_posts/2021-05-05-inference-kernel-optimization.md b/docs/_posts/2021-05-05-inference-kernel-optimization.md index 9b9e747a2..2042703f4 100644 --- a/docs/_posts/2021-05-05-inference-kernel-optimization.md +++ b/docs/_posts/2021-05-05-inference-kernel-optimization.md @@ -46,19 +46,19 @@ To run the model in Inference mode, DeepSpeed simply requires the location of th To further reduce the inference cost for large-scale models, we created the DeepSpeed Quantization Toolkit, supporting flexible quantize-aware training and high-performance kernels for quantized inference. -For training, we introduce a novel approach called Mixture of Quantization (MoQ), which is inspired by mixed-precision training while seamlessly applying quantization. With MoQ, we can control the precision of the model by simulating the impact of quantization when updating the parameters at each step of training. Moreover, it supports flexible quantization policies and schedules—we find that by dynamically adjusting the number of quantization bits during training, the final quantized model provides higher accuracy under the same compression ratio. To adapt to different tasks, MoQ can also leverage the second order information of models to detect their sensitivity to precision and adjust the quantization schedule and target accordingly. +For training, we introduce a novel approach called Mixture of Quantization (MoQ), which is inspired by mixed-precision training while seamlessly applying quantization. With MoQ, we can control the precision of the model by simulating the impact of quantization when updating the parameters at each step of training. Moreover, it supports flexible quantization policies and schedules—we find that by dynamically adjusting the number of quantization bits during training, the final quantized model provides higher accuracy under the same compression ratio. To adapt to different tasks, MoQ can also leverage the second order information of models to detect their sensitivity to precision and adjust the quantization schedule and target accordingly. To maximize the performance gains from the quantization model, we provide inference kernels tailored for quantized models that reduce latency through optimizing data movement but do not require specialized hardware. Finally, our toolkit does not require any code changes on the client side, making it easy to use. ## Performance results -Boosting throughput and reducing inference cost. 
Figure 3 shows the inference throughput per GPU for the three model sizes corresponding to the three Transformer networks, GPT-2, Turing-NLG, and GPT-3. DeepSpeed Inference increases in per-GPU throughput by 2 to 4 times when using the same precision of FP16 as the baseline. By enabling quantization, we boost throughput further. We reach a throughput improvement of 3x for GPT-2, 5x for Turing-NLG, and 3x for a model that is similar in characteristics and size to GPT-3, which directly translates to 3–5x inference cost reduction on serving these large models. In addition, we achieve these throughput and cost improvements without compromising latency as shown in Figure 5. +Boosting throughput and reducing inference cost. Figure 3 shows the inference throughput per GPU for the three model sizes corresponding to the three Transformer networks, GPT-2, Turing-NLG, and GPT-3. DeepSpeed Inference increases in per-GPU throughput by 2 to 4 times when using the same precision of FP16 as the baseline. By enabling quantization, we boost throughput further. We reach a throughput improvement of 3x for GPT-2, 5x for Turing-NLG, and 3x for a model that is similar in characteristics and size to GPT-3, which directly translates to 3–5x inference cost reduction on serving these large models. In addition, we achieve these throughput and cost improvements without compromising latency as shown in Figure 5. ![Inference-Throughput](/assets/images/inference-throughput.png){: .align-center} Figure 3: Inference throughput for different model sizes. DeepSpeed Inference achieves 3x to 5x higher throughput than baseline. -One source of inference cost reduction is through reducing the number of GPUs for hosting large models as shown in Figure 4. The optimized GPU resources comes from 1) using inference-adapted parallelism, allowing users to adjust the model and pipeline parallelism degree from the trained model checkpoints, and 2) shrinking model memory footprint by half with INT8 quantization. As shown in this figure, we use 2x less GPUs to run inference for the 17B model size by adapting the parallelism. Together with INT8 quantization through DeepSpeed MoQ, we use 4x and 2x fewer GPUs for 17B and 175B sizes respectively. +One source of inference cost reduction is through reducing the number of GPUs for hosting large models as shown in Figure 4. The optimized GPU resources comes from 1) using inference-adapted parallelism, allowing users to adjust the model and pipeline parallelism degree from the trained model checkpoints, and 2) shrinking model memory footprint by half with INT8 quantization. As shown in this figure, we use 2x less GPUs to run inference for the 17B model size by adapting the parallelism. Together with INT8 quantization through DeepSpeed MoQ, we use 4x and 2x fewer GPUs for 17B and 175B sizes respectively. ![Inference-Throughput](/assets/images/gpu-numbers.png){: .align-center} diff --git a/docs/_tutorials/megatron.md b/docs/_tutorials/megatron.md index db4851468..a8d5974d9 100644 --- a/docs/_tutorials/megatron.md +++ b/docs/_tutorials/megatron.md @@ -373,7 +373,7 @@ optimizer](https://arxiv.org/abs/1910.02054v2). In February, we released a sub-s of optimizations from ZeRO in DeepSpeed that performs optimizer state partitioning. We refer to them as ZeRO-1. In May, 2020 we extended ZeRO-1 in DeepSpeed to include additional optimizations from ZeRO including gradient and activation partitioning, -as well as contiguous memory optimizations. We refer to this release as ZeRO-2. 
+as well as contiguous memory optimizations. We refer to this release as ZeRO-2. ZeRO-2 significantly reduces the memory footprint for training large models which means large models can be trained with i) less diff --git a/docs/code-docs/source/conf.py b/docs/code-docs/source/conf.py index 1fa3be97d..cb00d0d6b 100644 --- a/docs/code-docs/source/conf.py +++ b/docs/code-docs/source/conf.py @@ -72,6 +72,7 @@ html_context = { # Mock imports so we don't have to install torch to build the docs. from unittest.mock import MagicMock + sys.path.insert(0, os.path.abspath('../../../')) # Prepend module names to class descriptions? diff --git a/requirements/requirements-autotuning-ml.txt b/requirements/requirements-autotuning-ml.txt index c3e5824f3..8b1906430 100644 --- a/requirements/requirements-autotuning-ml.txt +++ b/requirements/requirements-autotuning-ml.txt @@ -1,3 +1,3 @@ hjson -xgboost tabulate +xgboost diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 5f2c87b34..fa8abebd3 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1,11 +1,11 @@ +clang-format +docutils<0.18 +importlib-metadata>=4 +megatron-lm==1.1.5 +pre-commit pytest pytest-forked pytest-randomly -pre-commit -clang-format -sphinx recommonmark +sphinx sphinx-rtd-theme -megatron-lm==1.1.5 -importlib-metadata>=4 -docutils<0.18 diff --git a/requirements/requirements-readthedocs.txt b/requirements/requirements-readthedocs.txt index 677228512..f3ffe3b61 100644 --- a/requirements/requirements-readthedocs.txt +++ b/requirements/requirements-readthedocs.txt @@ -1,5 +1,5 @@ -tqdm -psutil docutils<0.18 hjson +psutil torch +tqdm diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 80a4d727f..dae432dec 100755 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,7 +1,7 @@ -torch -tqdm +hjson ninja numpy -psutil packaging -hjson +psutil +torch +tqdm diff --git a/tests/model/BingBertSquad/BingBertSquad_run_func_test.py b/tests/model/BingBertSquad/BingBertSquad_run_func_test.py index 33e5c1846..90e6858e8 100755 --- a/tests/model/BingBertSquad/BingBertSquad_run_func_test.py +++ b/tests/model/BingBertSquad/BingBertSquad_run_func_test.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. # # Note: please copy webtext data to "Megatron-LM" folder, before running this script. diff --git a/tests/model/BingBertSquad/BingBertSquad_test_common.py b/tests/model/BingBertSquad/BingBertSquad_test_common.py index 940c18cf8..a9678bb69 100755 --- a/tests/model/BingBertSquad/BingBertSquad_test_common.py +++ b/tests/model/BingBertSquad/BingBertSquad_test_common.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. # diff --git a/tests/model/BingBertSquad/__init__.py b/tests/model/BingBertSquad/__init__.py index 3e53472dd..e122adbdf 100755 --- a/tests/model/BingBertSquad/__init__.py +++ b/tests/model/BingBertSquad/__init__.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. from .BingBertSquad_run_func_test import BingBertSquadFuncTestCase diff --git a/tests/model/Megatron_GPT2/__init__.py b/tests/model/Megatron_GPT2/__init__.py index 4577058a0..2451ec7ae 100644 --- a/tests/model/Megatron_GPT2/__init__.py +++ b/tests/model/Megatron_GPT2/__init__.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. 
# # Note: please copy webtext data to "Megatron-LM" folder, before running this script. diff --git a/tests/model/Megatron_GPT2/run_checkpoint_test.py b/tests/model/Megatron_GPT2/run_checkpoint_test.py index cf11af6c2..fe564d4fd 100755 --- a/tests/model/Megatron_GPT2/run_checkpoint_test.py +++ b/tests/model/Megatron_GPT2/run_checkpoint_test.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. # # Note: please copy webtext data to "Megatron-LM" folder, before running this script. diff --git a/tests/model/Megatron_GPT2/run_func_test.py b/tests/model/Megatron_GPT2/run_func_test.py index f8ab5bcb3..463aa1f94 100755 --- a/tests/model/Megatron_GPT2/run_func_test.py +++ b/tests/model/Megatron_GPT2/run_func_test.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. # # Note: please copy webtext data to "Megatron-LM" folder, before running this script. diff --git a/tests/model/Megatron_GPT2/run_perf_baseline.py b/tests/model/Megatron_GPT2/run_perf_baseline.py index e7fb1c722..f30e9cfe9 100755 --- a/tests/model/Megatron_GPT2/run_perf_baseline.py +++ b/tests/model/Megatron_GPT2/run_perf_baseline.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. # # Note: please copy webtext data to "Megatron-LM" folder, before running this script. diff --git a/tests/model/Megatron_GPT2/run_perf_test.py b/tests/model/Megatron_GPT2/run_perf_test.py index fbf816427..64b20f486 100755 --- a/tests/model/Megatron_GPT2/run_perf_test.py +++ b/tests/model/Megatron_GPT2/run_perf_test.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. # # Note: please copy webtext data to "Megatron-LM" folder, before running this script. diff --git a/tests/model/Megatron_GPT2/test_common.py b/tests/model/Megatron_GPT2/test_common.py index ae1dd328d..04b3e4a23 100755 --- a/tests/model/Megatron_GPT2/test_common.py +++ b/tests/model/Megatron_GPT2/test_common.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. # diff --git a/tests/model/run_sanity_check.py b/tests/model/run_sanity_check.py index b7d12ba18..2f020b52d 100755 --- a/tests/model/run_sanity_check.py +++ b/tests/model/run_sanity_check.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. # # Note: please copy webtext data to "Megatron-LM" folder, before running this script. 
diff --git a/tests/unit/test_activation_checkpointing.py b/tests/unit/test_activation_checkpointing.py index 73ee6a25d..6c96b5a3d 100644 --- a/tests/unit/test_activation_checkpointing.py +++ b/tests/unit/test_activation_checkpointing.py @@ -7,6 +7,7 @@ import pytest import torch import deepspeed + ckpt = deepspeed.checkpointing.checkpoint from common import distributed_test diff --git a/tests/unit/test_checkpointing.py b/tests/unit/test_checkpointing.py index 9f53ce848..6d0a12808 100755 --- a/tests/unit/test_checkpointing.py +++ b/tests/unit/test_checkpointing.py @@ -10,6 +10,7 @@ from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer from deepspeed.runtime.pipe.topology import * + PipeTopo = PipeDataParallelTopology from deepspeed.ops.op_builder import FusedLambBuilder, CPUAdamBuilder diff --git a/tests/unit/test_onebit.py b/tests/unit/test_onebit.py index 18a0244c3..2c09220da 100644 --- a/tests/unit/test_onebit.py +++ b/tests/unit/test_onebit.py @@ -12,6 +12,7 @@ import numpy as np import time from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology + PipeTopo = PipeDataParallelTopology from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec from common import distributed_test diff --git a/tests/unit/test_pipe.py b/tests/unit/test_pipe.py index 65ae0023b..495d4d72e 100755 --- a/tests/unit/test_pipe.py +++ b/tests/unit/test_pipe.py @@ -13,6 +13,7 @@ import deepspeed.runtime.utils as ds_utils from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology + PipeTopo = PipeDataParallelTopology from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec diff --git a/tests/unit/test_pipe_module.py b/tests/unit/test_pipe_module.py index a29d22a2a..e4eb3e538 100644 --- a/tests/unit/test_pipe_module.py +++ b/tests/unit/test_pipe_module.py @@ -9,6 +9,7 @@ import pytest import deepspeed from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology + PipeTopo = PipeDataParallelTopology from deepspeed.pipe import PipelineModule, LayerSpec
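
Note: the newly added `fix-encoding-pragma` hook with `args: [--remove]` is what drives the many `# coding=utf-8` deletions in the test files above. As a rough, hypothetical sketch of that hook's effect on a single file (not its actual implementation, which lives in pre-commit/pre-commit-hooks):

# Illustrative only: approximates what `fix-encoding-pragma --remove` does to
# each file pre-commit passes it. Not the real hook's code.
import re
import sys

# PEP 263 allows an encoding pragma on line 1 or 2, e.g.
# "# -*- coding: utf-8 -*-" or "# coding=utf-8".
PRAGMA_RE = re.compile(rb"^#.*coding[:=]\s*[-\w.]+")


def strip_encoding_pragma(path: str) -> bool:
    """Remove an encoding pragma from the first two lines; return True if modified."""
    with open(path, "rb") as f:
        lines = f.readlines()
    kept = [
        line for i, line in enumerate(lines)
        if not (i < 2 and PRAGMA_RE.match(line))
    ]
    if kept == lines:
        return False
    with open(path, "wb") as f:
        f.writelines(kept)
    return True


if __name__ == "__main__":
    changed = [p for p in sys.argv[1:] if strip_encoding_pragma(p)]
    # A non-zero exit tells pre-commit that files were modified.
    sys.exit(1 if changed else 0)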