diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index eeefecd0e..d5949733a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -22,8 +22,8 @@ repos:
     - id: requirements-txt-fixer
     - id: trailing-whitespace

-- repo: https://github.com/pre-commit/mirrors-yapf
-  rev: v0.31.0
+- repo: https://github.com/google/yapf
+  rev: v0.32.0
   hooks:
     - id: yapf

diff --git a/.style.yapf b/.style.yapf
index 4a4850fe4..be8721dd3 100644
--- a/.style.yapf
+++ b/.style.yapf
@@ -1,3 +1,3 @@
 [style]
-SPLIT_ALL_COMMA_SEPARATED_VALUES = true
-COLUMN_LIMIT = 89
+SPLIT_ALL_COMMA_SEPARATED_VALUES = false
+COLUMN_LIMIT = 119
diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py
index 03dfbe9df..df5080a3a 100644
--- a/accelerator/abstract_accelerator.py
+++ b/accelerator/abstract_accelerator.py
@@ -5,6 +5,7 @@ from abc import ABC


 class DeepSpeedAccelerator(ABC):
+
     def __init__(self):
         self._name = None
         self._communication_backend_name = None
diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py
index 945ba42a3..3e0d1e136 100644
--- a/accelerator/cuda_accelerator.py
+++ b/accelerator/cuda_accelerator.py
@@ -14,6 +14,7 @@ except ImportError:


 class CUDA_Accelerator(DeepSpeedAccelerator):
+
     def __init__(self):
         self._name = 'cuda'
         self._communication_backend_name = 'nccl'
@@ -26,9 +27,7 @@ class CUDA_Accelerator(DeepSpeedAccelerator):
         for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(op_builder_module.__file__)]):
             # avoid self references
             if module_name != 'all_ops' and module_name != 'builder':
-                module = importlib.import_module("{}.{}".format(
-                    op_builder_dir,
-                    module_name))
+                module = importlib.import_module("{}.{}".format(op_builder_dir, module_name))
                 for member_name in module.__dir__():
                     if member_name.endswith(
                             'Builder'
diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py
index 06cbb0b08..adf491392 100644
--- a/accelerator/real_accelerator.py
+++ b/accelerator/real_accelerator.py
@@ -23,13 +23,8 @@ def _validate_accelerator(accel_obj):
     # accelerator.abstractor_accelerator
     # or deepspeed.accelerator.abstract_accelerator, consider accel_obj
     # is a conforming object
-    if not ((dsa1 != None and isinstance(accel_obj,
-                                         dsa1)) or
-            (dsa2 != None and isinstance(accel_obj,
-                                         dsa2))):
-        raise AssertionError(
-            f'{accel_obj.__class__.__name__} accelerator is not subclass of DeepSpeedAccelerator'
-        )
+    if not ((dsa1 != None and isinstance(accel_obj, dsa1)) or (dsa2 != None and isinstance(accel_obj, dsa2))):
+        raise AssertionError(f'{accel_obj.__class__.__name__} accelerator is not subclass of DeepSpeedAccelerator')

     # TODO: turn off is_available test since this breaks tests
     #assert accel_obj.is_available(), \
diff --git a/benchmarks/communication/all_gather.py b/benchmarks/communication/all_gather.py
index dc97267b3..cee7501c8 100644
--- a/benchmarks/communication/all_gather.py
+++ b/benchmarks/communication/all_gather.py
@@ -22,9 +22,7 @@ def timed_all_gather(input, output, args):
         if hasattr(torch.distributed, "_all_gather_base"):
             dist._all_gather_base(output, input, group=None, async_op=args.async_op)
         else:
-            output_tensors = list(
-                torch.chunk(output_tensor,
-                            cdb.get_world_size(group)))
+            output_tensors = list(torch.chunk(output_tensor, cdb.get_world_size(group)))
             dist.all_gather(output_tensors, input_tensor, group=group, async_op=True)
     elif args.dist == 'deepspeed':
         dist.allgather_fn(output, input, group=None, async_op=args.async_op)
@@ -38,9 +36,7 @@ def timed_all_gather(input, output, args):
         if
hasattr(torch.distributed, "_all_gather_base"): dist._all_gather_base(output, input, group=None, async_op=args.async_op) else: - output_tensors = list( - torch.chunk(output_tensor, - cdb.get_world_size(group))) + output_tensors = list(torch.chunk(output_tensor, cdb.get_world_size(group))) dist.all_gather(output_tensors, input_tensor, group=group, async_op=True) elif args.dist == 'deepspeed': dist.allgather_fn(output, input, group=None, async_op=args.async_op) @@ -58,8 +54,7 @@ def timed_all_gather(input, output, args): if not args.raw: size = convert_size(size) - print_rank_0( - f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") + print_rank_0(f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") def run_all_gather(local_rank, args): @@ -84,22 +79,15 @@ def run_all_gather(local_rank, args): for M in M_LIST: global_rank = dist.get_rank() try: - mat = torch.ones(world_size, - M, - dtype=getattr( - torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) + mat = torch.ones(world_size, M, + dtype=getattr(torch, args.dtype)).to(get_accelerator().device_name(local_rank)) sync_all() input = ((mat.mul_(float(global_rank))).view(-1)) # Delete original mat to avoid OOM del mat get_accelerator().empty_cache() output = torch.zeros(input.nelement() * world_size, - dtype=getattr( - torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) + dtype=getattr(torch, args.dtype)).to(get_accelerator().device_name(local_rank)) except RuntimeError as e: if 'out of memory' in str(e): if dist.get_rank() == 0: @@ -110,41 +98,32 @@ def run_all_gather(local_rank, args): timed_all_gather(input, output, args) else: # all_gather_base saves memory - if (args.dist == 'torch' - and hasattr(torch.distributed, - "_all_gather_base")) or (args.dist == 'deepspeed' - and dist.has_allgather_base): + if (args.dist == 'torch' and hasattr(torch.distributed, "_all_gather_base")) or (args.dist == 'deepspeed' + and dist.has_allgather_base): mem_factor = args.mem_factor + 0.2 else: mem_factor = args.mem_factor # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor sync_all() elements_per_gpu = max_numel(comm_op='all_gather', - dtype=getattr(torch, - args.dtype), + dtype=getattr(torch, args.dtype), mem_factor=mem_factor, local_rank=local_rank, args=args) try: - mat = torch.ones(elements_per_gpu, - dtype=getattr(torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) + mat = torch.ones(elements_per_gpu, dtype=getattr(torch, + args.dtype)).to(get_accelerator().device_name(local_rank)) # multiply each GPU's tensor by the rank to ease debugging input = ((mat.mul_(float(global_rank))).view(-1)) # Delete original mat to avoid OOM del mat get_accelerator().empty_cache() - output = torch.zeros( - elements_per_gpu * world_size, - dtype=getattr(torch, - args.dtype)).to(get_accelerator().device_name(local_rank)) + output = torch.zeros(elements_per_gpu * world_size, + dtype=getattr(torch, args.dtype)).to(get_accelerator().device_name(local_rank)) except RuntimeError as e: if 'out of memory' in str(e): if dist.get_rank() == 0: - print( - 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!' - ) + print('WARNING: Ran out of GPU memory. 
Try to reduce the --mem-factor argument!') sync_all() return diff --git a/benchmarks/communication/all_reduce.py b/benchmarks/communication/all_reduce.py index edc1b9930..edd93e4a8 100644 --- a/benchmarks/communication/all_reduce.py +++ b/benchmarks/communication/all_reduce.py @@ -37,8 +37,7 @@ def timed_all_reduce(input, args): if not args.raw: size = convert_size(size) - print_rank_0( - f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") + print_rank_0(f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") def run_all_reduce(local_rank, args): @@ -63,12 +62,8 @@ def run_all_reduce(local_rank, args): for M in M_LIST: global_rank = dist.get_rank() try: - mat = torch.ones(world_size, - M, - dtype=getattr( - torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) + mat = torch.ones(world_size, M, + dtype=getattr(torch, args.dtype)).to(get_accelerator().device_name(local_rank)) sync_all() input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: @@ -83,23 +78,18 @@ def run_all_reduce(local_rank, args): # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor # Don't need output tensor, so we double mem_factor elements_per_gpu = max_numel(comm_op='all_reduce', - dtype=getattr(torch, - args.dtype), + dtype=getattr(torch, args.dtype), mem_factor=args.mem_factor * 2, local_rank=local_rank, args=args) try: - mat = torch.ones(elements_per_gpu, - dtype=getattr(torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) + mat = torch.ones(elements_per_gpu, dtype=getattr(torch, + args.dtype)).to(get_accelerator().device_name(local_rank)) input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: if 'out of memory' in str(e): if dist.get_rank() == 0: - print( - 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!' - ) + print('WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!') sync_all() return sync_all() diff --git a/benchmarks/communication/all_to_all.py b/benchmarks/communication/all_to_all.py index bd35cf290..af82480f9 100644 --- a/benchmarks/communication/all_to_all.py +++ b/benchmarks/communication/all_to_all.py @@ -37,8 +37,7 @@ def timed_all_to_all(input, output, args): if not args.raw: size = convert_size(size) - print_rank_0( - f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") + print_rank_0(f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") def run_all_to_all(local_rank, args): @@ -62,12 +61,8 @@ def run_all_to_all(local_rank, args): for M in M_LIST: global_rank = dist.get_rank() try: - mat = torch.ones(world_size, - M, - dtype=getattr( - torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) + mat = torch.ones(world_size, M, + dtype=getattr(torch, args.dtype)).to(get_accelerator().device_name(local_rank)) assert mat.numel() % world_size == 0, f"tensor cannot be divided in {world_size} chunks" sync_all() input = ((mat.mul_(float(global_rank))).view(-1)) @@ -83,31 +78,25 @@ def run_all_to_all(local_rank, args): else: # Send the biggest message size our GPUs can fit. 
If you're facing OOM errors, reduce the mem_factor elements_per_gpu = max_numel(comm_op='all_to_all', - dtype=getattr(torch, - args.dtype), + dtype=getattr(torch, args.dtype), mem_factor=args.mem_factor, local_rank=local_rank, args=args) try: - mat = torch.ones(elements_per_gpu, - dtype=getattr(torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) - assert mat.numel() % world_size == 0, f"tensor with {mat.numel()} elements cannot be divided in {world_size} chunks" + mat = torch.ones(elements_per_gpu, dtype=getattr(torch, + args.dtype)).to(get_accelerator().device_name(local_rank)) + assert mat.numel( + ) % world_size == 0, f"tensor with {mat.numel()} elements cannot be divided in {world_size} chunks" input = ((mat.mul_(float(global_rank))).view(-1)) # Delete original mat to avoid OOM del mat get_accelerator().empty_cache() - output = torch.zeros( - elements_per_gpu, - dtype=getattr(torch, - args.dtype)).to(get_accelerator().device_name(local_rank)) + output = torch.zeros(elements_per_gpu, + dtype=getattr(torch, args.dtype)).to(get_accelerator().device_name(local_rank)) except RuntimeError as e: if 'out of memory' in str(e): if dist.get_rank() == 0: - print( - 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!' - ) + print('WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!') sync_all() return sync_all() diff --git a/benchmarks/communication/broadcast.py b/benchmarks/communication/broadcast.py index 633e46638..921968649 100644 --- a/benchmarks/communication/broadcast.py +++ b/benchmarks/communication/broadcast.py @@ -38,8 +38,7 @@ def timed_broadcast(input, args): if not args.raw: size = convert_size(size) - print_rank_0( - f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") + print_rank_0(f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") def run_broadcast(local_rank, args): @@ -64,12 +63,8 @@ def run_broadcast(local_rank, args): for M in M_LIST: global_rank = dist.get_rank() try: - mat = torch.ones(world_size, - M, - dtype=getattr( - torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) + mat = torch.ones(world_size, M, + dtype=getattr(torch, args.dtype)).to(get_accelerator().device_name(local_rank)) sync_all() input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: @@ -84,23 +79,18 @@ def run_broadcast(local_rank, args): # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor # Don't need output tensor, so we double mem_factor elements_per_gpu = max_numel(comm_op='broadcast', - dtype=getattr(torch, - args.dtype), + dtype=getattr(torch, args.dtype), mem_factor=args.mem_factor * 2, local_rank=local_rank, args=args) try: - mat = torch.ones(elements_per_gpu, - dtype=getattr(torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) + mat = torch.ones(elements_per_gpu, dtype=getattr(torch, + args.dtype)).to(get_accelerator().device_name(local_rank)) input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: if 'out of memory' in str(e): if dist.get_rank() == 0: - print( - 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!' - ) + print('WARNING: Ran out of GPU memory. 
Try to reduce the --mem-factor argument!') sync_all() return sync_all() diff --git a/benchmarks/communication/pt2pt.py b/benchmarks/communication/pt2pt.py index 1c890fc42..1d443827b 100644 --- a/benchmarks/communication/pt2pt.py +++ b/benchmarks/communication/pt2pt.py @@ -56,8 +56,7 @@ def timed_pt2pt(input, args): if not args.raw: size = convert_size(size) - print_rank_0( - f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") + print_rank_0(f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") def run_pt2pt(local_rank, args): @@ -82,12 +81,8 @@ def run_pt2pt(local_rank, args): for M in M_LIST: global_rank = dist.get_rank() try: - mat = torch.ones(world_size, - M, - dtype=getattr( - torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) + mat = torch.ones(world_size, M, + dtype=getattr(torch, args.dtype)).to(get_accelerator().device_name(local_rank)) sync_all() input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: @@ -102,23 +97,18 @@ def run_pt2pt(local_rank, args): # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor # Don't need output tensor, so double mem_factor elements_per_gpu = max_numel(comm_op='pt2pt', - dtype=getattr(torch, - args.dtype), + dtype=getattr(torch, args.dtype), mem_factor=args.mem_factor * 2, local_rank=local_rank, args=args) try: - mat = torch.ones(elements_per_gpu, - dtype=getattr(torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) + mat = torch.ones(elements_per_gpu, dtype=getattr(torch, + args.dtype)).to(get_accelerator().device_name(local_rank)) input = ((mat.mul_(float(global_rank))).view(-1)) except RuntimeError as e: if 'out of memory' in str(e): if dist.get_rank() == 0: - print( - 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!' - ) + print('WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!') sync_all() return sync_all() diff --git a/benchmarks/communication/utils.py b/benchmarks/communication/utils.py index b913dda14..5f3e06cac 100644 --- a/benchmarks/communication/utils.py +++ b/benchmarks/communication/utils.py @@ -120,8 +120,7 @@ def max_numel(comm_op, dtype, mem_factor, local_rank, args): # Number of elements must be divisible by world_size # all_to_all performance is lower for non-powers of two. Round down like all_gather. 
elements_per_gpu = int(max_memory_per_gpu // dtype_size) - elements_per_gpu = int(dist.get_world_size() * - round(elements_per_gpu / dist.get_world_size())) + elements_per_gpu = int(dist.get_world_size() * round(elements_per_gpu / dist.get_world_size())) elements_per_gpu = int(pow(2, int(math.log(elements_per_gpu, 2)))) else: print(f"This communication operation: {comm_op} is not supported yet") @@ -162,59 +161,32 @@ def _element_size(dtype): def benchmark_parser(): parser = argparse.ArgumentParser() parser.add_argument("--local_rank", type=int) - parser.add_argument("--trials", - type=int, - default=DEFAULT_TRIALS, - help='Number of timed iterations') - parser.add_argument("--warmups", - type=int, - default=DEFAULT_WARMUPS, - help='Number of warmup (non-timed) iterations') - parser.add_argument("--maxsize", - type=int, - default=24, - help='Max message size as a power of 2') - parser.add_argument("--async-op", - action="store_true", - help='Enables non-blocking communication') - parser.add_argument("--bw-unit", - type=str, - default=DEFAULT_UNIT, - choices=['Gbps', - 'GBps']) + parser.add_argument("--trials", type=int, default=DEFAULT_TRIALS, help='Number of timed iterations') + parser.add_argument("--warmups", type=int, default=DEFAULT_WARMUPS, help='Number of warmup (non-timed) iterations') + parser.add_argument("--maxsize", type=int, default=24, help='Max message size as a power of 2') + parser.add_argument("--async-op", action="store_true", help='Enables non-blocking communication') + parser.add_argument("--bw-unit", type=str, default=DEFAULT_UNIT, choices=['Gbps', 'GBps']) parser.add_argument("--backend", type=str, default=DEFAULT_BACKEND, - choices=['nccl', - 'ccl'], + choices=['nccl', 'ccl'], help='Communication library to use') parser.add_argument("--dist", type=str, default=DEFAULT_DIST, - choices=['deepspeed', - 'torch'], + choices=['deepspeed', 'torch'], help='Distributed DL framework to use') - parser.add_argument("--scan", - action="store_true", - help='Enables scanning all message sizes') - parser.add_argument("--raw", - action="store_true", - help='Print the message size and latency without units') + parser.add_argument("--scan", action="store_true", help='Enables scanning all message sizes') + parser.add_argument("--raw", action="store_true", help='Print the message size and latency without units') parser.add_argument("--all-reduce", action="store_true", help='Run all_reduce') parser.add_argument("--all-gather", action="store_true", help='Run all_gather') parser.add_argument("--all-to-all", action="store_true", help='Run all_to_all') parser.add_argument("--pt2pt", action="store_true", help='Run pt2pt') parser.add_argument("--broadcast", action="store_true", help='Run broadcast') - parser.add_argument("--dtype", - type=str, - default=DEFAULT_TYPE, - help='PyTorch tensor dtype') - parser.add_argument( - "--mem-factor", - type=float, - default=.4, - help='Proportion of max available GPU memory to use for single-size evals') - parser.add_argument("--debug", - action="store_true", - help='Enables all_to_all debug prints') + parser.add_argument("--dtype", type=str, default=DEFAULT_TYPE, help='PyTorch tensor dtype') + parser.add_argument("--mem-factor", + type=float, + default=.4, + help='Proportion of max available GPU memory to use for single-size evals') + parser.add_argument("--debug", action="store_true", help='Enables all_to_all debug prints') return parser diff --git a/benchmarks/inference/collect_results.py b/benchmarks/inference/collect_results.py index 
0e5103311..e85879b78 100644 --- a/benchmarks/inference/collect_results.py +++ b/benchmarks/inference/collect_results.py @@ -13,21 +13,9 @@ parser.add_argument( default="./results", help="directory containing sweep results", ) -parser.add_argument("--version", - "-v", - type=int, - default=0, - help="version to be collected") -parser.add_argument("--gen-text-n", - "-n", - type=int, - default=1, - help="expected number of generated text") -parser.add_argument("--output", - "-o", - type=str, - default="./results.csv", - help="output file") +parser.add_argument("--version", "-v", type=int, default=0, help="version to be collected") +parser.add_argument("--gen-text-n", "-n", type=int, default=1, help="expected number of generated text") +parser.add_argument("--output", "-o", type=str, default="./results.csv", help="output file") args = parser.parse_args() @@ -107,9 +95,7 @@ if __name__ == "__main__": params = get_benchmark_params(args.results_dir, file_path) if not params: - print( - f"WARNING: Could not detect benchmark settings for file {file_path}, skipping" - ) + print(f"WARNING: Could not detect benchmark settings for file {file_path}, skipping") continue # Verify that the version matches that which we want to collect @@ -121,9 +107,7 @@ if __name__ == "__main__": perf_data = get_perf_data(file_content) if not perf_data: - print( - f"WARNING: Could not detect benchmark performance data for file {file_path}" - ) + print(f"WARNING: Could not detect benchmark performance data for file {file_path}") generated_text = get_generated_text(file_content, args.gen_text_n) if not generated_text: @@ -135,12 +119,7 @@ if __name__ == "__main__": benchmarks_data.append({"branch": branch, **params, **error}) continue - benchmarks_data.append({ - "branch": branch, - **params, - **perf_data, - **generated_text - }) + benchmarks_data.append({"branch": branch, **params, **perf_data, **generated_text}) # Convert to a DataFrame and save benchmarks_df = pd.DataFrame(benchmarks_data) diff --git a/benchmarks/inference/gpt-bench.py b/benchmarks/inference/gpt-bench.py index 29578b30c..d293b5e32 100644 --- a/benchmarks/inference/gpt-bench.py +++ b/benchmarks/inference/gpt-bench.py @@ -11,26 +11,12 @@ from deepspeed.accelerator import get_accelerator parser = argparse.ArgumentParser() parser.add_argument("--model", "-m", type=str, help="hf model name") parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference") -parser.add_argument("--dtype", - type=str, - default="fp16", - choices=["fp16", - "fp32", - "int8"], - help="int8, fp16, or fp32") +parser.add_argument("--dtype", type=str, default="fp16", choices=["fp16", "fp32", "int8"], help="int8, fp16, or fp32") parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on") parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on") parser.add_argument("--max-tokens", type=int, default=50, help="max new tokens") -parser.add_argument("--local_rank", - type=int, - default=int(os.getenv("LOCAL_RANK", - "0")), - help="local rank") -parser.add_argument("--world_size", - type=int, - default=int(os.getenv("WORLD_SIZE", - "1")), - help="world size") +parser.add_argument("--local_rank", type=int, default=int(os.getenv("LOCAL_RANK", "0")), help="local rank") +parser.add_argument("--world_size", type=int, default=int(os.getenv("WORLD_SIZE", "1")), help="world size") parser.add_argument("--trials", type=int, default=30, help="number of trials") args = parser.parse_args() @@ -81,10 +67,7 @@ elif args.dtype == "fp16": 
else: dtype = torch.float32 -pipe = pipeline("text-generation", - model=args.model, - framework="pt", - device=args.local_rank) +pipe = pipeline("text-generation", model=args.model, framework="pt", device=args.local_rank) if dtype == torch.float16: pipe.model.half() @@ -115,9 +98,7 @@ for i in range(args.trials): if args.local_rank == 0: print_latency(times, "(e2e) latency") print_latency(mtimes, "(model-only) latency") - print_latency(map(lambda t: t / (args.max_tokens - 3), - times), - "(e2e) per token latency") + print_latency(map(lambda t: t / (args.max_tokens - 3), times), "(e2e) per token latency") print(f"RESPONSE 0:") print("-" * 30) print(responses[0][0]["generated_text"]) diff --git a/bin/ds_elastic b/bin/ds_elastic index c9987d456..1c78aea88 100755 --- a/bin/ds_elastic +++ b/bin/ds_elastic @@ -9,11 +9,7 @@ from deepspeed.elasticity import compute_elastic_config if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-c', '--config', type=str, help="DeepSpeed config json") - parser.add_argument('-w', - '--world-size', - type=int, - default=0, - help="Intended/current world size") + parser.add_argument('-w', '--world-size', type=int, default=0, help="Intended/current world size") args = parser.parse_args() ds_config = json.load(open(args.config, 'r')) @@ -26,7 +22,9 @@ if __name__ == '__main__': print(json.dumps(elastic_config, indent=4, sort_keys=True)) if args.world_size > 0: - final_batch_size, valid_gpus, micro_batch_size = compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version, world_size=args.world_size) + final_batch_size, valid_gpus, micro_batch_size = compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version, + world_size=args.world_size) print('------------------------------------------') print(f"Calculated results for world size {args.world_size}:") print('------------------------------------------') diff --git a/csrc/aio/py_test/aio_bench_generate_param.py b/csrc/aio/py_test/aio_bench_generate_param.py index caa833f5f..9bc681e8c 100644 --- a/csrc/aio/py_test/aio_bench_generate_param.py +++ b/csrc/aio/py_test/aio_bench_generate_param.py @@ -14,13 +14,10 @@ from perf_sweep_utils import BENCH_LOG_DIR, READ_LOG_DIR, WRITE_LOG_DIR def parse_arguments(): parser = argparse.ArgumentParser() - parser.add_argument( - '--log_dir', - type=str, - default=BENCH_LOG_DIR, - help= - f'Folder of performance sweep logs. Default is {os.path.join(".", BENCH_LOG_DIR)}' - ) + parser.add_argument('--log_dir', + type=str, + default=BENCH_LOG_DIR, + help=f'Folder of performance sweep logs. 
Default is {os.path.join(".", BENCH_LOG_DIR)}') args = parser.parse_args() print(f'args = {args}') @@ -75,9 +72,7 @@ def generate_aio_param(read_log_dir, write_log_dir): optimal_config_read = read_results.get(read_perf_keys[optimal_key], None) optimal_config_write = write_results.get(write_perf_keys[optimal_key], None) - print( - f'Best performance (GB/sec): read = {optimal_config_read:5.2f}, write = {optimal_config_write:5.2f}' - ) + print(f'Best performance (GB/sec): read = {optimal_config_read:5.2f}, write = {optimal_config_write:5.2f}') print(json.dumps(aio_param, indent=3)) diff --git a/csrc/aio/py_test/aio_bench_perf_sweep.py b/csrc/aio/py_test/aio_bench_perf_sweep.py index eebea69b1..60a18e830 100644 --- a/csrc/aio/py_test/aio_bench_perf_sweep.py +++ b/csrc/aio/py_test/aio_bench_perf_sweep.py @@ -20,20 +20,16 @@ from deepspeed.ops.op_builder import AsyncIOBuilder OTHER_OPTIONS = '--handle' PERF_SCRIPT = 'test_ds_aio.py' DEFAULT_SWEEP_CONFIG = { - "block_size": ["128K", - "256K"], - "queue_depth": [4, - 16, - 32], - "overlap_events": [True, - False], - "io_parallel": [2, - 8], + "block_size": ["128K", "256K"], + "queue_depth": [4, 16, 32], + "overlap_events": [True, False], + "io_parallel": [2, 8], "single_submit": [False] } class Job(object): + def __init__(self, cmd_line, output_file=None, work_dir=None): self.cmd_line = cmd_line self.output_file = output_file @@ -63,6 +59,7 @@ class Job(object): class SweepConfig(object): + def __init__(self, args): self.nvme_dir = args.nvme_dir self.io_size = args.io_size @@ -78,52 +75,35 @@ class SweepConfig(object): def parse_arguments(): parser = argparse.ArgumentParser() - parser.add_argument( - '--nvme_dir', - required=True, - type=str, - help= - 'Directory in which to perform I/O tests. A writeable directory on a NVMe device.' - ) - - parser.add_argument('--sweep_config', + parser.add_argument('--nvme_dir', + required=True, type=str, - default=None, - help='Performance sweep configuration json file.') + help='Directory in which to perform I/O tests. A writeable directory on a NVMe device.') - parser.add_argument('--no_read', - action='store_true', - help='Disable read performance measurements.') + parser.add_argument('--sweep_config', type=str, default=None, help='Performance sweep configuration json file.') - parser.add_argument('--no_write', - action='store_true', - help='Disable write performance measurements.') + parser.add_argument('--no_read', action='store_true', help='Disable read performance measurements.') - parser.add_argument( - '--io_size', - type=str, - default="400M", - help='Number of I/O bytes to read/write for performance measurements.') + parser.add_argument('--no_write', action='store_true', help='Disable write performance measurements.') + + parser.add_argument('--io_size', + type=str, + default="400M", + help='Number of I/O bytes to read/write for performance measurements.') parser.add_argument( '--no_sudo', action='store_true', help= - 'Run without sudo access. Page cache will not be flushed and reported read speeds may be higher than actual.' - ) + 'Run without sudo access. Page cache will not be flushed and reported read speeds may be higher than actual.') parser.add_argument( '--log_dir', type=str, default=BENCH_LOG_DIR, - help= - f'Output directory for performance log files. Default is {os.path.join(".", BENCH_LOG_DIR)}' - ) + help=f'Output directory for performance log files. 
Default is {os.path.join(".", BENCH_LOG_DIR)}') - parser.add_argument('--loops', - type=int, - default=1, - help='Count of operation repetitions') + parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions') args = parser.parse_args() print(f'args = {args}') @@ -147,6 +127,7 @@ def get_sweep_config_dict(sweep_config_json): def get_sweep_cmd_lines(sweep_config_dict): + def flatten_options(key, value_list): flat_list = [] for v in value_list: @@ -170,11 +151,7 @@ def run_job(job): args = ' '.join(job.cmd()) print(f'args = {args}') job.open_output_file() - proc = subprocess.run(args=args, - shell=True, - stdout=job.get_stdout(), - stderr=job.get_stderr(), - cwd=job.get_cwd()) + proc = subprocess.run(args=args, shell=True, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd()) job.close_output_file() assert proc.returncode == 0, \ f"This command failed: {job.cmd()}" @@ -240,14 +217,7 @@ def get_log_file(io_op_desc, cmd_line): return tag_key return f'{tag_key}{value}' - tag_list = [ - SINGLE_SUBMIT, - OVERLAP_EVENTS, - THREAD_COUNT, - IO_PARALLEL, - QUEUE_DEPTH, - BLOCK_SIZE - ] + tag_list = [SINGLE_SUBMIT, OVERLAP_EVENTS, THREAD_COUNT, IO_PARALLEL, QUEUE_DEPTH, BLOCK_SIZE] log_tags = [io_op_desc] cmd_tags = create_cmd_tags(cmd_line) for tag in tag_list: @@ -298,16 +268,10 @@ def create_read_file(sweep_config): os.makedirs(read_folder, exist_ok=True) read_file_name = os.path.join(read_folder, f'random_{sweep_config.io_size}B.pt') block_size, block_count = get_block_size_and_count(refine_integer_value(sweep_config.io_size)) - dd_job = Job(cmd_line=[ - f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}' - ]) - print( - f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....' - ) + dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}']) + print(f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....') run_job(dd_job) - print( - f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....' 
- ) + print(f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....') return read_folder, read_file_name @@ -319,20 +283,15 @@ def remove_folder(folder): def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): read_folder, read_file_name = create_read_file(sweep_config) read_option = f'--read_file {read_file_name}' - read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd - for cmd in cmd_lines] + read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines] #dump_cmd_lines(read_cmd_lines) log_folder = os.path.join(sweep_config.log_dir, f'{READ_LOG_DIR}') os.makedirs(log_folder, exist_ok=True) - perf_jobs = create_perf_jobs(io_op_desc=READ_OP_DESC, - log_dir=log_folder, - cmd_lines=read_cmd_lines) + perf_jobs = create_perf_jobs(io_op_desc=READ_OP_DESC, log_dir=log_folder, cmd_lines=read_cmd_lines) - launch_sweep(sweep_jobs=perf_jobs, - sync_job=sync_job, - flush_cache_job=flush_cache_job) + launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job) remove_folder(read_folder) @@ -342,20 +301,15 @@ def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): os.makedirs(write_folder, exist_ok=True) write_file_name = os.path.join(write_folder, f'random_{sweep_config.io_size}B.pt') write_option = f'--write_size {sweep_config.io_size} --write_file {write_file_name}' - write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd - for cmd in cmd_lines] + write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines] #dump_cmd_lines(write_cmd_lines) log_folder = os.path.join(sweep_config.log_dir, f'{WRITE_LOG_DIR}') os.makedirs(log_folder, exist_ok=True) - perf_jobs = create_perf_jobs(io_op_desc=WRITE_OP_DESC, - log_dir=log_folder, - cmd_lines=write_cmd_lines) + perf_jobs = create_perf_jobs(io_op_desc=WRITE_OP_DESC, log_dir=log_folder, cmd_lines=write_cmd_lines) - launch_sweep(sweep_jobs=perf_jobs, - sync_job=sync_job, - flush_cache_job=flush_cache_job) + launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job) remove_folder(write_folder) @@ -376,10 +330,7 @@ def main(): cmd_lines = get_sweep_cmd_lines(sweep_config.search_space) if sweep_config.flush_cache: - flush_cache_job = Job( - cmd_line=['sudo', - 'bash -c', - "'echo 1 > /proc/sys/vm/drop_caches'"]) + flush_cache_job = Job(cmd_line=['sudo', 'bash -c', "'echo 1 > /proc/sys/vm/drop_caches'"]) else: flush_cache_job = None diff --git a/csrc/aio/py_test/ds_aio_basic.py b/csrc/aio/py_test/ds_aio_basic.py index d7f034ad9..66879b6ae 100755 --- a/csrc/aio/py_test/ds_aio_basic.py +++ b/csrc/aio/py_test/ds_aio_basic.py @@ -20,14 +20,8 @@ def pre_basic(args, tid, read_op): file = args.read_file if read_op else f'{args.write_file}.{tid}' task_log(tid, f'Allocate tensor of size {num_bytes} bytes') - buffer = get_accelerator().pin_memory( - torch.empty(num_bytes, - dtype=torch.uint8, - device='cpu')) - task_log( - tid, - f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}' - ) + buffer = get_accelerator().pin_memory(torch.empty(num_bytes, dtype=torch.uint8, device='cpu')) + task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}') ctxt = {} ctxt['file'] = file @@ -60,13 +54,8 @@ def post_basic(pool_params): def main_basic_read(pool_params): args, tid, ctxt = pool_params start_time = time.time() - AsyncIOBuilder().load().aio_read(ctxt['buffer'], - ctxt['file'], - 
args.block_size, - args.queue_depth, - args.single_submit, - args.overlap_events, - args.validate) + AsyncIOBuilder().load().aio_read(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth, + args.single_submit, args.overlap_events, args.validate) end_time = time.time() ctxt['elapsed_sec'] += end_time - start_time @@ -76,13 +65,8 @@ def main_basic_read(pool_params): def main_basic_write(pool_params): args, tid, ctxt = pool_params start_time = time.time() - AsyncIOBuilder().load().aio_write(ctxt['buffer'], - ctxt['file'], - args.block_size, - args.queue_depth, - args.single_submit, - args.overlap_events, - args.validate) + AsyncIOBuilder().load().aio_write(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth, + args.single_submit, args.overlap_events, args.validate) end_time = time.time() ctxt['elapsed_sec'] += end_time - start_time diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py index 7f0e44779..96a72d24f 100755 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -20,27 +20,17 @@ def pre_handle(args, tid, read_op): file = args.read_file if read_op else f'{args.write_file}.{tid}' io_parallel = args.io_parallel if args.io_parallel else 1 - handle = AsyncIOBuilder().load().aio_handle(args.block_size, - args.queue_depth, - args.single_submit, - args.overlap_events, - io_parallel) + handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, + args.overlap_events, io_parallel) task_log(tid, f'Created deepspeed aio handle') if args.gpu: - buffer = torch.empty(num_bytes, - dtype=torch.uint8, - device=get_accelerator().device_name()) + buffer = torch.empty(num_bytes, dtype=torch.uint8, device=get_accelerator().device_name()) else: if args.use_accelerator_pin_memory: - buffer = get_accelerator().pin_memory( - torch.empty(num_bytes, - dtype=torch.uint8, - device='cpu')) + buffer = get_accelerator().pin_memory(torch.empty(num_bytes, dtype=torch.uint8, device='cpu')) else: - buffer = handle.new_cpu_locked_tensor(num_bytes, - torch.empty(0, - dtype=torch.uint8)) + buffer = handle.new_cpu_locked_tensor(num_bytes, torch.empty(0, dtype=torch.uint8)) task_log(tid, f'Allocate tensor of size {num_bytes} bytes') @@ -51,10 +41,7 @@ def pre_handle(args, tid, read_op): ctxt['buffer'] = buffer ctxt['elapsed_sec'] = 0 - task_log( - tid, - f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}' - ) + task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}') return ctxt diff --git a/csrc/aio/py_test/parse_aio_stats.py b/csrc/aio/py_test/parse_aio_stats.py index 2a3e64944..3027af8e7 100755 --- a/csrc/aio/py_test/parse_aio_stats.py +++ b/csrc/aio/py_test/parse_aio_stats.py @@ -19,10 +19,7 @@ METRIC_SEARCH = {READ_SPEED: 'E2E Read Speed', WRITE_SPEED: 'E2E Write Speed'} def parse_arguments(): parser = argparse.ArgumentParser() - parser.add_argument('--log_dir', - type=str, - required=True, - help='Folder of statistics logs') + parser.add_argument('--log_dir', type=str, required=True, help='Folder of statistics logs') parser.add_argument('--metric', type=str, @@ -125,10 +122,7 @@ def get_results(log_files, metric): def get_sorted_results(log_dir, metric): - log_files = [ - f for f in os.listdir(log_dir) if os.path.isfile(os.path.join(log_dir, - f)) - ] + log_files = [f for f in os.listdir(log_dir) if os.path.isfile(os.path.join(log_dir, f))] log_files_path = [os.path.join(log_dir, f) for f in log_files] results = 
get_results(log_files_path, metric) diff --git a/csrc/aio/py_test/test_ds_aio.py b/csrc/aio/py_test/test_ds_aio.py index 7cb737d68..384f4e3aa 100755 --- a/csrc/aio/py_test/test_ds_aio.py +++ b/csrc/aio/py_test/test_ds_aio.py @@ -20,46 +20,29 @@ def parse_arguments(): parser.add_argument('--write_file', type=str, default=None, help='Write file.') - parser.add_argument('--write_size', - type=str, - default=None, - help='Number of bytes to write.') + parser.add_argument('--write_size', type=str, default=None, help='Number of bytes to write.') parser.add_argument('--block_size', type=str, default='1M', help='I/O block size.') parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.') - parser.add_argument('--threads', - type=int, - default=1, - help='Thread parallelism count.') + parser.add_argument('--threads', type=int, default=1, help='Thread parallelism count.') - parser.add_argument( - '--single_submit', - action='store_true', - help= - 'Submit I/O requests in singles (default is submit queue_depth amount at once.).' - ) + parser.add_argument('--single_submit', + action='store_true', + help='Submit I/O requests in singles (default is submit queue_depth amount at once.).') parser.add_argument('--overlap_events', action='store_true', help='Overlap I/O submission and completion requests.') - parser.add_argument('--validate', - action='store_true', - help='Perform validation in library.') + parser.add_argument('--validate', action='store_true', help='Perform validation in library.') parser.add_argument('--handle', action='store_true', help='Use AIO handle.') - parser.add_argument('--loops', - type=int, - default=1, - help='Count of operation repetitions') + parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions') - parser.add_argument('--io_parallel', - type=int, - default=None, - help='Per iop parallelism') + parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism') parser.add_argument('--gpu', action='store_true', help='Use GPU memory') diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index 9e2e25513..d70a7434a 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -51,12 +51,10 @@ __git_branch__ = git_branch def initialize(args=None, model: torch.nn.Module = None, - optimizer: Optional[Union[Optimizer, - DeepSpeedOptimizerCallable]] = None, + optimizer: Optional[Union[Optimizer, DeepSpeedOptimizerCallable]] = None, model_parameters: Optional[torch.nn.Module] = None, training_data: Optional[torch.utils.data.Dataset] = None, - lr_scheduler: Optional[Union[_LRScheduler, - DeepSpeedSchedulerCallable]] = None, + lr_scheduler: Optional[Union[_LRScheduler, DeepSpeedSchedulerCallable]] = None, mpu=None, dist_init_required: Optional[bool] = None, collate_fn=None, @@ -110,10 +108,8 @@ def initialize(args=None, * ``lr_scheduler``: Wrapped lr scheduler if user ``lr_scheduler`` is passed, or if ``lr_scheduler`` specified in JSON configuration. Otherwise ``None``. 
""" - log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format( - __version__, - __git_hash__, - __git_branch__), + log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format(__version__, __git_hash__, + __git_branch__), ranks=[0]) # Disable zero.Init context if it's currently enabled @@ -147,12 +143,7 @@ def initialize(args=None, config=config, config_params=config_params) - return_items = [ - engine, - engine.optimizer, - engine.training_dataloader, - engine.lr_scheduler - ] + return_items = [engine, engine.optimizer, engine.training_dataloader, engine.lr_scheduler] return tuple(return_items) @@ -171,38 +162,28 @@ def _add_core_arguments(parser): """ group = parser.add_argument_group('DeepSpeed', 'DeepSpeed configurations') - group.add_argument( - '--deepspeed', - default=False, - action='store_true', - help= - 'Enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)') + group.add_argument('--deepspeed', + default=False, + action='store_true', + help='Enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)') - group.add_argument('--deepspeed_config', - default=None, - type=str, - help='DeepSpeed json configuration file.') + group.add_argument('--deepspeed_config', default=None, type=str, help='DeepSpeed json configuration file.') - group.add_argument( - '--deepscale', - default=False, - action='store_true', - help= - 'Deprecated enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)' - ) + group.add_argument('--deepscale', + default=False, + action='store_true', + help='Deprecated enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)') group.add_argument('--deepscale_config', default=None, type=str, help='Deprecated DeepSpeed json configuration file.') - group.add_argument( - '--deepspeed_mpi', - default=False, - action='store_true', - help= - "Run via MPI, this will attempt to discover the necessary variables to initialize torch " - "distributed from the MPI environment") + group.add_argument('--deepspeed_mpi', + default=False, + action='store_true', + help="Run via MPI, this will attempt to discover the necessary variables to initialize torch " + "distributed from the MPI environment") return parser @@ -278,10 +259,8 @@ def init_inference(model, config=None, **kwargs): Returns: A deepspeed.InferenceEngine wrapped model. 
""" - log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format( - __version__, - __git_hash__, - __git_branch__), + log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format(__version__, __git_hash__, + __git_branch__), ranks=[0]) # Load config_dict from config first @@ -293,17 +272,14 @@ def init_inference(model, config=None, **kwargs): elif isinstance(config, dict): config_dict = config else: - raise ValueError( - f"'config' argument expected string or dictionary, got {type(config)}") + raise ValueError(f"'config' argument expected string or dictionary, got {type(config)}") # Update with values from kwargs, ensuring no conflicting overlap between config and kwargs overlap_keys = set(config_dict.keys()).intersection(kwargs.keys()) # If there is overlap, error out if values are different for key in overlap_keys: if config_dict[key] != kwargs[key]: - raise ValueError( - f"Conflicting argument '{key}' in 'config':{config_dict[key]} and kwargs:{kwargs[key]}" - ) + raise ValueError(f"Conflicting argument '{key}' in 'config':{config_dict[key]} and kwargs:{kwargs[key]}") config_dict.update(kwargs) ds_inference_config = DeepSpeedInferenceConfig(**config_dict) diff --git a/deepspeed/autotuning/autotuner.py b/deepspeed/autotuning/autotuner.py index 786597273..1711526b1 100755 --- a/deepspeed/autotuning/autotuner.py +++ b/deepspeed/autotuning/autotuner.py @@ -40,6 +40,7 @@ class Autotuner: """The DeepSpeed Autotuner automatically discovers the optimal DeepSpeed configuration that delivers good training speed. The Autotuner uses model information, system information, and heuristics to efficiently tune system knobs that affect compute and memory efficiencies, such as ZeRO optimization stages, micro-batch sizes, and many other ZeRO optimization configurations. It not only reduces the time and resources user spend on tuning, but also can discover configurations better than hand-tuned methods. Autotuning with DeepSpeed requires no code change from DeepSpeed users. Please refer to the README for usage details. """ + def __init__(self, args, active_resources): self.args = args self.selected_exp_dir = None @@ -92,7 +93,8 @@ class Autotuner: assert self.exp_num_gpus <= self.rm.num_gpus_per_node, "num_gpus in the autotuning configuration must not be less than the --num_gpus value in the train script if any" assert self.exp_num_nodes <= len( - self.rm.nodes), "num_nodes in the autotuning configuration must not be less than the --num_nodes value in the train script if any" + self.rm.nodes + ), "num_nodes in the autotuning configuration must not be less than the --num_nodes value in the train script if any" self.records = {} self.optimal_cmd = None @@ -125,18 +127,10 @@ class Autotuner: row.append(val[0]['name']) tab.append(row) summary = tabulate(tab, - headers=[ - "tuning_space", - "num_experiments", - "best_metric_val", - "best_exp_name" - ], + headers=["tuning_space", "num_experiments", "best_metric_val", "best_exp_name"], tablefmt="pipe") print(summary) - with open(os.path.join(self.results_dir, - 'summary.txt'), - 'w', - buffering=BUFSIZE) as fd: + with open(os.path.join(self.results_dir, 'summary.txt'), 'w', buffering=BUFSIZE) as fd: fd.write(summary) fd.flush() os.fsync(fd) @@ -148,9 +142,7 @@ class Autotuner: f"{best_exp['name']} is the optimal setup after tuning. The exp result is at {best_exp['result_dir']}." ) else: - logger.info( - f"No optimal setup is found. Please check that experiments were run successfully." - ) + logger.info(f"No optimal setup is found. 
Please check that experiments were run successfully.") tuning_duration = datetime.timedelta(seconds=(time.time() - self.start_time)) logger.info(f"Tuning completed in {tuning_duration}") @@ -172,8 +164,8 @@ class Autotuner: user_config_file = None if "--deepspeed_config" in user_args: idx = user_args.index("--deepspeed_config") - assert ".json" in user_args[idx + - 1], "DeepSpeed --deepspeed_config requires a json file to specify the configuration" + assert ".json" in user_args[ + idx + 1], "DeepSpeed --deepspeed_config requires a json file to specify the configuration" user_config_file = user_args[idx + 1] elif "--deepspeed" in user_args: @@ -183,15 +175,10 @@ class Autotuner: logger.debug(f"user_config_file = {user_config_file}") if user_config_file is not None: - assert os.path.isfile( - user_config_file - ), "DeepSpeed configuration file: {} is not an existing file".format( - user_config_file - ) + assert os.path.isfile(user_config_file), "DeepSpeed configuration file: {} is not an existing file".format( + user_config_file) if os.path.exists(user_config_file): - return json.load(open(user_config_file, - "r"), - object_pairs_hook=dict_raise_error_on_duplicate_keys) + return json.load(open(user_config_file, "r"), object_pairs_hook=dict_raise_error_on_duplicate_keys) return None @@ -258,13 +245,11 @@ class Autotuner: return self.autotuning_config.mp_size def max_train_micro_batch_size_per_gpu(self): - if self.max_train_batch_size() and self.max_train_batch_size( - ) > 0: # if the user specifies a max_train_batch_size - max_train_micro_batch_size = self.max_train_batch_size() * self.mp_size( - ) // (self.exp_num_gpus * self.exp_num_nodes - ) # gradient accumulation steps >=1 - return min(self.autotuning_config.max_train_micro_batch_size_per_gpu, - max_train_micro_batch_size) + if self.max_train_batch_size( + ) and self.max_train_batch_size() > 0: # if the user specifies a max_train_batch_size + max_train_micro_batch_size = self.max_train_batch_size() * self.mp_size() // ( + self.exp_num_gpus * self.exp_num_nodes) # gradient accumulation steps >=1 + return min(self.autotuning_config.max_train_micro_batch_size_per_gpu, max_train_micro_batch_size) else: return self.autotuning_config.max_train_micro_batch_size_per_gpu @@ -361,19 +346,14 @@ class Autotuner: if model_info and "hidden_size" in model_info: hs = model_info["hidden_size"] template_config[ZERO_OPTIMIZATION]['reduce_bucket_size'] = hs * hs - template_config[ZERO_OPTIMIZATION][ - 'stage3_prefetch_bucket_size'] = 0.9 * hs * hs - template_config[ZERO_OPTIMIZATION][ - 'stage3_param_persistence_threshold'] = 10 * hs + template_config[ZERO_OPTIMIZATION]['stage3_prefetch_bucket_size'] = 0.9 * hs * hs + template_config[ZERO_OPTIMIZATION]['stage3_param_persistence_threshold'] = 10 * hs prefix = "z3_" else: return exps # replace the corresponding parameter values if the user specifies them in the DeepSpeed configuration file - replace_dict(tuning_space, - self.user_config, - [ZERO_OPTIMIZATION, - TRAIN_MICRO_BATCH_SIZE_PER_GPU]) + replace_dict(tuning_space, self.user_config, [ZERO_OPTIMIZATION, TRAIN_MICRO_BATCH_SIZE_PER_GPU]) logger.debug(f"tuning_space = {json.dumps(tuning_space)}") @@ -397,11 +377,9 @@ class Autotuner: # if the config does not use offloading, remove the offloading section config_zero = config.get(ZERO_OPTIMIZATION, None) if config_zero: - if OFFLOAD_OPTIMIZER not in config_zero and OFFLOAD_OPTIMIZER in exp_config[ - ZERO_OPTIMIZATION]: + if OFFLOAD_OPTIMIZER not in config_zero and OFFLOAD_OPTIMIZER in 
exp_config[ZERO_OPTIMIZATION]: del exp_config[ZERO_OPTIMIZATION][OFFLOAD_OPTIMIZER] - if OFFLOAD_PARAM not in config_zero and OFFLOAD_PARAM in exp_config[ - ZERO_OPTIMIZATION]: + if OFFLOAD_PARAM not in config_zero and OFFLOAD_PARAM in exp_config[ZERO_OPTIMIZATION]: del exp_config[ZERO_OPTIMIZATION][OFFLOAD_PARAM] # set gradient accumulation steps according to max_train_batch_size_per_gpu mbs = exp_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] @@ -438,13 +416,10 @@ class Autotuner: else: return - logger.info( - f"The model has {number_to_string(self.get_model_num_params())} parameters.") + logger.info(f"The model has {number_to_string(self.get_model_num_params())} parameters.") self.gpu_mem = self.get_gpu_memory_info() - logger.info( - f"Memory per GPU in the system is {memory_to_string(self.gpu_mem, postfix='B')}." - ) + logger.info(f"Memory per GPU in the system is {memory_to_string(self.gpu_mem, postfix='B')}.") self.activation_mem = self.get_activation_memory_per_gpu() logger.info( @@ -452,9 +427,7 @@ class Autotuner: ) #TODO: FIX THIS - stage = self.user_config.get(ZERO_OPTIMIZATION, - {}).get(ZERO_OPTIMIZATION_STAGE, - "all") + stage = self.user_config.get(ZERO_OPTIMIZATION, {}).get(ZERO_OPTIMIZATION_STAGE, "all") stage = "all" user_zero_stages = [stage] if not isinstance(stage, list) else stage logger.info(f"User-defined zero stages are {stage}.") @@ -463,15 +436,13 @@ class Autotuner: max_mbs = 0 metric_val = 0 - required_gpu_mem = self.get_instantiation_memory_required_per_gpu( - ZeroStageEnum.disabled) + self.activation_mem + required_gpu_mem = self.get_instantiation_memory_required_per_gpu(ZeroStageEnum.disabled) + self.activation_mem if self.gpu_mem > required_gpu_mem: if "all" in user_zero_stages or ZeroStageEnum.disabled in user_zero_stages: logger.info( f"The model might be runable with ZERO 0 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1), adding DEFAULT_TUNING_SPACE_ZERO_0 to the global tuning space" ) - next_max_mbs, next_mbs, next_metric_val = self.tune_space( - DEFAULT_TUNING_SPACE_ZERO_0) + next_max_mbs, next_mbs, next_metric_val = self.tune_space(DEFAULT_TUNING_SPACE_ZERO_0) if next_mbs > mbs: mbs = next_mbs max_mbs = next_max_mbs @@ -490,8 +461,10 @@ class Autotuner: logger.info( f"The model might be runable with ZERO 1 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_1 to the global tuning space" ) - next_max_mbs, next_mbs, next_metric_val = self.tune_space( - DEFAULT_TUNING_SPACE_ZERO_1, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val) + next_max_mbs, next_mbs, next_metric_val = self.tune_space(DEFAULT_TUNING_SPACE_ZERO_1, + prev_max_mbs=max_mbs, + prev_best_mbs=mbs, + prev_best_metric_val=metric_val) if next_mbs > mbs: mbs = next_mbs max_mbs = next_max_mbs @@ -510,8 +483,10 @@ class Autotuner: logger.info( f"The model might be runable with ZERO 2 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_2 to the global tuning space" ) - next_max_mbs, next_mbs, next_metric_val = self.tune_space( - DEFAULT_TUNING_SPACE_ZERO_2, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val) + next_max_mbs, next_mbs, next_metric_val = self.tune_space(DEFAULT_TUNING_SPACE_ZERO_2, + prev_max_mbs=max_mbs, + prev_best_mbs=mbs, + prev_best_metric_val=metric_val) if next_mbs > mbs: mbs = next_mbs max_mbs = next_max_mbs @@ -523,15 +498,16 @@ class Autotuner: f"The model 
is not runable with ZERO stage {ZeroStageEnum.gradients} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)" ) - required_gpu_mem = self.get_instantiation_memory_required_per_gpu( - ZeroStageEnum.weights) + self.activation_mem + required_gpu_mem = self.get_instantiation_memory_required_per_gpu(ZeroStageEnum.weights) + self.activation_mem if self.gpu_mem > required_gpu_mem: if "all" in user_zero_stages or ZeroStageEnum.weights in user_zero_stages: logger.info( f"The model might be runable with ZERO 3 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_3 to the global tuning space" ) - _, _, next_metric_val = self.tune_space( - DEFAULT_TUNING_SPACE_ZERO_3, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val) + _, _, next_metric_val = self.tune_space(DEFAULT_TUNING_SPACE_ZERO_3, + prev_max_mbs=max_mbs, + prev_best_mbs=mbs, + prev_best_metric_val=metric_val) if has_mlflow: mlflow.log_metric(f"z3{self.metric()}", next_metric_val) else: @@ -542,11 +518,7 @@ class Autotuner: if has_mlflow: mlflow.end_run() - def tune_space(self, - tuning_space, - prev_max_mbs=0, - prev_best_mbs=0, - prev_best_metric_val=0): + def tune_space(self, tuning_space, prev_max_mbs=0, prev_best_mbs=0, prev_best_metric_val=0): config_zero = tuning_space.get(ZERO_OPTIMIZATION, {}) stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, None) tuning_space_name = TUNING_MICRO_BATCH_SIZE_PREFIX + str(stage) @@ -557,26 +529,20 @@ class Autotuner: # calculate max micro batch size using gpu memory, model instantiation memory and activation memory # calculated_max_micro_batch_size = (memory_per_gpu - instantiation_memory) // activation_memory_micro_batch_size_1 calculated_max_micro_batch_size = int( - self.gpu_mem - - self.get_instantiation_memory_required_per_gpu(stage)) // self.activation_mem + self.gpu_mem - self.get_instantiation_memory_required_per_gpu(stage)) // self.activation_mem logger.info( f"Start tuning for space {tuning_space_name}, calculated_max_micro_batch_size = {calculated_max_micro_batch_size}" ) if calculated_max_micro_batch_size < prev_max_mbs: - logger.info( - f"No need to tune Zero stage {stage}. End tuning for space {tuning_space_name}" - ) + logger.info(f"No need to tune Zero stage {stage}. End tuning for space {tuning_space_name}") return 0, 0, 0 if TRAIN_MICRO_BATCH_SIZE_PER_GPU in self.user_config and isinstance( - self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU], - list): + self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU], list): # user-specified micro batch size per gpu is a list which overwrites the default tuning behavior tuning_micro_batch_sizes = [ - s for s in self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] - if isinstance(s, - int) + s for s in self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] if isinstance(s, int) ] gas = self.get_gas_from_user_config() min_micro_batch_size = min(tuning_micro_batch_sizes) @@ -589,9 +555,7 @@ class Autotuner: stage, prev_max_mbs, calculated_max_micro_batch_size) if max_micro_batch_size < prev_max_mbs: - logger.info( - f"No need to tune Zero stage {stage}. End tuning for space {tuning_space_name}" - ) + logger.info(f"No need to tune Zero stage {stage}. 
End tuning for space {tuning_space_name}") return 0, 0, 0 tuning_micro_batch_sizes, max_train_batch_size_per_gpu = self.get_tuning_micro_batch_size_list( @@ -609,19 +573,15 @@ class Autotuner: return 0, 0, 0 # tune micro batch sizes and gradient accumulation steps given max_train_batch_size_per_gpu - tuning_micro_batch_sizes = self.run_tuning_micro_batch_sizes( - tuning_micro_batch_sizes, - max_train_batch_size_per_gpu, - min_micro_batch_size, - stage, - tuning_micro_batch_sizes_overwritten) + tuning_micro_batch_sizes = self.run_tuning_micro_batch_sizes(tuning_micro_batch_sizes, + max_train_batch_size_per_gpu, + min_micro_batch_size, stage, + tuning_micro_batch_sizes_overwritten) fast_best_record = self.get_best_space_record(tuning_space_name) fast_best_metric_val = fast_best_record[1] if fast_best_record else 0 - fast_best_mbs = fast_best_record[0][DS_CONFIG][ - TRAIN_MICRO_BATCH_SIZE_PER_GPU] if fast_best_record else 0 - logger.info( - f"fast_best_mbs = {fast_best_mbs}, name = {fast_best_record[0]['name']}") + fast_best_mbs = fast_best_record[0][DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU] if fast_best_record else 0 + logger.info(f"fast_best_mbs = {fast_best_mbs}, name = {fast_best_record[0]['name']}") if self.fast_enabled() or stage == 0: logger.info(f"End tuning for space: {tuning_space_name}") @@ -631,8 +591,7 @@ class Autotuner: if stage > 0: if fast_best_mbs <= prev_best_mbs or fast_best_metric_val < prev_best_metric_val: logger.info( - f"End tuning for space: {tuning_space_name}. No need to tune other Zero configuration parameters." - ) + f"End tuning for space: {tuning_space_name}. No need to tune other Zero configuration parameters.") return max_micro_batch_size, fast_best_mbs, fast_best_metric_val tuning_space[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = tuning_micro_batch_sizes @@ -654,8 +613,7 @@ class Autotuner: else: t = GridSearchTuner(exps, self.rm, self.metric()) - sample_size = len(self.rm.nodes) * self.rm.num_gpus_per_node // ( - self.exp_num_gpus * self.exp_num_nodes) + sample_size = len(self.rm.nodes) * self.rm.num_gpus_per_node // (self.exp_num_gpus * self.exp_num_nodes) num_exps = t.tune(sample_size=sample_size, n_trials=self.autotuning_config.tuner_num_trials, early_stopping=self.autotuning_config.tuner_early_stopping) @@ -669,8 +627,7 @@ class Autotuner: if full_best_metric_val > fast_best_metric_val: best_metric_val = full_best_metric_val - best_mbs = full_best_record[0][DS_CONFIG][ - TRAIN_MICRO_BATCH_SIZE_PER_GPU] if full_best_record else -1 + best_mbs = full_best_record[0][DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU] if full_best_record else -1 else: best_metric_val = fast_best_metric_val best_mbs = fast_best_mbs @@ -682,9 +639,7 @@ class Autotuner: if tuning_space_name not in self.records: return 0 space_records = self.records[tuning_space_name] - sorted_space_records = sorted( - space_records, - key=lambda x: x[0][DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU]) + sorted_space_records = sorted(space_records, key=lambda x: x[0][DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU]) prev_metric_val = None prev_micro_batch_size = 0 for (exp, metric_val, _) in sorted_space_records: @@ -692,8 +647,7 @@ class Autotuner: if metric_val < prev_metric_val: break if (metric_val >= prev_metric_val - and (metric_val - prev_metric_val) / prev_metric_val < - METRIC_PERCENT_DIFF_CONST): + and (metric_val - prev_metric_val) / prev_metric_val < METRIC_PERCENT_DIFF_CONST): break prev_metric_val = metric_val prev_micro_batch_size = exp[DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU] @@ -718,16 +672,8 @@ class 
Autotuner: ds_config = copy.deepcopy(self.user_config) replace_dict(ds_config, DEFAULT_MIN_MEM_CONFIG) - model_info_path = os.path.join(self.results_dir, - "profile_model_info", - "model_info.json") - ds_config[AUTOTUNING] = { - "enabled": True, - "model_info_path": model_info_path, - "model_info": { - "profile": True - } - } + model_info_path = os.path.join(self.results_dir, "profile_model_info", "model_info.json") + ds_config[AUTOTUNING] = {"enabled": True, "model_info_path": model_info_path, "model_info": {"profile": True}} exp_config = {} exp_name = "profile_model_info" @@ -748,8 +694,7 @@ class Autotuner: for exp_id, (exp_json, err) in self.rm.finished_experiments.items(): self.rm.clear() if err: - logger.error( - f"The model is not runnable with DeepSpeed with error = {err}") + logger.error(f"The model is not runnable with DeepSpeed with error = {err}") return None if os.path.exists(model_info_path): @@ -790,12 +735,8 @@ class Autotuner: best_space_records[GLOBAL_TUNING_SPACE] = global_best_record return best_space_records - def run_tuning_micro_batch_sizes(self, - tuning_micro_batch_sizes, - max_train_batch_size_per_gpu, - min_micro_batch_size, - stage, - tuning_micro_batch_sizes_overwritten): + def run_tuning_micro_batch_sizes(self, tuning_micro_batch_sizes, max_train_batch_size_per_gpu, + min_micro_batch_size, stage, tuning_micro_batch_sizes_overwritten): assert tuning_micro_batch_sizes, "the tuning micro batch size list is empty" tuning_micro_batch_sizes.sort() max_micro_batch_size = tuning_micro_batch_sizes[-1] @@ -838,8 +779,7 @@ class Autotuner: results = hjson.load(f) metric_val = results[self.metric()] self.update_records(tuning_space_name, exp, metric_val, 1) - if max_micro_batch_size == exp[DS_CONFIG][ - TRAIN_MICRO_BATCH_SIZE_PER_GPU]: + if max_micro_batch_size == exp[DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU]: max_micro_batch_size_metric_val = metric_val if has_mlflow: os.environ.pop('MLFLOW_RUN_ID') @@ -862,9 +802,8 @@ class Autotuner: # in a auto-detected tuning_micro_batch_sizs list, max_micro_batch_size might not be performant as the memory consumption is close to max # try smaller values while gas stays the same # if finding a more performant mbs value, use it to replace max_micro_batch_size in the list - min_micro_batch_size_with_same_gas = ( - tuning_micro_batch_sizes[-2] + - 1) if len(tuning_micro_batch_sizes) > 1 else min_micro_batch_size + min_micro_batch_size_with_same_gas = (tuning_micro_batch_sizes[-2] + + 1) if len(tuning_micro_batch_sizes) > 1 else min_micro_batch_size prev_best_metric_val = max_micro_batch_size_metric_val prev_best_mbs = max_micro_batch_size @@ -872,10 +811,7 @@ class Autotuner: stride = (max_micro_batch_size - min_micro_batch_size_with_same_gas) // 3 if stride == 0: stride = 1 - for mbs in reversed( - range(min_micro_batch_size_with_same_gas, - max_micro_batch_size, - stride)): + for mbs in reversed(range(min_micro_batch_size_with_same_gas, max_micro_batch_size, stride)): ds_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = mbs gas = max_train_batch_size_per_gpu // mbs ds_config[GRADIENT_ACCUMULATION_STEPS] = gas @@ -908,10 +844,7 @@ class Autotuner: tuning_micro_batch_sizes[-1] = prev_best_mbs return tuning_micro_batch_sizes - def get_min_max_micro_batch_size(self, - stage, - min_micro_batch_size, - calculated_max_micro_batch_size): + def get_min_max_micro_batch_size(self, stage, min_micro_batch_size, calculated_max_micro_batch_size): # get min and max micro batch size with gradient accumulation steps = 1 if min_micro_batch_size > 
calculated_max_micro_batch_size: return -1, -1 @@ -927,8 +860,7 @@ class Autotuner: # search for the min micro batch size if min_micro_batch_size < 1: if TRAIN_MICRO_BATCH_SIZE_PER_GPU in self.user_config and isinstance( - self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU], - int): + self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU], int): # user specifies train_micro_batch_size_per_gpu as an int mbs = int(self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU]) else: @@ -951,8 +883,7 @@ class Autotuner: min_micro_batch_size = mbs else: self.update_records(tuning_space_name, exp, 0, 1) - logger.info( - f"User-specified micro batch size per GPU {mbs} does not run") + logger.info(f"User-specified micro batch size per GPU {mbs} does not run") if self.min_train_micro_batch_size_per_gpu() == mbs: return -1, -1 mbs = self.min_train_micro_batch_size_per_gpu() @@ -964,8 +895,7 @@ class Autotuner: exp, metric_val = self.run_ds_config(ds_config, exp_name) if not metric_val: self.update_records(tuning_space_name, exp, 0, 1) - logger.info( - f"min_train_micro_batch_size_per_gpu {mbs} is not runnable.") + logger.info(f"min_train_micro_batch_size_per_gpu {mbs} is not runnable.") return -1, -1 self.update_records(tuning_space_name, exp, metric_val, 1) min_micro_batch_size = mbs @@ -975,8 +905,7 @@ class Autotuner: ds_config[GRADIENT_ACCUMULATION_STEPS] = gas ds_config[TRAIN_BATCH_SIZE] = min_micro_batch_size * gas * \ self.exp_num_gpus * self.exp_num_nodes // self.mp_size() - exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str( - min_micro_batch_size) + exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(min_micro_batch_size) exp, metric_val = self.run_ds_config(ds_config, exp_name) if metric_val: self.update_records(tuning_space_name, exp, metric_val, 1) @@ -986,13 +915,8 @@ class Autotuner: return -1, -1 # search for the max micro batch size - max_micro_batch_size = min(calculated_max_micro_batch_size, - self.max_train_micro_batch_size_per_gpu()) - for mbs in [ - math.ceil(1.05 * max_micro_batch_size), - max_micro_batch_size, - int(0.95 * max_micro_batch_size) - ]: + max_micro_batch_size = min(calculated_max_micro_batch_size, self.max_train_micro_batch_size_per_gpu()) + for mbs in [math.ceil(1.05 * max_micro_batch_size), max_micro_batch_size, int(0.95 * max_micro_batch_size)]: if mbs > self.max_train_micro_batch_size_per_gpu(): continue if mbs in used_micro_batch_sizes: @@ -1011,12 +935,11 @@ class Autotuner: else: self.update_records(tuning_space_name, exp, 0, 1) - space_records = self.records[ - tuning_space_name] if tuning_space_name in self.records else [] + space_records = self.records[tuning_space_name] if tuning_space_name in self.records else [] if space_records: prev_idx = min(range(len(space_records)), - key=lambda i: abs(space_records[i][0][DS_CONFIG][ - TRAIN_MICRO_BATCH_SIZE_PER_GPU] - min_micro_batch_size)) + key=lambda i: abs(space_records[i][0][DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU] - + min_micro_batch_size)) prev_metric_val = space_records[prev_idx][1] else: prev_metric_val = None @@ -1037,8 +960,8 @@ class Autotuner: low = mid + 1 self.update_records(tuning_space_name, exp, metric_val, 1) used_micro_batch_sizes.append(mid) - if prev_metric_val and ((metric_val - prev_metric_val) / - prev_metric_val) < METRIC_PERCENT_DIFF_CONST: + if prev_metric_val and ( + (metric_val - prev_metric_val) / prev_metric_val) < METRIC_PERCENT_DIFF_CONST: logger.info(f"performance plateaus at mbs = {low}") break prev_metric_val = metric_val @@ -1049,9 +972,7 @@ class Autotuner: low 
= mid + 1 max_micro_batch_size = low - 1 - logger.info( - f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}." - ) + logger.info(f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}.") return min_micro_batch_size, max_micro_batch_size @@ -1067,8 +988,7 @@ class Autotuner: gas = int(val) elif isinstance(gas_in_config, list): logger.info( - f"Specifying a list of {GRADIENT_ACCUMULATION_STEPS} to tune is not supported. 1 would be used." - ) + f"Specifying a list of {GRADIENT_ACCUMULATION_STEPS} to tune is not supported. 1 would be used.") assert gas > 0, "Gradient accumulation steps must be positive." return gas @@ -1083,9 +1003,7 @@ class Autotuner: return (user_args[idx + 1]) return None - def get_tuning_micro_batch_size_list(self, - min_micro_batch_size, - max_micro_batch_size, + def get_tuning_micro_batch_size_list(self, min_micro_batch_size, max_micro_batch_size, num_tuning_micro_batch_sizes): """Get a list of micro batch sizes to tune based on min and max values, as well as the size of the list. Args: @@ -1098,17 +1016,16 @@ class Autotuner: """ if min_micro_batch_size <= 0 or max_micro_batch_size <= 0: logger.info( - f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}" - ) + f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}") return [], 0 # NUM_GPUS=$(( ${NUM_WORKERS} * ${NUM_GPUS_PER_WORKER} )) # DP_SIZE=$(( ${NUM_GPUS} / (${PP_SIZE} * ${MP_SIZE}) )) # GRAD_ACC_STEPS=$(( ${TARGET_GLOBAL_BATCH_SIZE} / (${BATCH_SIZE} * ${DP_SIZE}) )) - if self.max_train_batch_size() and self.max_train_batch_size( - ) > 0: # if the user specifies a max_train_batch_size - max_train_batch_size_per_gpu = self.max_train_batch_size() * self.mp_size( - ) // (self.exp_num_gpus * self.exp_num_nodes) + if self.max_train_batch_size( + ) and self.max_train_batch_size() > 0: # if the user specifies a max_train_batch_size + max_train_batch_size_per_gpu = self.max_train_batch_size() * self.mp_size() // (self.exp_num_gpus * + self.exp_num_nodes) else: gas = self.get_gas_from_user_config() max_train_batch_size_per_gpu = max_micro_batch_size * gas // self.mp_size() @@ -1117,8 +1034,7 @@ class Autotuner: min_micro_batch_size = max_micro_batch_size // 2 # constant stride - stride = (max_micro_batch_size - - min_micro_batch_size) // num_tuning_micro_batch_sizes + stride = (max_micro_batch_size - min_micro_batch_size) // num_tuning_micro_batch_sizes if stride == 0: stride = 1 ls = [] @@ -1187,8 +1103,6 @@ class Autotuner: result = subprocess.Popen(self.optimal_cmd) result.wait() - logger.info( - f"Done running with the optimal DeepSpeed configuration using {self.optimal_cmd}" - ) + logger.info(f"Done running with the optimal DeepSpeed configuration using {self.optimal_cmd}") else: logger.info(f"No optimal DeepSpeed configuration found by autotuning.") diff --git a/deepspeed/autotuning/config.py b/deepspeed/autotuning/config.py index 6f6b6903e..671555e23 100644 --- a/deepspeed/autotuning/config.py +++ b/deepspeed/autotuning/config.py @@ -9,6 +9,7 @@ from deepspeed.autotuning.constants import * class DeepSpeedAutotuningConfig(DeepSpeedConfigObject): + def __init__(self, param_dict): super(DeepSpeedAutotuningConfig, self).__init__() @@ -31,102 +32,65 @@ class DeepSpeedAutotuningConfig(DeepSpeedConfigObject): self._initialize(autotuning_dict) def _initialize(self, autotuning_dict): - self.enabled = get_scalar_param(autotuning_dict, - AUTOTUNING_ENABLED, - 
AUTOTUNING_ENABLED_DEFAULT) + self.enabled = get_scalar_param(autotuning_dict, AUTOTUNING_ENABLED, AUTOTUNING_ENABLED_DEFAULT) - self.fast = get_scalar_param(autotuning_dict, - AUTOTUNING_FAST, - AUTOTUNING_FAST_DEFAULT) + self.fast = get_scalar_param(autotuning_dict, AUTOTUNING_FAST, AUTOTUNING_FAST_DEFAULT) - self.results_dir = get_scalar_param(autotuning_dict, - AUTOTUNING_RESULTS_DIR, - AUTOTUNING_RESULTS_DIR_DEFAULT) + self.results_dir = get_scalar_param(autotuning_dict, AUTOTUNING_RESULTS_DIR, AUTOTUNING_RESULTS_DIR_DEFAULT) assert self.results_dir, "results_dir cannot be empty" - self.exps_dir = get_scalar_param(autotuning_dict, - AUTOTUNING_EXPS_DIR, - AUTOTUNING_EXPS_DIR_DEFAULT) + self.exps_dir = get_scalar_param(autotuning_dict, AUTOTUNING_EXPS_DIR, AUTOTUNING_EXPS_DIR_DEFAULT) assert self.exps_dir, "exps_dir cannot be empty" - self.overwrite = get_scalar_param(autotuning_dict, - AUTOTUNING_OVERWRITE, - AUTOTUNING_OVERWRITE_DEFAULT) + self.overwrite = get_scalar_param(autotuning_dict, AUTOTUNING_OVERWRITE, AUTOTUNING_OVERWRITE_DEFAULT) - self.start_profile_step = get_scalar_param( - autotuning_dict, - AUTOTUNING_START_PROFILE_STEP, - AUTOTUNING_START_PROFILE_STEP_DEFAULT) + self.start_profile_step = get_scalar_param(autotuning_dict, AUTOTUNING_START_PROFILE_STEP, + AUTOTUNING_START_PROFILE_STEP_DEFAULT) - self.end_profile_step = get_scalar_param(autotuning_dict, - AUTOTUNING_END_PROFILE_STEP, + self.end_profile_step = get_scalar_param(autotuning_dict, AUTOTUNING_END_PROFILE_STEP, AUTOTUNING_END_PROFILE_STEP_DEFAULT) - self.metric = get_scalar_param(autotuning_dict, - AUTOTUNING_METRIC, - AUTOTUNING_METRIC_DEFAULT) + self.metric = get_scalar_param(autotuning_dict, AUTOTUNING_METRIC, AUTOTUNING_METRIC_DEFAULT) - self.metric_path = get_scalar_param(autotuning_dict, - AUTOTUNING_METRIC_PATH, - AUTOTUNING_METRIC_PATH_DEFAULT) + self.metric_path = get_scalar_param(autotuning_dict, AUTOTUNING_METRIC_PATH, AUTOTUNING_METRIC_PATH_DEFAULT) - self.tuner_type = get_scalar_param(autotuning_dict, - AUTOTUNING_TUNER_TYPE, - AUTOTUNING_TUNER_TYPE_DEFAULT) + self.tuner_type = get_scalar_param(autotuning_dict, AUTOTUNING_TUNER_TYPE, AUTOTUNING_TUNER_TYPE_DEFAULT) - self.tuner_early_stopping = get_scalar_param( - autotuning_dict, - AUTOTUNING_TUNER_EARLY_STOPPING, - AUTOTUNING_TUNER_EARLY_STOPPING_DEFAULT) + self.tuner_early_stopping = get_scalar_param(autotuning_dict, AUTOTUNING_TUNER_EARLY_STOPPING, + AUTOTUNING_TUNER_EARLY_STOPPING_DEFAULT) - self.tuner_num_trials = get_scalar_param(autotuning_dict, - AUTOTUNING_TUNER_NUM_TRIALS, + self.tuner_num_trials = get_scalar_param(autotuning_dict, AUTOTUNING_TUNER_NUM_TRIALS, AUTOTUNING_TUNER_NUM_TRIALS_DEFAULT) - self.arg_mappings = get_dict_param(autotuning_dict, - AUTOTUNING_ARG_MAPPINGS, - AUTOTUNING_ARG_MAPPINGS_DEFAULT) + self.arg_mappings = get_dict_param(autotuning_dict, AUTOTUNING_ARG_MAPPINGS, AUTOTUNING_ARG_MAPPINGS_DEFAULT) self.model_info = get_model_info_config(autotuning_dict) - self.model_info_path = get_scalar_param(autotuning_dict, - AUTOTUNING_MODEL_INFO_PATH, + self.model_info_path = get_scalar_param(autotuning_dict, AUTOTUNING_MODEL_INFO_PATH, AUTOTUNING_MODEL_INFO_PATH_DEFAULT) - self.mp_size = get_scalar_param(autotuning_dict, - AUTOTUNING_MP_SIZE, - AUTOTUNING_MP_SIZE_DEFAULT) + self.mp_size = get_scalar_param(autotuning_dict, AUTOTUNING_MP_SIZE, AUTOTUNING_MP_SIZE_DEFAULT) - self.max_train_batch_size = get_dict_param( - autotuning_dict, - AUTOTUNING_MAX_TRAIN_BATCH_SIZE, - AUTOTUNING_MAX_TRAIN_BATCH_SIZE_DEFAULT) + 
self.max_train_batch_size = get_dict_param(autotuning_dict, AUTOTUNING_MAX_TRAIN_BATCH_SIZE, + AUTOTUNING_MAX_TRAIN_BATCH_SIZE_DEFAULT) - self.min_train_batch_size = get_dict_param( - autotuning_dict, - AUTOTUNING_MIN_TRAIN_BATCH_SIZE, - AUTOTUNING_MIN_TRAIN_BATCH_SIZE_DEFAULT) + self.min_train_batch_size = get_dict_param(autotuning_dict, AUTOTUNING_MIN_TRAIN_BATCH_SIZE, + AUTOTUNING_MIN_TRAIN_BATCH_SIZE_DEFAULT) self.max_train_micro_batch_size_per_gpu = get_dict_param( - autotuning_dict, - AUTOTUNING_MAX_TRAIN_MICRO_BATCH_SIZE_PER_GPU, + autotuning_dict, AUTOTUNING_MAX_TRAIN_MICRO_BATCH_SIZE_PER_GPU, AUTOTUNING_MAX_TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT) self.min_train_micro_batch_size_per_gpu = get_dict_param( - autotuning_dict, - AUTOTUNING_MIN_TRAIN_MICRO_BATCH_SIZE_PER_GPU, + autotuning_dict, AUTOTUNING_MIN_TRAIN_MICRO_BATCH_SIZE_PER_GPU, AUTOTUNING_MIN_TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT) - self.num_tuning_micro_batch_sizes = get_dict_param( - autotuning_dict, - AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES, - AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES_DEFAULT) + self.num_tuning_micro_batch_sizes = get_dict_param(autotuning_dict, AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES, + AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES_DEFAULT) def get_model_info_config(param_dict): if MODEL_INFO in param_dict and param_dict[MODEL_INFO] is not None: model_info_config = {} for key, default_value in MODEL_INFO_KEY_DEFAULT_DICT.items(): - model_info_config[key] = get_scalar_param(param_dict[MODEL_INFO], - key, - default_value) + model_info_config[key] = get_scalar_param(param_dict[MODEL_INFO], key, default_value) return model_info_config return None diff --git a/deepspeed/autotuning/constants.py b/deepspeed/autotuning/constants.py index d0306bb09..65cad8cb6 100644 --- a/deepspeed/autotuning/constants.py +++ b/deepspeed/autotuning/constants.py @@ -10,17 +10,13 @@ Licensed under the MIT license. 
import os -DEFAULT_TEMPLATE_PATH_ZERO_0 = os.path.join(os.path.dirname(os.path.realpath(__file__)), - "config_templates", +DEFAULT_TEMPLATE_PATH_ZERO_0 = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config_templates", "template_zero0.json") -DEFAULT_TEMPLATE_PATH_ZERO_1 = os.path.join(os.path.dirname(os.path.realpath(__file__)), - "config_templates", +DEFAULT_TEMPLATE_PATH_ZERO_1 = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config_templates", "template_zero1.json") -DEFAULT_TEMPLATE_PATH_ZERO_2 = os.path.join(os.path.dirname(os.path.realpath(__file__)), - "config_templates", +DEFAULT_TEMPLATE_PATH_ZERO_2 = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config_templates", "template_zero2.json") -DEFAULT_TEMPLATE_PATH_ZERO_3 = os.path.join(os.path.dirname(os.path.realpath(__file__)), - "config_templates", +DEFAULT_TEMPLATE_PATH_ZERO_3 = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config_templates", "template_zero3.json") METRIC_PERCENT_DIFF_CONST = 0.05 @@ -157,50 +153,31 @@ DEFAULT_TUNING_SPACE_ZERO_0 = {"zero_optimization": {"stage": 0}} DEFAULT_TUNING_SPACE_ZERO_1 = { "zero_optimization": { "stage": 1, - "reduce_bucket_size": [5e7, - 5e8, - 1e9], - "allgather_bucket_size": [5e7, - 5e8, - 1e9], + "reduce_bucket_size": [5e7, 5e8, 1e9], + "allgather_bucket_size": [5e7, 5e8, 1e9], } } DEFAULT_TUNING_SPACE_ZERO_2 = { "zero_optimization": { "stage": 2, - "overlap_comm": [True, - False], - "reduce_scatter": [False, - True], - "reduce_bucket_size": [5e7, - 5e8, - 1e9], - "allgather_bucket_size": [5e7, - 5e8, - 1e9], - "contiguous_gradients": [False, - True] + "overlap_comm": [True, False], + "reduce_scatter": [False, True], + "reduce_bucket_size": [5e7, 5e8, 1e9], + "allgather_bucket_size": [5e7, 5e8, 1e9], + "contiguous_gradients": [False, True] }, } DEFAULT_TUNING_SPACE_ZERO_3 = { "zero_optimization": { "stage": 3, - "overlap_comm": [True, - False], - "reduce_scatter": [False, - True], - "reduce_bucket_size": [5e7, - 5e8, - 1e9], - "allgather_partitions": [True, - False], - "allgather_bucket_size": [5e7, - 5e8, - 1e9], - "contiguous_gradients": [False, - True] + "overlap_comm": [True, False], + "reduce_scatter": [False, True], + "reduce_bucket_size": [5e7, 5e8, 1e9], + "allgather_partitions": [True, False], + "allgather_bucket_size": [5e7, 5e8, 1e9], + "contiguous_gradients": [False, True] }, } diff --git a/deepspeed/autotuning/scheduler.py b/deepspeed/autotuning/scheduler.py index 2a4c0c70d..5df6e706b 100755 --- a/deepspeed/autotuning/scheduler.py +++ b/deepspeed/autotuning/scheduler.py @@ -28,13 +28,8 @@ TIMEOUT = 5 class ResourceManager: - def __init__(self, - args, - hosts, - num_gpus_per_node, - results_dir, - exps_dir, - arg_mappings): + + def __init__(self, args, hosts, num_gpus_per_node, results_dir, exps_dir, arg_mappings): self.results_dir = results_dir self.exps_dir = exps_dir @@ -69,13 +64,10 @@ class ResourceManager: exp["exp_id"] = self.experiment_count self.experiment_count += 1 - result_dir = exp["result_dir"] = os.path.join( - self.results_dir, - exp['name']) + result_dir = exp["result_dir"] = os.path.join(self.results_dir, exp['name']) if AUTOTUNING in exp["ds_config"]: metric_file = os.path.join(result_dir, "metrics.json") - exp["ds_config"][AUTOTUNING][ - AUTOTUNING_METRIC_PATH] = metric_file + exp["ds_config"][AUTOTUNING][AUTOTUNING_METRIC_PATH] = metric_file stderr_file = os.path.join(result_dir, "stderr.log") model_info_file = os.path.join(result_dir, "model_info.json") metric_file = os.path.join(result_dir, 
"metrics.json") @@ -86,11 +78,8 @@ class ResourceManager: err = search_error(stderr_file) exp_id = exp["exp_id"] self.finished_experiments[exp_id] = (exp, err) - if err or os.path.exists(metric_file) or os.path.exists( - model_info_file): - logger.info( - f"Skipping exp {exp['name']} whose result already exists" - ) + if err or os.path.exists(metric_file) or os.path.exists(model_info_file): + logger.info(f"Skipping exp {exp['name']} whose result already exists") continue self.experiment_queue.append(exp) @@ -113,11 +102,7 @@ class ResourceManager: user_args.append(val) user_args.append(str(nval)) - t = threading.Thread(target=run_experiment, - args=(exp, - reservations, - user_script, - user_args)) + t = threading.Thread(target=run_experiment, args=(exp, reservations, user_script, user_args)) t.start() self.running_experiments[exp_id] = (t, exp, reservations, time.time()) @@ -270,6 +255,7 @@ class ResourceManager: class Node: + def __init__(self, host, max_slots): self.host = host self.max_slots = max_slots @@ -284,6 +270,7 @@ class Node: class Reservation: + def __init__(self, node, slots): self.node = node self.slots = slots @@ -389,9 +376,8 @@ def run_experiment(exp: dict, reservations, user_script, user_args): f"Launching exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}, and ds_config = {os.path.abspath(ds_config_path)}" ) - with open(os.path.join(exp_dir, "stdout.log"), "wb") as out, open( - os.path.join(exp_dir, "stderr.log"), "wb" - ) as err: + with open(os.path.join(exp_dir, "stdout.log"), "wb") as out, open(os.path.join(exp_dir, "stderr.log"), + "wb") as err: result = subprocess.Popen(cmd, stdout=out, stderr=err) result.wait() out.flush() @@ -401,9 +387,7 @@ def run_experiment(exp: dict, reservations, user_script, user_args): clean_up(exp, reservations) - logger.info( - f"Done running exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}" - ) + logger.info(f"Done running exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}") PDSH_MAX_FAN_OUT = 1024 @@ -417,8 +401,7 @@ def clean_up(exp: dict, reservations): for reservation in reservations: nodes_str += f"{reservation.node.host}," nodes_str = nodes_str[:-1] - logger.debug( - f"Cleaning up exp_id = {exp['exp_id']} on the following workers: {nodes_str}") + logger.debug(f"Cleaning up exp_id = {exp['exp_id']} on the following workers: {nodes_str}") # PDSH flags for max node fan out and specific hosts to launch on # See https://linux.die.net/man/1/pdsh for flag details @@ -441,6 +424,4 @@ def clean_up(exp: dict, reservations): if result.returncode > 0: sys.exit(result.returncode) - logger.info( - f"Done cleaning up exp_id = {exp['exp_id']} on the following workers: {nodes_str}" - ) + logger.info(f"Done cleaning up exp_id = {exp['exp_id']} on the following workers: {nodes_str}") diff --git a/deepspeed/autotuning/tuner/base_tuner.py b/deepspeed/autotuning/tuner/base_tuner.py index 89eb2d384..a5dd64221 100755 --- a/deepspeed/autotuning/tuner/base_tuner.py +++ b/deepspeed/autotuning/tuner/base_tuner.py @@ -8,6 +8,7 @@ from deepspeed.utils import logger class BaseTuner: + def __init__(self, exps, resource_manager, metric): self.all_exps = exps self.rm = resource_manager @@ -42,8 +43,8 @@ class BaseTuner: self.rm.schedule_experiments(exp_paths) self.rm.run() exp, metric_val = self.rm.parse_results(self.metric) - if self.best_exp == None or self.best_metric_val == None or ( - metric_val and metric_val > self.best_metric_val): + if self.best_exp == None or 
self.best_metric_val == None or (metric_val + and metric_val > self.best_metric_val): # logger.info(f"tuner finds better = {exp}") self.best_exp = exp self.best_metric_val = metric_val diff --git a/deepspeed/autotuning/tuner/cost_model.py b/deepspeed/autotuning/tuner/cost_model.py index 858ab6d3d..75e19cfbf 100755 --- a/deepspeed/autotuning/tuner/cost_model.py +++ b/deepspeed/autotuning/tuner/cost_model.py @@ -9,6 +9,7 @@ except ImportError: class XGBoostCostModel(): + def __init__(self, loss_type, num_threads=None, log_interval=25, upper_model=None): assert xgb is not None, "missing requirements, please install deepspeed w. 'autotuning_ml' extra." diff --git a/deepspeed/autotuning/tuner/index_based_tuner.py b/deepspeed/autotuning/tuner/index_based_tuner.py index f19694871..430d127d2 100755 --- a/deepspeed/autotuning/tuner/index_based_tuner.py +++ b/deepspeed/autotuning/tuner/index_based_tuner.py @@ -7,6 +7,7 @@ from .base_tuner import BaseTuner class RandomTuner(BaseTuner): """Explore the search space in random order""" + def __init__(self, exps: list, resource_manager, metric): super().__init__(exps, resource_manager, metric) @@ -22,6 +23,7 @@ class RandomTuner(BaseTuner): class GridSearchTuner(BaseTuner): """Explore the search space in sequential order""" + def __init__(self, exps: list, resource_manager, metric): super().__init__(exps, resource_manager, metric) diff --git a/deepspeed/autotuning/tuner/model_based_tuner.py b/deepspeed/autotuning/tuner/model_based_tuner.py index ec475005a..aee0d4c0c 100755 --- a/deepspeed/autotuning/tuner/model_based_tuner.py +++ b/deepspeed/autotuning/tuner/model_based_tuner.py @@ -15,6 +15,7 @@ INIT_NUM = 2 class ModelBasedTuner(BaseTuner): """Exploring the search space with a cost model""" + def __init__(self, exps: list, resource_manager, metric, tuning_sapce): super().__init__(exps, resource_manager, metric) self.tuning_space = tuning_sapce @@ -25,8 +26,7 @@ class ModelBasedTuner(BaseTuner): self.dims = dict_to_dims(self.tuning_space) - logger.info( - f"Create config dim: {self.dims}, all configs: {self.num_all_configs}") + logger.info(f"Create config dim: {self.dims}, all configs: {self.num_all_configs}") self.visited = set([]) @@ -71,9 +71,7 @@ class ModelBasedTuner(BaseTuner): n = len(estimates) top_idx = np.argsort(estimates) - top_idx_ret = top_idx if self.metric == AUTOTUNING_METRIC_LATENCY else top_idx[:: - -1][: - n] + top_idx_ret = top_idx if self.metric == AUTOTUNING_METRIC_LATENCY else top_idx[::-1][:n] # top_configs = [self.all_configs[i] for i in top_idx] @@ -145,9 +143,7 @@ class ModelBasedTuner(BaseTuner): self.evaluated_configs.append(feature_val) self.evaluated_perf.append(curr_iter) - logger.debug( - f"**Evaluated configs: {len(self.evaluated_configs)}, evaluated perf: {self.evaluated_perf}" - ) + logger.debug(f"**Evaluated configs: {len(self.evaluated_configs)}, evaluated perf: {self.evaluated_perf}") self.cost_model.fit(self.evaluated_configs, self.evaluated_perf) diff --git a/deepspeed/autotuning/tuner/utils.py b/deepspeed/autotuning/tuner/utils.py index f6719f0b3..68382695d 100755 --- a/deepspeed/autotuning/tuner/utils.py +++ b/deepspeed/autotuning/tuner/utils.py @@ -44,9 +44,7 @@ def gen_combinations(d: dict): for v in values: if not isinstance(v, list): v = [v] - values_choices = (gen_combinations(v) if isinstance(v, - dict) else get_list(v) - for v in values) + values_choices = (gen_combinations(v) if isinstance(v, dict) else get_list(v) for v in values) for comb in itertools.product(*values_choices): yield dict(zip(keys, 
comb)) diff --git a/deepspeed/autotuning/utils.py b/deepspeed/autotuning/utils.py index 40da7ec96..6103ef8e8 100644 --- a/deepspeed/autotuning/utils.py +++ b/deepspeed/autotuning/utils.py @@ -176,6 +176,7 @@ def fetch_hostfile(hostfile_path): def validate_ds_config(config: dict): + def is_False(config: dict, key): if config is None: return False @@ -189,9 +190,7 @@ def validate_ds_config(config: dict): if stage == 1: return True elif stage == 2: - if is_False(config_zero, - "cpu_offload") and is_False(config_zero, - "cpu_offload_params"): + if is_False(config_zero, "cpu_offload") and is_False(config_zero, "cpu_offload_params"): return False elif stage == 3: offload_devices = ["cpu", "nvme"] @@ -289,14 +288,13 @@ def get_all_configs(tuning_space: dict, ignore_keys=None): Args: tuning_space (dict): the tuning space where tunable parameters are lists of values. """ + def gen_combinations(d: dict): keys, values = d.keys(), d.values() for v in values: if not isinstance(v, list): v = [v] - values_choices = (gen_combinations(v) if isinstance(v, - dict) else get_list(v) - for v in values) + values_choices = (gen_combinations(v) if isinstance(v, dict) else get_list(v) for v in values) for comb in itertools.product(*values_choices): yield dict(zip(keys, comb)) diff --git a/deepspeed/checkpoint/__init__.py b/deepspeed/checkpoint/__init__.py index 2ef0817c7..e44523944 100644 --- a/deepspeed/checkpoint/__init__.py +++ b/deepspeed/checkpoint/__init__.py @@ -4,9 +4,7 @@ from .reshape_meg_2d import reshape_meg_2d_parallel from .deepspeed_checkpoint import DeepSpeedCheckpoint -from .utils import (get_layer_ckpt_name_for_rank, - get_model_ckpt_name_for_rank, - get_zero_ckpt_name_for_rank) +from .utils import (get_layer_ckpt_name_for_rank, get_model_ckpt_name_for_rank, get_zero_ckpt_name_for_rank) from .reshape_utils import (merge_state) diff --git a/deepspeed/checkpoint/deepspeed_checkpoint.py b/deepspeed/checkpoint/deepspeed_checkpoint.py index c1a31b0fb..2e7a07b73 100644 --- a/deepspeed/checkpoint/deepspeed_checkpoint.py +++ b/deepspeed/checkpoint/deepspeed_checkpoint.py @@ -5,11 +5,7 @@ from typing import Dict import torch from .reshape_3d_utils import model_3d_desc -from .reshape_utils import (basic_folder_validation, - merge_state, - partition_data, - get_files, - get_files_with_prefix) +from .reshape_utils import (basic_folder_validation, merge_state, partition_data, get_files, get_files_with_prefix) from .constants import (MODEL_FILE_PREFIX, LAYER_FILE_PREFIX) @@ -24,19 +20,15 @@ CHECKPOINT_INFO_KEY = 'checkpoint_info' ITERATION_KEY = 'iteration' SEQUENTIAL_LAYERS = [ - 'input_layernorm.weight', - 'input_layernorm.bias', - 'self_attention.dense.bias', - 'post_attention_layernorm.weight', - 'post_attention_layernorm.bias', - 'mlp.dense_4h_to_h.bias', - 'position_embeddings.weight' + 'input_layernorm.weight', 'input_layernorm.bias', 'self_attention.dense.bias', 'post_attention_layernorm.weight', + 'post_attention_layernorm.bias', 'mlp.dense_4h_to_h.bias', 'position_embeddings.weight' ] LAYER_CONCAT_DIM = {'self_attention.dense.weight': 1, 'mlp.dense_4h_to_h.weight': 1} class DeepSpeedCheckpoint(object): + def __init__(self, dir, tp_degree=None, pp_degree=None, dp_degree=None): self.dir = dir self._validate_folder(dir) @@ -50,33 +42,24 @@ class DeepSpeedCheckpoint(object): self.layer_keys = self._get_layer_keys() self.layer_count = len(self.layer_keys) - self.tp_degree = self.zero_checkpoint.get_src_tp_degree( - ) if tp_degree is None else tp_degree - self.pp_degree = 
self.zero_checkpoint.get_src_pp_degree( - ) if pp_degree is None else pp_degree - self.dp_degree = self.zero_checkpoint.get_src_dp_degree( - ) if dp_degree is None else dp_degree + self.tp_degree = self.zero_checkpoint.get_src_tp_degree() if tp_degree is None else tp_degree + self.pp_degree = self.zero_checkpoint.get_src_pp_degree() if pp_degree is None else pp_degree + self.dp_degree = self.zero_checkpoint.get_src_dp_degree() if dp_degree is None else dp_degree - self.original_world_size = self.zero_checkpoint.get_src_tp_degree( - ) * self.zero_checkpoint.get_src_pp_degree( + self.original_world_size = self.zero_checkpoint.get_src_tp_degree() * self.zero_checkpoint.get_src_pp_degree( ) * self.zero_checkpoint.get_src_dp_degree() self.world_size = self.tp_degree * self.pp_degree * self.dp_degree self.old_2d_map = meg_2d_parallel_map(self.zero_checkpoint.get_src_pp_degree(), self.zero_checkpoint.get_src_tp_degree()) self.old_2d_map.simple_init() - self.new_2d_map = reshape_meg_2d_parallel( - old_pp_degree=self.zero_checkpoint.get_src_pp_degree(), - old_tp_degree=self.zero_checkpoint.get_src_tp_degree(), - new_pp_degree=self.pp_degree, - new_tp_degree=self.tp_degree) + self.new_2d_map = reshape_meg_2d_parallel(old_pp_degree=self.zero_checkpoint.get_src_pp_degree(), + old_tp_degree=self.zero_checkpoint.get_src_tp_degree(), + new_pp_degree=self.pp_degree, + new_tp_degree=self.tp_degree) - if self.is_change_pp_degree() or self.is_change_tp_degree( - ) or self.is_change_dp_degree(): - self.zero_checkpoint.reshape( - model_3d_desc(self.pp_degree, - self.tp_degree, - self.dp_degree)) + if self.is_change_pp_degree() or self.is_change_tp_degree() or self.is_change_dp_degree(): + self.zero_checkpoint.reshape(model_3d_desc(self.pp_degree, self.tp_degree, self.dp_degree)) self.global_state = {} @@ -84,8 +67,7 @@ class DeepSpeedCheckpoint(object): self.pp_to_transformer_map = self._build_pp_transformer_map() self.transformer_file_map = self._build_transformer_file_map() self.tp_to_embedding_map = self._build_tp_other_layer_map(EMBEDDING_LAYER_INDEX) - self.tp_to_final_norm_map = self._build_tp_other_layer_map( - FINAL_LAYER_NORM_INDEX) + self.tp_to_final_norm_map = self._build_tp_other_layer_map(FINAL_LAYER_NORM_INDEX) self._build_global_state() def is_change_tp_degree(self): @@ -131,9 +113,7 @@ class DeepSpeedCheckpoint(object): keys_to_ignore=[PARAM_SHAPES]) def get_zero_files(self, pp_index, tp_index, dp_index) -> list: - return self.zero_checkpoint.get_files_for_rank(pp_index=pp_index, - tp_index=tp_index, - dp_index=dp_index) + return self.zero_checkpoint.get_files_for_rank(pp_index=pp_index, tp_index=tp_index, dp_index=dp_index) def get_embedding_layer_id(self): return self.layer_keys[EMBEDDING_LAYER_INDEX] @@ -150,11 +130,7 @@ class DeepSpeedCheckpoint(object): def get_embedding_state(self, tp_index: int) -> Dict: assert tp_index in self.tp_to_embedding_map.keys() - sd_list = [ - torch.load(fname, - map_location=torch.device('cpu')) - for fname in self.tp_to_embedding_map[tp_index] - ] + sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in self.tp_to_embedding_map[tp_index]] sd = self._merge_state_dicts(sd_list) return sd @@ -179,10 +155,7 @@ class DeepSpeedCheckpoint(object): assert tp_index < self.tp_degree assert pp_index < self.pp_degree fname_list = self.get_2d_parallel_files(tp_index=tp_index, pp_index=pp_index) - sd_list = [ - torch.load(fname, - map_location=torch.device('cpu')) for fname in fname_list - ] + sd_list = [torch.load(fname, 
map_location=torch.device('cpu')) for fname in fname_list] merged_sd = None for sd in sd_list: @@ -198,10 +171,7 @@ class DeepSpeedCheckpoint(object): assert pp_index < self.pp_degree t_list = [] for fname_list in self.transformer_file_map[(tp_index, pp_index)]: - sd_list = [ - torch.load(fname, - map_location=torch.device('cpu')) for fname in fname_list - ] + sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in fname_list] sd = self._merge_state_dicts(sd_list) t_list.append(sd) return t_list @@ -212,8 +182,7 @@ class DeepSpeedCheckpoint(object): def get_final_norm_state(self, tp_index: int) -> Dict: assert tp_index in self.tp_to_final_norm_map.keys() - sd = torch.load(self.tp_to_final_norm_map[tp_index][0], - map_location=torch.device('cpu')) + sd = torch.load(self.tp_to_final_norm_map[tp_index][0], map_location=torch.device('cpu')) return sd def get_final_norm_files(self, tp_index: int) -> list: @@ -222,8 +191,7 @@ class DeepSpeedCheckpoint(object): def _build_tp_other_layer_map(self, layer_index: int): assert layer_index < len(self.layer_files) - layer_files = get_files_with_prefix(self.layer_files, - self.layer_keys[layer_index]) + layer_files = get_files_with_prefix(self.layer_files, self.layer_keys[layer_index]) layer_file_partitions = partition_data(layer_files, self.tp_degree) data_map = {i: flist for i, flist in enumerate(layer_file_partitions)} return data_map @@ -238,11 +206,7 @@ class DeepSpeedCheckpoint(object): data_map = {} transformer_layers = self.layer_keys[1:-1] layers_per_pp = len(transformer_layers) // self.pp_degree - data_map = { - i: transformer_layers[i * layers_per_pp:(i + 1) * layers_per_pp] - for i in range(0, - self.pp_degree) - } + data_map = {i: transformer_layers[i * layers_per_pp:(i + 1) * layers_per_pp] for i in range(0, self.pp_degree)} return data_map def _dump_mapping(self, data_map, map_tag=None): @@ -308,10 +272,8 @@ class DeepSpeedCheckpoint(object): file_list = get_files(dir) - for file_prefix in [ - MODEL_FILE_PREFIX, - LAYER_FILE_PREFIX, - f'{LAYER_FILE_PREFIX}01' - ]: + for file_prefix in [MODEL_FILE_PREFIX, LAYER_FILE_PREFIX, f'{LAYER_FILE_PREFIX}01']: ckpt_files = get_files_with_prefix(file_list, file_prefix) - assert len(ckpt_files) > 0, f'{dir} seems a bogus DeepSpeed checkpoint folder: Cannot find {file_prefix}* files in there.' + assert len( + ckpt_files + ) > 0, f'{dir} seems a bogus DeepSpeed checkpoint folder: Cannot find {file_prefix}* files in there.' 
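Note for reviewers of the deepspeed_checkpoint.py hunks above: the reformatted one-line dict comprehension in _build_pp_transformer_map is behavior-preserving; it evenly splits the transformer layer keys across pipeline-parallel stages. A minimal standalone sketch of that mapping follows (illustrative only, not part of this patch; the helper name and sample layer names are made up):

# Illustrative sketch, not part of the diff: mirrors the even layer-to-stage
# split done by DeepSpeedCheckpoint._build_pp_transformer_map.
def build_pp_transformer_map(transformer_layers, pp_degree):
    # Assumes the layer count divides evenly by pp_degree, as the real code expects.
    layers_per_pp = len(transformer_layers) // pp_degree
    return {
        i: transformer_layers[i * layers_per_pp:(i + 1) * layers_per_pp]
        for i in range(pp_degree)
    }

layers = [f"layer_{i:02d}" for i in range(8)]
print(build_pp_transformer_map(layers, pp_degree=2))
# {0: ['layer_00', 'layer_01', 'layer_02', 'layer_03'],
#  1: ['layer_04', 'layer_05', 'layer_06', 'layer_07']}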
diff --git a/deepspeed/checkpoint/reshape_3d_utils.py b/deepspeed/checkpoint/reshape_3d_utils.py index 15faffb2a..0c7d3e7ac 100644 --- a/deepspeed/checkpoint/reshape_3d_utils.py +++ b/deepspeed/checkpoint/reshape_3d_utils.py @@ -1,9 +1,6 @@ '''Copyright The Microsoft DeepSpeed Team''' -from .reshape_utils import (get_files, - get_files_with_prefix, - partition_data, - get_zero_files) +from .reshape_utils import (get_files, get_files_with_prefix, partition_data, get_zero_files) from .constants import (MODEL_FILE_PREFIX, LAYER_FILE_PREFIX) @@ -15,6 +12,7 @@ DP_DIM = 'DP' class model_3d_desc(object): + def __init__(self, pp_degree=1, tp_degree=1, dp_degree=1): self.pp_degree = pp_degree self.tp_degree = tp_degree @@ -33,8 +31,7 @@ class model_3d_desc(object): src_2d_size=self.pp_degree * self.tp_degree, dp_degree=self.dp_degree) - return unflatten_dp_dimension(meg_2d_map=flat_3d_map, - dp_degree=target_3d_desc.dp_degree) + return unflatten_dp_dimension(meg_2d_map=flat_3d_map, dp_degree=target_3d_desc.dp_degree) def get_desc(self): return f'{PP_DIM},{TP_DIM},{DP_DIM} = ({self.pp_degree}, {self.tp_degree}, {self.dp_degree})' @@ -45,14 +42,11 @@ class model_3d_desc(object): def is_valid(self, pp_index, tp_index, dp_index): err_msg = [] valid = True - for index, degree, dim_name in [ - (pp_index, self.pp_degree, PP_DIM), - (tp_index, self.tp_degree, TP_DIM), - (dp_index, self.dp_degree, DP_DIM)]: + for index, degree, dim_name in [(pp_index, self.pp_degree, PP_DIM), (tp_index, self.tp_degree, TP_DIM), + (dp_index, self.dp_degree, DP_DIM)]: if index >= degree: valid = False - err_msg.append( - f'{dim_name} indexing error: index {index} >= degree {degree}') + err_msg.append(f'{dim_name} indexing error: index {index} >= degree {degree}') return valid, err_msg @@ -60,18 +54,15 @@ class model_3d_desc(object): err_msg = [] if target_3d_desc.pp_degree > self.pp_degree: err_msg.append( - f'Expansion reshape not supported - {PP_DIM}: {self.pp_degree} ---> {target_3d_desc.pp_degree}' - ) + f'Expansion reshape not supported - {PP_DIM}: {self.pp_degree} ---> {target_3d_desc.pp_degree}') if target_3d_desc.tp_degree > self.tp_degree: err_msg.append( - f'Expansion reshape not supported - {TP_DIM}: {self.tp_degree} ---> {target_3d_desc.tp_degree}' - ) + f'Expansion reshape not supported - {TP_DIM}: {self.tp_degree} ---> {target_3d_desc.tp_degree}') if target_3d_desc.dp_degree > self.dp_degree: err_msg.append( - f'Expansion reshape not supported - {DP_DIM}: {self.dp_degree} ---> {target_3d_desc.dp_degree}' - ) + f'Expansion reshape not supported - {DP_DIM}: {self.dp_degree} ---> {target_3d_desc.dp_degree}') return len(err_msg) == 0, err_msg @@ -106,10 +97,7 @@ def flatten_dp_dimension(meg_2d_map, src_2d_size, dp_degree): def unflatten_dp_dimension(meg_2d_map, dp_degree): pp_degree = meg_2d_map.pp_degree tp_degree = meg_2d_map.tp_degree - meg_2d_map_list = [ - meg_2d_parallel_map(pp_degree=pp_degree, - tp_degree=tp_degree) for _ in range(dp_degree) - ] + meg_2d_map_list = [meg_2d_parallel_map(pp_degree=pp_degree, tp_degree=tp_degree) for _ in range(dp_degree)] for pp_index in range(pp_degree): for tp_index in range(tp_degree): flat_dp_indices = meg_2d_map.get_data(pp_index, tp_index) diff --git a/deepspeed/checkpoint/reshape_meg_2d.py b/deepspeed/checkpoint/reshape_meg_2d.py index 4800b5558..f615e7385 100644 --- a/deepspeed/checkpoint/reshape_meg_2d.py +++ b/deepspeed/checkpoint/reshape_meg_2d.py @@ -4,6 +4,7 @@ from .reshape_utils import partition_data class meg_2d_parallel_map(object): + def __init__(self, 
pp_degree, tp_degree): self.pp_degree = pp_degree self.tp_degree = tp_degree @@ -11,8 +12,7 @@ class meg_2d_parallel_map(object): def simple_init(self): self.map = { - self._make_key(i // self.tp_degree, - i % self.tp_degree): [i] + self._make_key(i // self.tp_degree, i % self.tp_degree): [i] for i in range(self.pp_degree * self.tp_degree) } @@ -74,11 +74,7 @@ def _reshape_pp_dimension(old_2d_map, new_pp_degree): return new_2d_map -def reshape_meg_2d_parallel(old_pp_degree, - old_tp_degree, - new_pp_degree, - new_tp_degree, - verbose=False): +def reshape_meg_2d_parallel(old_pp_degree, old_tp_degree, new_pp_degree, new_tp_degree, verbose=False): assert new_pp_degree <= old_pp_degree assert new_tp_degree <= old_tp_degree @@ -137,8 +133,7 @@ def get_mpu_ranks(tp_size=1, pp_size=1, dp_size=1, virtual_pp_size=None): tensor_model_parallel_size = min(tp_size, world_size) pipeline_model_parallel_size = min(pp_size, world_size) - data_parallel_size = world_size // (tensor_model_parallel_size * - pipeline_model_parallel_size) + data_parallel_size = world_size // (tensor_model_parallel_size * pipeline_model_parallel_size) num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size @@ -158,10 +153,7 @@ def get_mpu_ranks(tp_size=1, pp_size=1, dp_size=1, virtual_pp_size=None): # Build the model-parallel groups. all_pp_group_ranks = [] for i in range(data_parallel_size): - ranks = [ - data_parallel_group_ranks[i] - for data_parallel_group_ranks in all_dp_group_ranks - ] + ranks = [data_parallel_group_ranks[i] for data_parallel_group_ranks in all_dp_group_ranks] all_pp_group_ranks.append(list(ranks)) print(f"PP", all_pp_group_ranks) @@ -169,8 +161,7 @@ def get_mpu_ranks(tp_size=1, pp_size=1, dp_size=1, virtual_pp_size=None): # Build the tensor model-parallel groups. 
all_tp_group_ranks = [] for i in range(num_tensor_model_parallel_groups): - ranks = range(i * tensor_model_parallel_size, - (i + 1) * tensor_model_parallel_size) + ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size) all_tp_group_ranks.append(list(ranks)) print(f"TP", all_tp_group_ranks) diff --git a/deepspeed/checkpoint/reshape_utils.py b/deepspeed/checkpoint/reshape_utils.py index 4d6b272f4..91b082c1a 100644 --- a/deepspeed/checkpoint/reshape_utils.py +++ b/deepspeed/checkpoint/reshape_utils.py @@ -49,11 +49,7 @@ def partition_data(data_list, num_partitions): num_elems = len(data_list) assert num_elems % num_partitions == 0 partition_size = num_elems // num_partitions - partitions_list = [ - data_list[i:i + partition_size] for i in range(0, - num_elems, - partition_size) - ] + partitions_list = [data_list[i:i + partition_size] for i in range(0, num_elems, partition_size)] return partitions_list @@ -76,9 +72,7 @@ def merge_state_dict(dict_a, dict_b, key_list): def merge_state_list(list_a, list_b, key_list): if len(list_a) != len(list_b): print(f'{_key_list_to_string(key_list)}') - raise ValueError( - f'Cannot merge lists of different lengths, a = {len(list_a)} b = {len(list_b)}' - ) + raise ValueError(f'Cannot merge lists of different lengths, a = {len(list_a)} b = {len(list_b)}') return [merge_state(a, b, key_list) for a, b in zip(list_a, list_b)] @@ -87,8 +81,7 @@ def merge_state(state_a, state_b, key_list=[]): if type(state_a) != type(state_b): key_list_string = _key_list_to_string(key_list) print(f'key_list = {key_list_string}') - raise ValueError( - f'Cannot merge two states of types {type(state_a)} and type {type(state_b)}') + raise ValueError(f'Cannot merge two states of types {type(state_a)} and type {type(state_b)}') if type(state_a) in (dict, OrderedDict): return merge_state_dict(state_a, state_b, key_list) diff --git a/deepspeed/checkpoint/universal_checkpoint.py b/deepspeed/checkpoint/universal_checkpoint.py index b58de4871..a4b039a56 100644 --- a/deepspeed/checkpoint/universal_checkpoint.py +++ b/deepspeed/checkpoint/universal_checkpoint.py @@ -4,10 +4,7 @@ Copyright 2022 The Microsoft DeepSpeed Team import os import torch import types -from .constants import (FP32_WEIGHT_KEY, - PARAM, - VOCAB_DIVISIBILITY_PADDING_TENSOR, - CAT_DIM) +from .constants import (FP32_WEIGHT_KEY, PARAM, VOCAB_DIVISIBILITY_PADDING_TENSOR, CAT_DIM) def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): @@ -44,9 +41,7 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): # the converter to universal currently strips the original padding completely so the saved # weight is padding-free and we just need to add new padding depending on the target TP # degree - vocab_divisibility_padding_tensor = ckpt_dict.get( - VOCAB_DIVISIBILITY_PADDING_TENSOR, - None) + vocab_divisibility_padding_tensor = ckpt_dict.get(VOCAB_DIVISIBILITY_PADDING_TENSOR, None) if vocab_divisibility_padding_tensor is not None: # In the absence of data passed from the user wrt new padded vocab specific to tp degree # we can again derive that data by reverse engineering the target shapes like so: @@ -56,13 +51,7 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): padding_size = padded_target_vocab_size - full_hp_param.shape[0] # Implement the following concat in efficient way using pad #full_hp_param = torch.cat((full_hp_param, padding_tensor), 0) - full_hp_param = torch.nn.functional.pad(full_hp_param, - (0, - 0, - 0, - padding_size), - "constant", - 0) + 
full_hp_param = torch.nn.functional.pad(full_hp_param, (0, 0, 0, padding_size), "constant", 0) full_hp_param[:-padding_size, :] = vocab_divisibility_padding_tensor else: # Need to shrink or keep the same @@ -76,8 +65,7 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): assert full_param_numel == tp_world_size * tp_slice_numel, \ f'Loading {ckpt_file} full param numel {full_param_numel} != tensor slice numel {tp_slice_numel} * tp_world_size {tp_world_size}' - dst_tensor = hp_mapping.hp_fragment if key == FP32_WEIGHT_KEY else hp_mapping.get_optim_state_fragment( - key) + dst_tensor = hp_mapping.hp_fragment if key == FP32_WEIGHT_KEY else hp_mapping.get_optim_state_fragment(key) # print(f"{full_hp_param.shape=} {full_param_numel=} {folder=}") # print(f"{dst_tensor.shape=} {dst_tensor.numel()=}{folder=}") @@ -90,9 +78,7 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): tp_hp_slice = tp_hp_slice.flatten() lp_frag_address = hp_mapping.lp_fragment_address - tp_hp_fragment = tp_hp_slice.narrow(0, - lp_frag_address.start, - lp_frag_address.numel) + tp_hp_fragment = tp_hp_slice.narrow(0, lp_frag_address.start, lp_frag_address.numel) assert dst_tensor.numel() == lp_frag_address.numel, \ f'Load checkpoint {key} dst_tensor numel {dst_tensor.numel()} != src numel {lp_frag_address.numel}' @@ -104,5 +90,4 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): def enable_universal_checkpoint(param_list): for param in param_list: - param.load_hp_checkpoint_state = types.MethodType(load_hp_checkpoint_state, - param) + param.load_hp_checkpoint_state = types.MethodType(load_hp_checkpoint_state, param) diff --git a/deepspeed/checkpoint/utils.py b/deepspeed/checkpoint/utils.py index 4e96d0730..c03ce85d0 100644 --- a/deepspeed/checkpoint/utils.py +++ b/deepspeed/checkpoint/utils.py @@ -1,10 +1,7 @@ '''Copyright The Microsoft DeepSpeed Team''' import os -from .constants import (MODEL_FILE_PREFIX, - MODEL_FILE_SUFFIX, - OPTIM_FILE_SUFFIX, - ZERO_FILE_PREFIX) +from .constants import (MODEL_FILE_PREFIX, MODEL_FILE_SUFFIX, OPTIM_FILE_SUFFIX, ZERO_FILE_PREFIX) def get_model_ckpt_name_for_rank(base_folder, mp_rank_str): diff --git a/deepspeed/checkpoint/zero_checkpoint.py b/deepspeed/checkpoint/zero_checkpoint.py index cb33e8e37..9a694b013 100644 --- a/deepspeed/checkpoint/zero_checkpoint.py +++ b/deepspeed/checkpoint/zero_checkpoint.py @@ -2,10 +2,7 @@ import torch -from .constants import (BASE_OPTIMIZER_STATE, - GROUP_PADDINGS, - OPTIMIZER_STATE_DICT, - PARTITION_COUNT) +from .constants import (BASE_OPTIMIZER_STATE, GROUP_PADDINGS, OPTIMIZER_STATE_DICT, PARTITION_COUNT) from .reshape_utils import (basic_folder_validation, get_zero_files, merge_state) @@ -15,6 +12,7 @@ GROUP_STATE_KEY = 'state' class ZeROCheckpoint(object): + def __init__(self, dir): basic_folder_validation(dir) self.dir = dir @@ -49,12 +47,7 @@ class ZeROCheckpoint(object): file_idx_list = self.get_file_indices_for_rank(pp_index, tp_index, dp_index) return [self.file_list[idx] for idx in file_idx_list] - def get_state_for_rank(self, - pp_index, - tp_index, - dp_index, - keys_to_ignore=[], - strip_tensor_paddings=True): + def get_state_for_rank(self, pp_index, tp_index, dp_index, keys_to_ignore=[], strip_tensor_paddings=True): state_file_list = self.get_files_for_rank(pp_index, tp_index, dp_index) merged_sd = None for state_file in state_file_list: @@ -111,10 +104,7 @@ class ZeROCheckpoint(object): for state_name, state_value in group_state.items(): if torch.is_tensor(state_value): raw_length = 
state_value.numel() - group_paddings[key] - group_state[state_name] = torch.narrow(state_value, - 0, - 0, - raw_length).clone() + group_state[state_name] = torch.narrow(state_value, 0, 0, raw_length).clone() def _clear_group_paddings(self, sd): group_paddings = self._get_optimizer_state(sd, GROUP_PADDINGS) @@ -144,5 +134,4 @@ class ZeROCheckpoint(object): partition_counts = self._get_optimizer_state(sd, PARTITION_COUNT) if partition_counts: num_groups = len(partition_counts) - sd[OPTIMIZER_STATE_DICT][PARTITION_COUNT] = [self.target_3d.dp_degree - ] * num_groups + sd[OPTIMIZER_STATE_DICT][PARTITION_COUNT] = [self.target_3d.dp_degree] * num_groups diff --git a/deepspeed/comm/__init__.py b/deepspeed/comm/__init__.py index 84d0f54c7..317c1bfde 100644 --- a/deepspeed/comm/__init__.py +++ b/deepspeed/comm/__init__.py @@ -39,13 +39,8 @@ if older_torch(): input_tensor_lst = list(chunk(input_tensor, get_world_size(group))) return reduce_scatter(output_tensor, input_tensor_lst, group=group) - def configure(deepspeed_config=None, - enabled=None, - prof_all=None, - prof_ops=None, - verbose=None): - utils.logger.warn( - "Communication logging is not supported in torch versions older than 1.8") + def configure(deepspeed_config=None, enabled=None, prof_all=None, prof_ops=None, verbose=None): + utils.logger.warn("Communication logging is not supported in torch versions older than 1.8") else: supported_torch_version = True diff --git a/deepspeed/comm/backend.py b/deepspeed/comm/backend.py index cd436865c..be228d194 100644 --- a/deepspeed/comm/backend.py +++ b/deepspeed/comm/backend.py @@ -20,6 +20,7 @@ In the future, directly use NCCL/MPI/Gloo/etc without requiring torch.distribute class Backend(object): + def __init__(self, name='backend', rank=0, size=1): self.name = name # The world size and rank of the world process group diff --git a/deepspeed/comm/comm.py b/deepspeed/comm/comm.py index 955d9708f..26dffa816 100644 --- a/deepspeed/comm/comm.py +++ b/deepspeed/comm/comm.py @@ -106,12 +106,13 @@ def configure( # Logging wrapper for timing ops def timed_op(func): + def log_wrapper(*args, **kwargs): # Add enabled flag so that overhead to each comm op is two if conditions at most if comms_logger.enabled: - if ('prof' in kwargs and kwargs['prof']) or comms_logger.prof_all or ( - 'log_name' in kwargs - and kwargs['log_name'] in comms_logger.prof_ops): + if ('prof' in kwargs + and kwargs['prof']) or comms_logger.prof_all or ('log_name' in kwargs + and kwargs['log_name'] in comms_logger.prof_ops): # Need func args for their defaults func_args = get_default_args(func) func_args.update(kwargs) @@ -129,8 +130,7 @@ def timed_op(func): if cdb.using_mpi: cdb.barrier() if ('prof' in kwargs and kwargs['prof']) or comms_logger.prof_all or ( - 'log_name' in kwargs - and kwargs['log_name'] in comms_logger.prof_ops): + 'log_name' in kwargs and kwargs['log_name'] in comms_logger.prof_ops): log_name = get_debug_log_name(func_args, comms_logger.debug) raw_name = func.__name__ timers(log_name).stop() @@ -178,7 +178,8 @@ def destroy_process_group(group=None): def new_group(ranks): global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' return cdb.new_group(ranks) @@ -197,9 +198,7 @@ def set_backend(backend): utils.logger.error( "DeepSpeed communication backend is required. 
Please use deepspeed.comm.init_distributed(backend, use_deepspeed=True) to use this functionality" ) - raise RuntimeError( - 'Error: Custom DeepSpeed backend called without initializing DeepSpeed distributed.' - ) + raise RuntimeError('Error: Custom DeepSpeed backend called without initializing DeepSpeed distributed.') global cdb global nccl_backend @@ -217,13 +216,7 @@ def set_backend(backend): @timed_op -def broadcast(tensor, - src, - group=None, - async_op=False, - prof=False, - log_name='broadcast', - debug=get_caller_func()): +def broadcast(tensor, src, group=None, async_op=False, prof=False, log_name='broadcast', debug=get_caller_func()): global cdb return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op) @@ -237,15 +230,13 @@ def all_gather(tensor_list, log_name='all_gather', debug=get_caller_func()): global cdb - return cdb.all_gather(tensor_list=tensor_list, - tensor=tensor, - group=group, - async_op=async_op) + return cdb.all_gather(tensor_list=tensor_list, tensor=tensor, group=group, async_op=async_op) def has_reduce_scatter_base(): global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' assert cdb.has_reduce_scatter_base is not None, 'has_reduce_scatter_base is not yet defined' return cdb.has_reduce_scatter_base @@ -258,7 +249,8 @@ def reduce_scatter_fn(output_tensor, prof=False, debug=get_caller_func()): global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' if cdb.has_reduce_scatter_base: return reduce_scatter_base(output_tensor, tensor, @@ -268,10 +260,9 @@ def reduce_scatter_fn(output_tensor, prof=prof, debug=debug) else: - utils.logger.warning_once( - "unable to find torch.distributed._reduce_scatter_base. will fall back to " - "torch.distributed.all_gather which will result in suboptimal performance. " - "please consider upgrading your pytorch installation.") + utils.logger.warning_once("unable to find torch.distributed._reduce_scatter_base. will fall back to " + "torch.distributed.all_gather which will result in suboptimal performance. 
" + "please consider upgrading your pytorch installation.") input_tensor_lst = list(torch.chunk(tensor, cdb.get_world_size(group))) return reduce_scatter(output_tensor, input_tensor_lst, @@ -308,44 +299,30 @@ def all_gather_base(output_tensor, log_name='all_gather_base', debug=get_caller_func()): global cdb - return cdb.all_gather_base(output_tensor=output_tensor, - input_tensor=tensor, - group=group, - async_op=async_op) + return cdb.all_gather_base(output_tensor=output_tensor, input_tensor=tensor, group=group, async_op=async_op) def has_allgather_base(): global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' assert cdb.has_allgather_base is not None, 'has_allgather_base is not yet defined' return cdb.has_allgather_base -def allgather_fn(output_tensor, - input_tensor, - group=None, - async_op=False, - debug=get_caller_func()): +def allgather_fn(output_tensor, input_tensor, group=None, async_op=False, debug=get_caller_func()): global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' if cdb.has_allgather_base: - return all_gather_base(output_tensor, - input_tensor, - group=group, - async_op=async_op, - debug=debug) + return all_gather_base(output_tensor, input_tensor, group=group, async_op=async_op, debug=debug) else: if get_rank() == 0: - utils.logger.warning_once( - "unable to find torch.distributed._all_gather_base. will fall back to " - "torch.distributed.all_gather which will result in suboptimal performance. " - "please consider upgrading your pytorch installation.") + utils.logger.warning_once("unable to find torch.distributed._all_gather_base. will fall back to " + "torch.distributed.all_gather which will result in suboptimal performance. 
" + "please consider upgrading your pytorch installation.") output_tensors = list(torch.chunk(output_tensor, cdb.get_world_size(group))) - return all_gather(output_tensors, - input_tensor, - group=group, - async_op=async_op, - debug=debug) + return all_gather(output_tensors, input_tensor, group=group, async_op=async_op, debug=debug) @timed_op @@ -368,49 +345,25 @@ def all_to_all_single(output, @timed_op -def send(tensor, - dst, - group=None, - tag=0, - prof=False, - log_name='send', - debug=get_caller_func()): +def send(tensor, dst, group=None, tag=0, prof=False, log_name='send', debug=get_caller_func()): global cdb return cdb.send(tensor=tensor, dst=dst, group=group, tag=tag) @timed_op -def recv(tensor, - src=None, - group=None, - tag=0, - prof=False, - log_name='recv', - debug=get_caller_func()): +def recv(tensor, src=None, group=None, tag=0, prof=False, log_name='recv', debug=get_caller_func()): global cdb return cdb.recv(tensor=tensor, src=src, group=group, tag=tag) @timed_op -def isend(tensor, - dst, - group=None, - tag=0, - prof=False, - log_name='isend', - debug=get_caller_func()): +def isend(tensor, dst, group=None, tag=0, prof=False, log_name='isend', debug=get_caller_func()): global cdb return cdb.send(tensor=tensor, dst=dst, group=group, tag=tag) @timed_op -def irecv(tensor, - src=None, - group=None, - tag=0, - prof=False, - log_name='irecv', - debug=get_caller_func()): +def irecv(tensor, src=None, group=None, tag=0, prof=False, log_name='irecv', debug=get_caller_func()): global cdb return cdb.recv(tensor=tensor, src=src, group=group, tag=tag) @@ -425,11 +378,7 @@ def gather(tensor, log_name='gather', debug=get_caller_func()): global cdb - return cdb.gather(tensor=tensor, - gather_list=gather_list, - dst=dst, - group=group, - async_op=async_op) + return cdb.gather(tensor=tensor, gather_list=gather_list, dst=dst, group=group, async_op=async_op) @timed_op @@ -442,20 +391,11 @@ def scatter(tensor, log_name='scatter', debug=get_caller_func()): global cdb - return cdb.scatter(tensor=tensor, - scatter_list=scatter_list, - src=src, - group=group, - async_op=async_op) + return cdb.scatter(tensor=tensor, scatter_list=scatter_list, src=src, group=group, async_op=async_op) @timed_op -def barrier(group=None, - async_op=False, - device_ids=None, - prof=False, - log_name='barrier', - debug=get_caller_func()): +def barrier(group=None, async_op=False, device_ids=None, prof=False, log_name='barrier', debug=get_caller_func()): global cdb return cdb.barrier(group=group, async_op=async_op, device_ids=device_ids) @@ -502,11 +442,7 @@ def reduce_scatter(output, log_name='reduce_scatter', debug=get_caller_func()): global cdb - return cdb.reduce_scatter(output=output, - input_list=input_list, - op=op, - group=group, - async_op=async_op) + return cdb.reduce_scatter(output=output, input_list=input_list, op=op, group=group, async_op=async_op) @timed_op @@ -528,7 +464,8 @@ def all_reduce(tensor, def get_world_group(): global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' return cdb.get_world_group() @@ -544,7 +481,8 @@ def get_world_size(group=None) -> int: """ global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please 
initialize it using init_process_group()' return cdb.get_world_size(group) @@ -563,7 +501,8 @@ def get_rank(group=None): -1, if not part of the group """ global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' return cdb.get_rank(group) @@ -576,13 +515,15 @@ def get_local_rank(): local rank (= GPU device ID) """ global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' return get_local_rank_from_launcher() def get_global_rank(group=None, group_rank=0): global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' return cdb.get_global_rank(group, group_rank) @@ -631,9 +572,7 @@ def init_distributed(dist_backend=None, required_env = ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"] if auto_mpi_discovery and not all(map(lambda v: v in os.environ, required_env)): if verbose: - utils.logger.info( - "Not using the DeepSpeed or dist launchers, attempting to detect MPI environment..." - ) + utils.logger.info("Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...") if in_aml() and not in_dlts(): patch_aml_env_for_torch_nccl_backend(verbose=verbose) elif in_aws_sm(): @@ -649,9 +588,7 @@ def init_distributed(dist_backend=None, if dist_backend == None: dist_backend = get_accelerator().communication_backend_name() if int(os.getenv('RANK', '0')) == 0: - utils.logger.info( - 'Initializing TorchBackend in DeepSpeed with backend {}'.format( - dist_backend)) + utils.logger.info('Initializing TorchBackend in DeepSpeed with backend {}'.format(dist_backend)) # Create a torch backend object, initialize torch distributed, and assign to cdb cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size) @@ -686,16 +623,12 @@ def mpi_discovery(distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, verbose=True) if verbose: utils.logger.info( - "Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" - .format(os.environ['RANK'], - os.environ['LOCAL_RANK'], - os.environ['WORLD_SIZE'], - os.environ['MASTER_ADDR'], - os.environ['MASTER_PORT'])) + "Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}". 
+ format(os.environ['RANK'], os.environ['LOCAL_RANK'], os.environ['WORLD_SIZE'], os.environ['MASTER_ADDR'], + os.environ['MASTER_PORT'])) if cdb is not None and cdb.is_initialized(): - assert cdb.get_rank() == rank, "MPI rank {} does not match torch rank {}".format( - rank, cdb.get_rank()) + assert cdb.get_rank() == rank, "MPI rank {} does not match torch rank {}".format(rank, cdb.get_rank()) assert cdb.get_world_size() == world_size, "MPI world size {} does not match torch world size {}".format( world_size, cdb.get_world_size()) @@ -722,8 +655,7 @@ def patch_aml_env_for_torch_nccl_backend(master_port=6105, verbose=True): """ os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"] os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"] - single_node = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"]) == int( - os.environ["WORLD_SIZE"]) + single_node = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"]) == int(os.environ["WORLD_SIZE"]) if not single_node: master_node_params = os.environ["AZ_BATCH_MASTER_NODE"].split(":") @@ -736,8 +668,7 @@ def patch_aml_env_for_torch_nccl_backend(master_port=6105, verbose=True): os.environ["MASTER_PORT"] = DEFAULT_AML_MASTER_PORT if verbose: - utils.logger.info("NCCL_SOCKET_IFNAME original value = {}".format( - os.environ["NCCL_SOCKET_IFNAME"])) + utils.logger.info("NCCL_SOCKET_IFNAME original value = {}".format(os.environ["NCCL_SOCKET_IFNAME"])) os.environ["NCCL_SOCKET_IFNAME"] = DEFAULT_AML_NCCL_SOCKET_IFNAME os.environ['LOCAL_RANK'] = os.environ["OMPI_COMM_WORLD_LOCAL_RANK"] @@ -745,10 +676,7 @@ def patch_aml_env_for_torch_nccl_backend(master_port=6105, verbose=True): if verbose: utils.logger.info( "Discovered AzureML settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" - .format(os.environ['RANK'], - os.environ['LOCAL_RANK'], - os.environ['WORLD_SIZE'], - os.environ['MASTER_ADDR'], + .format(os.environ['RANK'], os.environ['LOCAL_RANK'], os.environ['WORLD_SIZE'], os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])) @@ -762,8 +690,5 @@ def patch_aws_sm_env_for_torch_nccl_backend(verbose=True): if verbose: utils.logger.info( "Discovered AWS SageMaker settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" - .format(os.environ['RANK'], - os.environ['LOCAL_RANK'], - os.environ['WORLD_SIZE'], - os.environ['MASTER_ADDR'], + .format(os.environ['RANK'], os.environ['LOCAL_RANK'], os.environ['WORLD_SIZE'], os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])) diff --git a/deepspeed/comm/config.py b/deepspeed/comm/config.py index 25839514f..f71fbcd53 100644 --- a/deepspeed/comm/config.py +++ b/deepspeed/comm/config.py @@ -9,6 +9,7 @@ from .constants import * class CommsConfig(BaseModel): + class Config: validate_all = True validate_assignment = True @@ -25,6 +26,7 @@ class CommsLoggerConfig(CommsConfig): class DeepSpeedCommsConfig: + def __init__(self, ds_config): self.comms_logger_enabled = 'comms_logger' in ds_config diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index 0753e7ce4..cf702e2bf 100644 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -16,13 +16,8 @@ class TorchBackend(Backend): so no need to wrap all the functions. We can keep adding wrappers as needed. 
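The comm.py hunks above only reformat DeepSpeed's thin wrappers around torch.distributed (init_distributed, the timed_op decorator, allgather_fn and friends); behavior is unchanged. A minimal usage sketch of those wrappers, assuming the job is started with the deepspeed or torchrun launcher so RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT are already set; the tensor shapes and the 'nccl' backend choice are illustrative only:

    import torch
    import deepspeed.comm as dist

    dist.init_distributed(dist_backend='nccl')    # falls back to MPI discovery when the env vars are missing
    rank, world_size = dist.get_rank(), dist.get_world_size()

    inp = torch.full((4, ), float(rank), device='cuda')   # each rank contributes its own values
    out = torch.zeros(4 * world_size, device='cuda')
    dist.allgather_fn(out, inp)                   # uses _all_gather_base when available, else plain all_gather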
""" - def __init__(self, - backend, - timeout, - init_method, - rank=-1, - world_size=-1, - name='torch'): + + def __init__(self, backend, timeout, init_method, rank=-1, world_size=-1, name='torch'): super(TorchBackend, self).__init__() self.torch_version_before_18 = older_torch() self.has_allgather_base = has_allgather_base() @@ -44,30 +39,14 @@ class TorchBackend(Backend): world_size=world_size) self.using_mpi = torch.distributed.get_backend() == 'mpi' - def all_reduce(self, - tensor, - op=torch.distributed.ReduceOp.SUM, - group=None, - async_op=False): + def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False): op = self._reduce_op(op) - return torch.distributed.all_reduce(tensor=tensor, - op=op, - group=group, - async_op=async_op) + return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=async_op) def reduce(self, tensor, dst, op=ReduceOp.SUM, group=None, async_op=False): - return torch.distributed.reduce(tensor=tensor, - dst=dst, - op=self._reduce_op(op), - group=group, - async_op=async_op) + return torch.distributed.reduce(tensor=tensor, dst=dst, op=self._reduce_op(op), group=group, async_op=async_op) - def reduce_scatter(self, - output, - input_list, - op=ReduceOp.SUM, - group=None, - async_op=False): + def reduce_scatter(self, output, input_list, op=ReduceOp.SUM, group=None, async_op=False): return torch.distributed.reduce_scatter(output=output, input_list=input_list, op=self._reduce_op(op), @@ -75,37 +54,24 @@ class TorchBackend(Backend): async_op=async_op) def broadcast(self, tensor, src, group=None, async_op=False): - return torch.distributed.broadcast(tensor=tensor, - src=src, - group=group, - async_op=async_op) + return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op) def all_gather(self, tensor_list, tensor, group=None, async_op=False): - return torch.distributed.all_gather(tensor_list=tensor_list, - tensor=tensor, - group=group, - async_op=async_op) + return torch.distributed.all_gather(tensor_list=tensor_list, tensor=tensor, group=group, async_op=async_op) def all_gather_base(self, output_tensor, input_tensor, group=None, async_op=False): if self.has_allgather_base: - return torch.distributed.distributed_c10d._all_gather_base( - output_tensor=output_tensor, - input_tensor=input_tensor, - group=group, - async_op=async_op) + return torch.distributed.distributed_c10d._all_gather_base(output_tensor=output_tensor, + input_tensor=input_tensor, + group=group, + async_op=async_op) else: - utils.logger.warning( - "unable to find torch.distributed._all_gather_base. will fall back to " - "torch.distributed.reduce_scatter which will result in suboptimal performance. " - "please consider upgrading your pytorch installation.") + utils.logger.warning("unable to find torch.distributed._all_gather_base. will fall back to " + "torch.distributed.reduce_scatter which will result in suboptimal performance. " + "please consider upgrading your pytorch installation.") pass - def reduce_scatter_base(self, - output_tensor, - input_tensor, - op=ReduceOp.SUM, - group=None, - async_op=False): + def reduce_scatter_base(self, output_tensor, input_tensor, op=ReduceOp.SUM, group=None, async_op=False): if self.has_reduce_scatter_base: return torch.distributed._reduce_scatter_base(output_tensor, input_tensor, @@ -113,10 +79,9 @@ class TorchBackend(Backend): group=group, async_op=async_op) else: - utils.logger.warning( - "unable to find torch.distributed._reduce_scatter_base. 
will fall back to " - "torch.distributed.reduce_scatter which will result in suboptimal performance. " - "please consider upgrading your pytorch installation.") + utils.logger.warning("unable to find torch.distributed._reduce_scatter_base. will fall back to " + "torch.distributed.reduce_scatter which will result in suboptimal performance. " + "please consider upgrading your pytorch installation.") pass def all_to_all_single(self, @@ -159,25 +124,15 @@ class TorchBackend(Backend): group=group, async_op=async_op) - def barrier(self, - group=torch.distributed.GroupMember.WORLD, - async_op=False, - device_ids=None): + def barrier(self, group=torch.distributed.GroupMember.WORLD, async_op=False, device_ids=None): if group is None: group = torch.distributed.GroupMember.WORLD - return torch.distributed.barrier(group=group, - async_op=async_op, - device_ids=device_ids) + return torch.distributed.barrier(group=group, async_op=async_op, device_ids=device_ids) - def monitored_barrier(self, - group=torch.distributed.GroupMember.WORLD, - timeout=None, - wait_all_ranks=False): + def monitored_barrier(self, group=torch.distributed.GroupMember.WORLD, timeout=None, wait_all_ranks=False): if group is None: group = torch.distributed.GroupMember.WORLD - return torch.distributed.monitored_barrier(group=group, - timeout=timeout, - wait_all_ranks=wait_all_ranks) + return torch.distributed.monitored_barrier(group=group, timeout=timeout, wait_all_ranks=wait_all_ranks) def get_rank(self, group=None): return torch.distributed.get_rank(group=group) diff --git a/deepspeed/comm/utils.py b/deepspeed/comm/utils.py index 120838e72..1ed9bae12 100644 --- a/deepspeed/comm/utils.py +++ b/deepspeed/comm/utils.py @@ -84,11 +84,7 @@ def get_world_size_from_launcher(): def get_default_args(func): signature = inspect.signature(func) - return { - k: v.default - for k, - v in signature.parameters.items() if v.default is not inspect.Parameter.empty - } + return {k: v.default for k, v in signature.parameters.items() if v.default is not inspect.Parameter.empty} # We need this hacky function since torch doesn't consistently name or place the input tensor args diff --git a/deepspeed/compression/basic_layer.py b/deepspeed/compression/basic_layer.py index a15f80da1..3b5d403d4 100644 --- a/deepspeed/compression/basic_layer.py +++ b/deepspeed/compression/basic_layer.py @@ -21,6 +21,7 @@ class QuantAct(nn.Module): Momentum for updating the activation quantization range. 
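QuantAct (above) calibrates its activation range with a momentum-weighted moving average of each batch's min/max, exactly as in the reformatted lines that follow. A small standalone sketch of that update rule, using made-up per-batch extremes purely to illustrate the arithmetic:

    momentum = 0.95                    # plays the role of act_range_momentum
    x_min_max = [0.0, 0.0]             # running [min, max]; all zeros means "not yet initialized"
    for x_min, x_max in [(-1.0, 1.2), (-0.8, 1.0), (-1.1, 0.9)]:
        if x_min_max[0] == x_min_max[1] == 0.0:
            x_min_max[0], x_min_max[1] = x_min, x_max            # first batch seeds the range
        else:
            x_min_max[0] = x_min_max[0] * momentum + x_min * (1 - momentum)
            x_min_max[1] = x_min_max[1] * momentum + x_max * (1 - momentum)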
quant_mode : str, default 'symmetric' """ + def __init__(self, act_range_momentum=0.95, quant_mode='symmetric'): super(QuantAct, self).__init__() @@ -50,10 +51,8 @@ class QuantAct(nn.Module): self.x_min_max[1] = x_max # if do not need momentum, please set self.act_range_momentum = 0 - self.x_min_max[0] = self.x_min_max[0] * self.act_range_momentum + x_min * ( - 1 - self.act_range_momentum) - self.x_min_max[1] = self.x_min_max[1] * self.act_range_momentum + x_max * ( - 1 - self.act_range_momentum) + self.x_min_max[0] = self.x_min_max[0] * self.act_range_momentum + x_min * (1 - self.act_range_momentum) + self.x_min_max[1] = self.x_min_max[1] * self.act_range_momentum + x_max * (1 - self.act_range_momentum) x_q = self.act_function(x, num_bits, self.x_min_max[0], self.x_min_max[1]) @@ -61,6 +60,7 @@ class QuantAct(nn.Module): class Embedding_Compress(nn.Embedding): + def __init__(self, *kargs): super(Embedding_Compress, self).__init__(*kargs) self.weight.start_bits = None @@ -71,17 +71,10 @@ class Embedding_Compress(nn.Embedding): def extra_repr(self): return 'num_embeddings={}, embedding_dim={}, weight_quantization={}'.format( - self.num_embeddings, - self.embedding_dim, - self.weight.target_bits) + self.num_embeddings, self.embedding_dim, self.weight.target_bits) - def enable_weight_quantization(self, - start_bits, - target_bits, - quantization_period, - weight_quantization_enabled_in_forward, - quantization_type, - num_groups): + def enable_weight_quantization(self, start_bits, target_bits, quantization_period, + weight_quantization_enabled_in_forward, quantization_type, num_groups): self.weight.start_bits = start_bits self.weight.target_bits = target_bits self.weight.q_period = quantization_period @@ -105,31 +98,20 @@ class Embedding_Compress(nn.Embedding): self.weight_quantize_num_groups = self.weight.size(0) def fix_weight_quantization(self): - self.weight.data = self.weight_quantizer(self.weight, - self.weight.target_bits, - None, - None, + self.weight.data = self.weight_quantizer(self.weight, self.weight.target_bits, None, None, self.weight_quantize_num_groups).data self.weight_quantization_enabled_in_forward = False return None def forward(self, input): if self.weight_quantization_enabled_in_forward and self.weight_quantization_enabled: - weight = self.weight_quantizer(self.weight, - self.weight.target_bits, - None, - None, + weight = self.weight_quantizer(self.weight, self.weight.target_bits, None, None, self.weight_quantize_num_groups) else: weight = self.weight - out = nn.functional.embedding(input, - weight, - self.padding_idx, - self.max_norm, - self.norm_type, - self.scale_grad_by_freq, - self.sparse) + out = nn.functional.embedding(input, weight, self.padding_idx, self.max_norm, self.norm_type, + self.scale_grad_by_freq, self.sparse) return out @@ -137,6 +119,7 @@ class LinearLayer_Compress(nn.Linear): """ Linear layer with compression. 
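Embedding_Compress and LinearLayer_Compress both defer to a weight_quantizer(weight, target_bits, None, None, num_groups) call whose implementation is not part of this diff. For orientation only, a rough sketch of symmetric per-group fake quantization; this is an assumption about the general technique, not the DeepSpeed quantizer itself:

    import torch

    def fake_quantize_symmetric(w: torch.Tensor, num_bits: int, num_groups: int) -> torch.Tensor:
        q_max = 2**(num_bits - 1) - 1
        grouped = w.reshape(num_groups, -1)                          # one scale per group
        scale = (grouped.abs().max(dim=1, keepdim=True).values / q_max).clamp(min=1e-8)
        return (torch.round(grouped / scale) * scale).reshape(w.shape)

    w_q = fake_quantize_symmetric(torch.randn(64, 64), num_bits=8, num_groups=64)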
""" + def __init__(self, *kargs, bias=True): super(LinearLayer_Compress, self).__init__(*kargs, bias=bias) self.sparse_pruning_method = None @@ -169,8 +152,7 @@ class LinearLayer_Compress(nn.Linear): mask = mask.to(self.weight.device) elif method == 'topk': self.sparse_mask_scores = nn.Parameter(torch.Tensor(self.weight.size())) - self.sparse_mask_scores.data = self.sparse_mask_scores.data.to( - self.weight.device) + self.sparse_mask_scores.data = self.sparse_mask_scores.data.to(self.weight.device) init.kaiming_uniform_(self.sparse_mask_scores, a=math.sqrt(5)) mask = None else: @@ -209,11 +191,9 @@ class LinearLayer_Compress(nn.Linear): raise NotImplementedError else: self.head_pruning_ratio = ratio - self.head_pruning_scores = nn.Parameter(torch.Tensor( - 1, - self.num_heads)) # we apply the pruning to O matrix - self.head_pruning_scores.data = self.head_pruning_scores.data.to( - self.weight.device) + self.head_pruning_scores = nn.Parameter(torch.Tensor(1, + self.num_heads)) # we apply the pruning to O matrix + self.head_pruning_scores.data = self.head_pruning_scores.data.to(self.weight.device) init.kaiming_uniform_(self.head_pruning_scores, a=math.sqrt(5)) def fix_sparse_pruning_helper(self): @@ -279,18 +259,17 @@ class LinearLayer_Compress(nn.Linear): start_bits = self.weight.start_bits target_bits = self.weight.target_bits q_period = self.weight.q_period - self.weight = nn.Parameter(self.weight.data.t().reshape(num_heads, -1)[mask.view(-1), :].reshape(-1, shape).t()) + self.weight = nn.Parameter(self.weight.data.t().reshape(num_heads, + -1)[mask.view(-1), :].reshape(-1, + shape).t()) self.weight.start_bits = start_bits self.weight.target_bits = target_bits self.weight.q_period = q_period else: shape = self.weight.size() - self.weight.data = (self.weight.data.t().reshape(self.num_heads, - -1) * - mask.view(-1, - 1)).reshape(shape[1], - shape[0]).t() + self.weight.data = (self.weight.data.t().reshape(self.num_heads, -1) * mask.view(-1, 1)).reshape( + shape[1], shape[0]).t() if self.head_pruning_method == 'topk': del self.head_pruning_scores @@ -316,37 +295,26 @@ class LinearLayer_Compress(nn.Linear): if self.sparse_pruning_method == 'l1': return self.sparse_pruning_mask.to(self.weight.device) elif self.sparse_pruning_method == 'topk': - return TopKBinarizer.apply(self.sparse_mask_scores, - self.sparse_pruning_ratio, - False) + return TopKBinarizer.apply(self.sparse_mask_scores, self.sparse_pruning_ratio, False) else: raise NotImplementedError if pruning_type == 'row': if self.row_pruning_method == 'l1': return self.row_pruning_mask.to(self.weight.device) elif self.row_pruning_method == 'topk': - return TopKBinarizer.apply(self.row_mask_scores, - self.row_pruning_ratio, - False) + return TopKBinarizer.apply(self.row_mask_scores, self.row_pruning_ratio, False) else: raise NotImplementedError elif pruning_type == 'head': if self.head_pruning_method == 'topk': - return TopKBinarizer.apply(self.head_pruning_scores, - self.head_pruning_ratio, - False) + return TopKBinarizer.apply(self.head_pruning_scores, self.head_pruning_ratio, False) else: raise NotImplementedError else: raise NotImplementedError - def enable_weight_quantization(self, - start_bits, - target_bits, - quantization_period, - weight_quantization_enabled_in_forward, - quantization_type, - num_groups): + def enable_weight_quantization(self, start_bits, target_bits, quantization_period, + weight_quantization_enabled_in_forward, quantization_type, num_groups): self.weight.start_bits = start_bits self.weight.target_bits = target_bits 
self.weight.q_period = quantization_period @@ -369,10 +337,7 @@ class LinearLayer_Compress(nn.Linear): self.weight_quantize_num_groups = num_groups def fix_weight_quantization(self): - self.weight.data = self.weight_quantizer(self.weight, - self.weight.target_bits, - None, - None, + self.weight.data = self.weight_quantizer(self.weight, self.weight.target_bits, None, None, self.weight_quantize_num_groups).data self.weight_quantization_enabled_in_forward = False return None @@ -391,18 +356,12 @@ class LinearLayer_Compress(nn.Linear): def head_pruning_reshape(self, w, mask): shape = w.shape - return (w.t().reshape(self.num_heads, - -1) * mask.view(-1, - 1)).reshape(shape[1], - shape[0]).t() + return (w.t().reshape(self.num_heads, -1) * mask.view(-1, 1)).reshape(shape[1], shape[0]).t() def forward(self, input, skip_bias_add=False): if self.weight_quantization_enabled_in_forward and self.weight_quantization_enabled: - weight = self.weight_quantizer(self.weight, - self.weight.target_bits, - None, - None, + weight = self.weight_quantizer(self.weight, self.weight.target_bits, None, None, self.weight_quantize_num_groups) bias = self.bias else: @@ -428,11 +387,7 @@ class LinearLayer_Compress(nn.Linear): num_groups = input.numel() // input.size(-1) else: num_groups = 1 - input = self.activation_quantizer(input, - self.activation_quantization_bits, - None, - None, - num_groups) + input = self.activation_quantizer(input, self.activation_quantization_bits, None, None, num_groups) if skip_bias_add: # used for mpu linear layers @@ -447,6 +402,7 @@ class Conv2dLayer_Compress(nn.Conv2d): """ Conv2D layer with compression. """ + def __init__(self, *kargs): super(Conv2dLayer_Compress, self).__init__(*kargs) self.sparse_pruning_method = None @@ -478,10 +434,8 @@ class Conv2dLayer_Compress(nn.Conv2d): output = s.format(**self.__dict__) return output + ' sparse pruning={}, channel pruning={}, activation quantization={}, weight_quantization={}'.format( - self.sparse_pruning_method is not None, - self.channel_pruning_method is not None, - self.activation_quantization_method is not None, - self.weight.target_bits) + self.sparse_pruning_method is not None, self.channel_pruning_method is not None, + self.activation_quantization_method is not None, self.weight.target_bits) def enable_sparse_pruning(self, ratio, method): self.sparse_pruning_ratio = ratio @@ -493,8 +447,7 @@ class Conv2dLayer_Compress(nn.Conv2d): mask = mask.to(self.weight.device) elif method == 'topk': self.sparse_mask_scores = nn.Parameter(torch.Tensor(self.weight.size())) - self.sparse_mask_scores.data = self.sparse_mask_scores.data.to( - self.weight.device) + self.sparse_mask_scores.data = self.sparse_mask_scores.data.to(self.weight.device) init.kaiming_uniform_(self.sparse_mask_scores, a=math.sqrt(5)) mask = None else: @@ -514,13 +467,8 @@ class Conv2dLayer_Compress(nn.Conv2d): mask = mask.view(-1, 1, 1, 1) mask = mask.to(self.weight.device) elif method == 'topk': - self.channel_mask_scores = nn.Parameter( - torch.Tensor(self.weight.size(0), - 1, - 1, - 1)) - self.channel_mask_scores.data = self.channel_mask_scores.data.to( - self.weight.device) + self.channel_mask_scores = nn.Parameter(torch.Tensor(self.weight.size(0), 1, 1, 1)) + self.channel_mask_scores.data = self.channel_mask_scores.data.to(self.weight.device) init.kaiming_uniform_(self.channel_mask_scores, a=math.sqrt(5)) mask = None else: @@ -579,39 +527,27 @@ class Conv2dLayer_Compress(nn.Conv2d): if self.sparse_pruning_method == 'l1': return self.sparse_pruning_mask.to(self.weight.device) 
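Several of the 'topk' pruning branches below call TopKBinarizer.apply(scores, ratio, False) to turn learned scores into a binary keep-mask. As a rough illustration of that idea only, ignoring the straight-through gradient the real autograd Function provides, a hypothetical helper might look like:

    import torch

    def topk_keep_mask(scores: torch.Tensor, dense_ratio: float) -> torch.Tensor:
        k = max(1, int(dense_ratio * scores.numel()))                # number of entries to keep
        threshold = torch.topk(scores.flatten(), k).values.min()
        return (scores >= threshold).to(scores.dtype)

    mask = topk_keep_mask(torch.randn(12, 768), dense_ratio=0.5)     # keep roughly half the entries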
elif self.sparse_pruning_method == 'topk': - return TopKBinarizer.apply(self.sparse_mask_scores, - self.sparse_pruning_ratio, - False) + return TopKBinarizer.apply(self.sparse_mask_scores, self.sparse_pruning_ratio, False) else: raise NotImplementedError elif pruning_type == 'channel': if self.channel_pruning_method == 'l1': return self.channel_pruning_mask.to(self.weight.device) elif self.channel_pruning_method == 'topk': - return TopKBinarizer.apply(self.channel_mask_scores, - self.channel_pruning_ratio, - False) + return TopKBinarizer.apply(self.channel_mask_scores, self.channel_pruning_ratio, False) else: raise NotImplementedError else: raise NotImplementedError def fix_weight_quantization(self): - self.weight.data = self.weight_quantizer(self.weight, - self.weight.target_bits, - None, - None, + self.weight.data = self.weight_quantizer(self.weight, self.weight.target_bits, None, None, self.weight_quantize_num_groups).data self.weight_quantization_enabled_in_forward = False return None - def enable_weight_quantization(self, - start_bits, - target_bits, - quantization_period, - weight_quantization_enabled_in_forward, - quantization_type, - num_groups): + def enable_weight_quantization(self, start_bits, target_bits, quantization_period, + weight_quantization_enabled_in_forward, quantization_type, num_groups): self.weight.start_bits = start_bits self.weight.target_bits = target_bits self.weight.q_period = quantization_period @@ -642,10 +578,7 @@ class Conv2dLayer_Compress(nn.Conv2d): def forward(self, input): if self.weight_quantization_enabled_in_forward and self.weight_quantization_enabled: - weight = self.weight_quantizer(self.weight, - self.weight.target_bits, - None, - None, + weight = self.weight_quantizer(self.weight, self.weight.target_bits, None, None, self.weight_quantize_num_groups) bias = self.bias else: @@ -667,22 +600,13 @@ class Conv2dLayer_Compress(nn.Conv2d): num_groups = input.numel() // input[0].numel() else: num_groups = 1 - input = self.activation_quantizer(input, - self.activation_quantization_bits, - None, - None, - num_groups) + input = self.activation_quantizer(input, self.activation_quantization_bits, None, None, num_groups) - return nn.functional.conv2d(input, - weight, - bias, - self.stride, - self.padding, - self.dilation, - self.groups) + return nn.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) class BNLayer_Compress(nn.BatchNorm2d): + def fix_channel_pruning_helper(self, mask, dim_reduction=True): self.weight = nn.Parameter(self.weight.data[mask.view(-1)]) self.bias = nn.Parameter(self.bias.data[mask.view(-1)]) @@ -770,6 +694,7 @@ def _gather(input_): class _CopyToModelParallelRegion(torch.autograd.Function): """Pass the input to the model parallel region.""" + @staticmethod def forward(ctx, input_): return input_ @@ -781,6 +706,7 @@ class _CopyToModelParallelRegion(torch.autograd.Function): class _ReduceFromModelParallelRegion(torch.autograd.Function): """All-redcue the input from the model parallel region.""" + @staticmethod def forward(ctx, input_): return _reduce(input_) @@ -792,6 +718,7 @@ class _ReduceFromModelParallelRegion(torch.autograd.Function): class _ScatterToModelParallelRegion(torch.autograd.Function): """Split the input and keep only the corresponding chuck to the rank.""" + @staticmethod def forward(ctx, input_): return _split(input_) @@ -803,6 +730,7 @@ class _ScatterToModelParallelRegion(torch.autograd.Function): class _GatherFromModelParallelRegion(torch.autograd.Function): """Gather the input 
from model parallel region and concatinate.""" + @staticmethod def forward(ctx, input_): return _gather(input_) @@ -834,13 +762,8 @@ def gather_from_model_parallel_region(input_): class ColumnParallelLinear_Compress(LinearLayer_Compress): - def __init__(self, - mpu, - input_size, - output_size, - bias=True, - gather_output=True, - skip_bias_add=False): + + def __init__(self, mpu, input_size, output_size, bias=True, gather_output=True, skip_bias_add=False): # Keep input parameters global g_mpu g_mpu = mpu @@ -854,10 +777,7 @@ class ColumnParallelLinear_Compress(LinearLayer_Compress): assert output_size % world_size == 0 self.output_size_per_partition = output_size // world_size - super(ColumnParallelLinear_Compress, - self).__init__(self.input_size, - self.output_size_per_partition, - bias=bias) + super(ColumnParallelLinear_Compress, self).__init__(self.input_size, self.output_size_per_partition, bias=bias) def forward(self, input_): # Set up backprop all-reduce. @@ -877,13 +797,8 @@ class ColumnParallelLinear_Compress(LinearLayer_Compress): class RowParallelLinear_Compress(LinearLayer_Compress): - def __init__(self, - mpu, - input_size, - output_size, - bias=True, - input_is_parallel=False, - skip_bias_add=False): + + def __init__(self, mpu, input_size, output_size, bias=True, input_is_parallel=False, skip_bias_add=False): # Keep input parameters global g_mpu g_mpu = mpu @@ -897,10 +812,7 @@ class RowParallelLinear_Compress(LinearLayer_Compress): assert input_size % world_size == 0 self.input_size_per_partition = input_size // world_size - super(RowParallelLinear_Compress, - self).__init__(self.input_size_per_partition, - self.output_size, - bias=bias) + super(RowParallelLinear_Compress, self).__init__(self.input_size_per_partition, self.output_size, bias=bias) def forward(self, input_): # Set up backprop all-reduce. diff --git a/deepspeed/compression/compress.py b/deepspeed/compression/compress.py index bf3b6c276..bf14b907a 100644 --- a/deepspeed/compression/compress.py +++ b/deepspeed/compression/compress.py @@ -13,21 +13,13 @@ def check_deepspeed_config(config): if isinstance(config, dict): return config elif os.path.exists(config): - return json.load(open(config, - "r"), - object_pairs_hook=dict_raise_error_on_duplicate_keys) + return json.load(open(config, "r"), object_pairs_hook=dict_raise_error_on_duplicate_keys) else: raise ValueError( - f"Expected a string path to an existing deepspeed config, or a dictionary. Received: {config}" - ) + f"Expected a string path to an existing deepspeed config, or a dictionary. Received: {config}") -def get_module_name(group_name, - model, - key_word, - exist_module_name, - mpu=None, - verbose=True): +def get_module_name(group_name, model, key_word, exist_module_name, mpu=None, verbose=True): ''' get the associated module name from the model based on the key_word provided by users ''' @@ -40,8 +32,7 @@ def get_module_name(group_name, if name in exist_module_name and verbose: # logger.warning raise ValueError( - f"{name} is already added to compression, please check your config file for {group_name}." 
- ) + f"{name} is already added to compression, please check your config file for {group_name}.") if name not in exist_module_name: exist_module_name.add(name) return_module_name.append(name) @@ -56,8 +47,7 @@ def get_compress_methods(model, compress_methods, mpu=None): continue # for loop different methods, i.e., weight quantization, activation quantization etc exist_module_name = set() - shared_parameters = method_content[ - SHARED_PARAMETERS] # get all the shared parameters + shared_parameters = method_content[SHARED_PARAMETERS] # get all the shared parameters for group_name, method_parameters in method_content[DIFFERENT_GROUPS].items(): # for loop different groups, i.e., weight quantization group 1, weight quantization group 2 etc module_name_list = [] @@ -65,8 +55,13 @@ def get_compress_methods(model, compress_methods, mpu=None): if method_parameters[DIFFERENT_GROUPS_RELATED_MODULE_SCOPE]: # this is used for head/row/channel pruning, if users provide the related module scope, we can shrink the layer dim for them # otherwise we just mask those as zeros - for key_word, related_key_words in zip(method_parameters[DIFFERENT_GROUPS_MODULE_SCOPE], method_parameters[DIFFERENT_GROUPS_RELATED_MODULE_SCOPE]): - module_name, exist_module_name = get_module_name(group_name, model, key_word, exist_module_name, mpu=mpu) + for key_word, related_key_words in zip(method_parameters[DIFFERENT_GROUPS_MODULE_SCOPE], + method_parameters[DIFFERENT_GROUPS_RELATED_MODULE_SCOPE]): + module_name, exist_module_name = get_module_name(group_name, + model, + key_word, + exist_module_name, + mpu=mpu) module_name_list.append(module_name) tmp_related_module_name_list = [] for rkw in related_key_words: @@ -76,7 +71,11 @@ def get_compress_methods(model, compress_methods, mpu=None): related_module_name_list.append(tmp_related_module_name_list) else: for key_word in method_parameters[DIFFERENT_GROUPS_MODULE_SCOPE]: - module_name, exist_module_name = get_module_name(group_name, model, key_word, exist_module_name, mpu=mpu) + module_name, exist_module_name = get_module_name(group_name, + model, + key_word, + exist_module_name, + mpu=mpu) module_name_list.append(module_name) if module_name_list: @@ -85,13 +84,7 @@ def get_compress_methods(model, compress_methods, mpu=None): **(method_parameters.copy().pop(DIFFERENT_GROUPS_PARAMETERS)), **shared_parameters } - compression_item = [ - module_name_list, - related_module_name_list, - { - method: combined_method_parameters - } - ] + compression_item = [module_name_list, related_module_name_list, {method: combined_method_parameters}] layer_added_compress_methods.append(compression_item) return layer_added_compress_methods @@ -118,9 +111,7 @@ def init_compression(model, deepspeed_config, teacher_model=None, mpu=None): assert teacher_model is not None, "Teacher model is required for layer reduction" student_initialization(c_model, teacher_model, deepspeed_config) - layer_added_compress_methods = get_compress_methods(c_model, - compress_methods, - mpu=mpu) + layer_added_compress_methods = get_compress_methods(c_model, compress_methods, mpu=mpu) compression_preparation(c_model, layer_added_compress_methods, mpu) return model @@ -143,31 +134,20 @@ def redundancy_clean(model, deepspeed_config, mpu=None): else: c_model = model - layer_added_compress_methods_tmp = get_compress_methods(c_model, - compress_methods, - mpu=mpu) + layer_added_compress_methods_tmp = get_compress_methods(c_model, compress_methods, mpu=mpu) # sort methods order_list = [ - WEIGHT_QUANTIZATION, - SPARSE_PRUNING, - 
ROW_PRUNING, - HEAD_PRUNING, - CHANNEL_PRUNING, - ACTIVATION_QUANTIZATION + WEIGHT_QUANTIZATION, SPARSE_PRUNING, ROW_PRUNING, HEAD_PRUNING, CHANNEL_PRUNING, ACTIVATION_QUANTIZATION ] - layer_added_compress_methods = sorted( - layer_added_compress_methods_tmp, - key=lambda x: order_list.index(list(x[2].keys())[0])) + layer_added_compress_methods = sorted(layer_added_compress_methods_tmp, + key=lambda x: order_list.index(list(x[2].keys())[0])) for module_name_lists, related_module_name_lists, compression_technique in layer_added_compress_methods: stored_mask = [] need_mask = True if related_module_name_lists else False for i, mnl in enumerate(module_name_lists): for module_name in mnl: - mask = fix_compression(c_model, - module_name, - compression_technique, - dim_reduction=need_mask) + mask = fix_compression(c_model, module_name, compression_technique, dim_reduction=need_mask) if need_mask: stored_mask.append(mask) if need_mask: @@ -219,10 +199,8 @@ def student_initialization(student_model, teacher_model, deepspeed_config): ''' assert len(student_layer) == len(teacher_layer) for s_name, t_name in zip(student_layer, teacher_layer): - s_module = recursive_getattr(student_model, - module_name_prefix + '.' + str(s_name)) - t_module = recursive_getattr(teacher_model, - module_name_prefix + '.' + str(t_name)) + s_module = recursive_getattr(student_model, module_name_prefix + '.' + str(s_name)) + t_module = recursive_getattr(teacher_model, module_name_prefix + '.' + str(t_name)) for s_param, t_param in zip(s_module.parameters(), t_module.parameters()): s_param.data.copy_(t_param.data) for name in other_module_name: diff --git a/deepspeed/compression/config.py b/deepspeed/compression/config.py index e6a710dfa..37dfb8b0d 100644 --- a/deepspeed/compression/config.py +++ b/deepspeed/compression/config.py @@ -36,9 +36,7 @@ def get_layer_reduction(param_dict): def get_layer_reduction_enabled(param_dict): if LAYER_REDUCTION in param_dict.keys(): - return get_scalar_param(param_dict[LAYER_REDUCTION], - LAYER_REDUCTION_ENABLED, - LAYER_REDUCTION_ENABLED_DEFAULT) + return get_scalar_param(param_dict[LAYER_REDUCTION], LAYER_REDUCTION_ENABLED, LAYER_REDUCTION_ENABLED_DEFAULT) else: return False @@ -70,7 +68,8 @@ def get_weight_quantization(param_dict): output[SHARED_PARAMETERS] = get_weight_quantization_shared_parameters(sub_param_dict) # each sub-groups if output[SHARED_PARAMETERS][WEIGHT_QUANTIZE_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Weigh Quantization is enabled, {DIFFERENT_GROUPS} must be specified" + assert DIFFERENT_GROUPS in sub_param_dict.keys( + ), f"Weigh Quantization is enabled, {DIFFERENT_GROUPS} must be specified" output[DIFFERENT_GROUPS] = get_weight_quantization_different_groups(sub_param_dict) return output @@ -79,51 +78,38 @@ def get_weight_quantization_shared_parameters(param_dict): output = {} if SHARED_PARAMETERS in param_dict.keys(): sub_param_dict = param_dict[SHARED_PARAMETERS] - output[WEIGHT_QUANTIZE_ENABLED] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_ENABLED, - WEIGHT_QUANTIZE_ENABLED_DEFAULT) - output[WEIGHT_QUANTIZE_KERNEL] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_KERNEL, - WEIGHT_QUANTIZE_KERNEL_DEFAULT) - output[WEIGHT_QUANTIZE_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_SCHEDULE_OFFSET, - WEIGHT_QUANTIZE_SCHEDULE_OFFSET_DEFAULT) - output[WEIGHT_QUANTIZE_GROUPS] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_GROUPS, - WEIGHT_QUANTIZE_GROUPS_DEFAULT) - output[WEIGHT_QUANTIZE_VERBOSE] = 
get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_VERBOSE, - WEIGHT_QUANTIZE_VERBOSE_DEFAULT) - output[WEIGHT_QUANTIZE_TYPE] = get_scalar_param(sub_param_dict, - WEIGHT_QUANTIZE_TYPE, + output[WEIGHT_QUANTIZE_ENABLED] = get_scalar_param(sub_param_dict, WEIGHT_QUANTIZE_ENABLED, + WEIGHT_QUANTIZE_ENABLED_DEFAULT) + output[WEIGHT_QUANTIZE_KERNEL] = get_scalar_param(sub_param_dict, WEIGHT_QUANTIZE_KERNEL, + WEIGHT_QUANTIZE_KERNEL_DEFAULT) + output[WEIGHT_QUANTIZE_SCHEDULE_OFFSET] = get_scalar_param(sub_param_dict, WEIGHT_QUANTIZE_SCHEDULE_OFFSET, + WEIGHT_QUANTIZE_SCHEDULE_OFFSET_DEFAULT) + output[WEIGHT_QUANTIZE_GROUPS] = get_scalar_param(sub_param_dict, WEIGHT_QUANTIZE_GROUPS, + WEIGHT_QUANTIZE_GROUPS_DEFAULT) + output[WEIGHT_QUANTIZE_VERBOSE] = get_scalar_param(sub_param_dict, WEIGHT_QUANTIZE_VERBOSE, + WEIGHT_QUANTIZE_VERBOSE_DEFAULT) + output[WEIGHT_QUANTIZE_TYPE] = get_scalar_param(sub_param_dict, WEIGHT_QUANTIZE_TYPE, WEIGHT_QUANTIZE_TYPE_DEFAULT) - output[WEIGHT_QUANTIZE_IN_FORWARD_ENABLED] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_IN_FORWARD_ENABLED, - WEIGHT_QUANTIZE_IN_FORWARD_ENABLED_DEFAULT) - assert output[WEIGHT_QUANTIZE_TYPE] in [WEIGHT_QUANTIZE_SYMMETRIC, WEIGHT_QUANTIZE_ASYMMETRIC], f"Invalid weight quantize type. Supported types: [{WEIGHT_QUANTIZE_SYMMETRIC}, {WEIGHT_QUANTIZE_ASYMMETRIC}]" - output[WEIGHT_QUANTIZE_ROUNDING] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_ROUNDING, - WEIGHT_QUANTIZE_ROUNDING_DEFAULT) - assert output[WEIGHT_QUANTIZE_ROUNDING] in [WEIGHT_QUANTIZE_NEAREST_ROUNDING, WEIGHT_QUANTIZE_STOCHASTIC_ROUNDING], f"Invalid weight quantize rounding. Supported types: [{WEIGHT_QUANTIZE_NEAREST_ROUNDING}, {WEIGHT_QUANTIZE_STOCHASTIC_ROUNDING}]" + output[WEIGHT_QUANTIZE_IN_FORWARD_ENABLED] = get_scalar_param(sub_param_dict, + WEIGHT_QUANTIZE_IN_FORWARD_ENABLED, + WEIGHT_QUANTIZE_IN_FORWARD_ENABLED_DEFAULT) + assert output[WEIGHT_QUANTIZE_TYPE] in [ + WEIGHT_QUANTIZE_SYMMETRIC, WEIGHT_QUANTIZE_ASYMMETRIC + ], f"Invalid weight quantize type. Supported types: [{WEIGHT_QUANTIZE_SYMMETRIC}, {WEIGHT_QUANTIZE_ASYMMETRIC}]" + output[WEIGHT_QUANTIZE_ROUNDING] = get_scalar_param(sub_param_dict, WEIGHT_QUANTIZE_ROUNDING, + WEIGHT_QUANTIZE_ROUNDING_DEFAULT) + assert output[WEIGHT_QUANTIZE_ROUNDING] in [ + WEIGHT_QUANTIZE_NEAREST_ROUNDING, WEIGHT_QUANTIZE_STOCHASTIC_ROUNDING + ], f"Invalid weight quantize rounding. 
Supported types: [{WEIGHT_QUANTIZE_NEAREST_ROUNDING}, {WEIGHT_QUANTIZE_STOCHASTIC_ROUNDING}]" if WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE in sub_param_dict.keys(): output[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE] = get_scalar_param( - sub_param_dict[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE], - WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED, + sub_param_dict[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE], WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED, WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED_DEFAULT) output[WEIGHT_QUANTIZE_CHANGE_RATIO] = get_scalar_param( - sub_param_dict[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE], - WEIGHT_QUANTIZE_CHANGE_RATIO, + sub_param_dict[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE], WEIGHT_QUANTIZE_CHANGE_RATIO, WEIGHT_QUANTIZE_CHANGE_RATIO_DEFAULT) else: - output[ - WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE] = WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED_DEFAULT + output[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE] = WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED_DEFAULT output[WEIGHT_QUANTIZE_CHANGE_RATIO] = WEIGHT_QUANTIZE_CHANGE_RATIO_DEFAULT else: output[WEIGHT_QUANTIZE_ENABLED] = WEIGHT_QUANTIZE_ENABLED_DEFAULT @@ -133,8 +119,7 @@ def get_weight_quantization_shared_parameters(param_dict): output[WEIGHT_QUANTIZE_VERBOSE] = WEIGHT_QUANTIZE_VERBOSE_DEFAULT output[WEIGHT_QUANTIZE_TYPE] = WEIGHT_QUANTIZE_TYPE_DEFAULT output[WEIGHT_QUANTIZE_ROUNDING] = WEIGHT_QUANTIZE_ROUNDING_DEFAULT - output[ - WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE] = WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED_DEFAULT + output[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE] = WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED_DEFAULT output[WEIGHT_QUANTIZE_CHANGE_RATIO] = WEIGHT_QUANTIZE_CHANGE_RATIO_DEFAULT return output @@ -144,27 +129,21 @@ def get_weight_quantization_different_groups(param_dict): sub_param_dict = param_dict[DIFFERENT_GROUPS] def get_params(name, group_dict): - assert WEIGHT_QUANTIZE_START_BITS in group_dict.keys(), f"{WEIGHT_QUANTIZE_START_BITS} must be specified for weight quantization group {name}" - assert WEIGHT_QUANTIZE_TARGET_BITS in group_dict.keys(), f"{WEIGHT_QUANTIZE_TARGET_BITS} must be specified for weight quantization group {name}" - group_dict[WEIGHT_QUANTIZATION_PERIOD] = get_scalar_param( - group_dict, - WEIGHT_QUANTIZATION_PERIOD, - WEIGHT_QUANTIZATION_PERIOD_DEFAULT) + assert WEIGHT_QUANTIZE_START_BITS in group_dict.keys( + ), f"{WEIGHT_QUANTIZE_START_BITS} must be specified for weight quantization group {name}" + assert WEIGHT_QUANTIZE_TARGET_BITS in group_dict.keys( + ), f"{WEIGHT_QUANTIZE_TARGET_BITS} must be specified for weight quantization group {name}" + group_dict[WEIGHT_QUANTIZATION_PERIOD] = get_scalar_param(group_dict, WEIGHT_QUANTIZATION_PERIOD, + WEIGHT_QUANTIZATION_PERIOD_DEFAULT) return group_dict for k, v in sub_param_dict.items(): output[k] = {} - output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params(k, sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param(sub_param_dict[k], DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + sub_param_dict[k], DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, 
DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) return output @@ -172,19 +151,15 @@ def get_weight_quantization_different_groups(param_dict): def get_activation_quantization(param_dict): output = {} if ACTIVATION_QUANTIZATION not in param_dict.keys(): - param_dict[ACTIVATION_QUANTIZATION] = { - SHARED_PARAMETERS: {}, - DIFFERENT_GROUPS: {} - } + param_dict[ACTIVATION_QUANTIZATION] = {SHARED_PARAMETERS: {}, DIFFERENT_GROUPS: {}} sub_param_dict = param_dict[ACTIVATION_QUANTIZATION] # shared parameters - output[SHARED_PARAMETERS] = get_activation_quantization_shared_parameters( - sub_param_dict) + output[SHARED_PARAMETERS] = get_activation_quantization_shared_parameters(sub_param_dict) # each sub-groups if output[SHARED_PARAMETERS][ACTIVATION_QUANTIZATION_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Activation Quantization is enabled, {DIFFERENT_GROUPS} must be specified" - output[DIFFERENT_GROUPS] = get_activation_quantization_different_groups( - sub_param_dict) + assert DIFFERENT_GROUPS in sub_param_dict.keys( + ), f"Activation Quantization is enabled, {DIFFERENT_GROUPS} must be specified" + output[DIFFERENT_GROUPS] = get_activation_quantization_different_groups(sub_param_dict) return output @@ -192,30 +167,26 @@ def get_activation_quantization_shared_parameters(param_dict): output = {} if SHARED_PARAMETERS in param_dict.keys(): sub_param_dict = param_dict[SHARED_PARAMETERS] - output[ACTIVATION_QUANTIZATION_ENABLED] = get_scalar_param( - sub_param_dict, - ACTIVATION_QUANTIZATION_ENABLED, - ACTIVATION_QUANTIZATION_ENABLED_DEFAULT) - output[ACTIVATION_QUANTIZE_TYPE] = get_scalar_param( - sub_param_dict, - ACTIVATION_QUANTIZE_TYPE, - ACTIVATION_QUANTIZE_TYPE_DEFAULT) - assert output[ACTIVATION_QUANTIZE_TYPE] in [ACTIVATION_QUANTIZE_SYMMETRIC, ACTIVATION_QUANTIZE_ASYMMETRIC], f"Invalid activation quantize type. Supported types: [{ACTIVATION_QUANTIZE_SYMMETRIC}, {ACTIVATION_QUANTIZE_ASYMMETRIC}]" - output[ACTIVATION_QUANTIZE_RANGE] = get_scalar_param( - sub_param_dict, - ACTIVATION_QUANTIZE_RANGE, - ACTIVATION_QUANTIZE_RANGE_DEFAULT) - assert output[ACTIVATION_QUANTIZE_RANGE] in [ACTIVATION_QUANTIZE_RANGE_DYNAMIC, ACTIVATION_QUANTIZE_RANGE_STATIC], f"Invalid activation quantize range calibration. Supported types: [{ACTIVATION_QUANTIZE_RANGE_DYNAMIC}, {ACTIVATION_QUANTIZE_RANGE_STATIC}]" - output[ACTIVATION_QUANTIZE_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - ACTIVATION_QUANTIZE_SCHEDULE_OFFSET, - ACTIVATION_QUANTIZE_SCHEDULE_OFFSET_DEFAULT) + output[ACTIVATION_QUANTIZATION_ENABLED] = get_scalar_param(sub_param_dict, ACTIVATION_QUANTIZATION_ENABLED, + ACTIVATION_QUANTIZATION_ENABLED_DEFAULT) + output[ACTIVATION_QUANTIZE_TYPE] = get_scalar_param(sub_param_dict, ACTIVATION_QUANTIZE_TYPE, + ACTIVATION_QUANTIZE_TYPE_DEFAULT) + assert output[ACTIVATION_QUANTIZE_TYPE] in [ + ACTIVATION_QUANTIZE_SYMMETRIC, ACTIVATION_QUANTIZE_ASYMMETRIC + ], f"Invalid activation quantize type. Supported types: [{ACTIVATION_QUANTIZE_SYMMETRIC}, {ACTIVATION_QUANTIZE_ASYMMETRIC}]" + output[ACTIVATION_QUANTIZE_RANGE] = get_scalar_param(sub_param_dict, ACTIVATION_QUANTIZE_RANGE, + ACTIVATION_QUANTIZE_RANGE_DEFAULT) + assert output[ACTIVATION_QUANTIZE_RANGE] in [ + ACTIVATION_QUANTIZE_RANGE_DYNAMIC, ACTIVATION_QUANTIZE_RANGE_STATIC + ], f"Invalid activation quantize range calibration. 
Supported types: [{ACTIVATION_QUANTIZE_RANGE_DYNAMIC}, {ACTIVATION_QUANTIZE_RANGE_STATIC}]" + output[ACTIVATION_QUANTIZE_SCHEDULE_OFFSET] = get_scalar_param(sub_param_dict, + ACTIVATION_QUANTIZE_SCHEDULE_OFFSET, + ACTIVATION_QUANTIZE_SCHEDULE_OFFSET_DEFAULT) else: output[ACTIVATION_QUANTIZATION_ENABLED] = ACTIVATION_QUANTIZATION_ENABLED_DEFAULT output[ACTIVATION_QUANTIZE_TYPE] = ACTIVATION_QUANTIZE_TYPE_DEFAULT output[ACTIVATION_QUANTIZE_RANGE] = ACTIVATION_QUANTIZE_RANGE_DEFAULT - output[ - ACTIVATION_QUANTIZE_SCHEDULE_OFFSET] = ACTIVATION_QUANTIZE_SCHEDULE_OFFSET_DEFAULT + output[ACTIVATION_QUANTIZE_SCHEDULE_OFFSET] = ACTIVATION_QUANTIZE_SCHEDULE_OFFSET_DEFAULT return output @@ -224,22 +195,17 @@ def get_activation_quantization_different_groups(param_dict): sub_param_dict = param_dict[DIFFERENT_GROUPS] def get_params(name, group_dict): - assert ACTIVATION_QUANTIZE_BITS in group_dict.keys(), f"{ACTIVATION_QUANTIZE_BITS} must be specified for activation quantization group {name}" + assert ACTIVATION_QUANTIZE_BITS in group_dict.keys( + ), f"{ACTIVATION_QUANTIZE_BITS} must be specified for activation quantization group {name}" return group_dict for k, v in sub_param_dict.items(): output[k] = {} - output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params(k, sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param(sub_param_dict[k], DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + sub_param_dict[k], DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) return output @@ -253,7 +219,8 @@ def get_sparse_pruning(param_dict): output[SHARED_PARAMETERS] = get_sparse_pruning_shared_parameters(sub_param_dict) # each sub-groups if output[SHARED_PARAMETERS][SPARSE_PRUNING_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Sparse Pruning is enabled, {DIFFERENT_GROUPS} must be specified" + assert DIFFERENT_GROUPS in sub_param_dict.keys( + ), f"Sparse Pruning is enabled, {DIFFERENT_GROUPS} must be specified" output[DIFFERENT_GROUPS] = get_sparse_pruning_different_groups(sub_param_dict) return output @@ -262,18 +229,15 @@ def get_sparse_pruning_shared_parameters(param_dict): output = {} if SHARED_PARAMETERS in param_dict.keys(): sub_param_dict = param_dict[SHARED_PARAMETERS] - output[SPARSE_PRUNING_ENABLED] = get_scalar_param( - sub_param_dict, - SPARSE_PRUNING_ENABLED, - SPARSE_PRUNING_ENABLED_DEFAULT) - output[SPARSE_PRUNING_METHOD] = get_scalar_param(sub_param_dict, - SPARSE_PRUNING_METHOD, + output[SPARSE_PRUNING_ENABLED] = get_scalar_param(sub_param_dict, SPARSE_PRUNING_ENABLED, + SPARSE_PRUNING_ENABLED_DEFAULT) + output[SPARSE_PRUNING_METHOD] = get_scalar_param(sub_param_dict, SPARSE_PRUNING_METHOD, SPARSE_PRUNING_METHOD_DEFAULT) - assert output[SPARSE_PRUNING_METHOD] in [SPARSE_PRUNING_METHOD_L1, SPARSE_PRUNING_METHOD_TOPK], f"Invalid sparse pruning method. 
Supported types: [{SPARSE_PRUNING_METHOD_L1}, {SPARSE_PRUNING_METHOD_TOPK}]" - output[SPARSE_PRUNING_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - SPARSE_PRUNING_SCHEDULE_OFFSET, - SPARSE_PRUNING_SCHEDULE_OFFSET_DEFAULT) + assert output[SPARSE_PRUNING_METHOD] in [ + SPARSE_PRUNING_METHOD_L1, SPARSE_PRUNING_METHOD_TOPK + ], f"Invalid sparse pruning method. Supported types: [{SPARSE_PRUNING_METHOD_L1}, {SPARSE_PRUNING_METHOD_TOPK}]" + output[SPARSE_PRUNING_SCHEDULE_OFFSET] = get_scalar_param(sub_param_dict, SPARSE_PRUNING_SCHEDULE_OFFSET, + SPARSE_PRUNING_SCHEDULE_OFFSET_DEFAULT) else: output[SPARSE_PRUNING_ENABLED] = SPARSE_PRUNING_ENABLED_DEFAULT output[SPARSE_PRUNING_METHOD] = SPARSE_PRUNING_METHOD_DEFAULT @@ -286,22 +250,17 @@ def get_sparse_pruning_different_groups(param_dict): sub_param_dict = param_dict[DIFFERENT_GROUPS] def get_params(name, group_dict): - assert SPARSE_PRUNING_DENSE_RATIO in group_dict.keys(), f"{SPARSE_PRUNING_DENSE_RATIO} must be specified for sparse pruning group {name}" + assert SPARSE_PRUNING_DENSE_RATIO in group_dict.keys( + ), f"{SPARSE_PRUNING_DENSE_RATIO} must be specified for sparse pruning group {name}" return group_dict for k, v in sub_param_dict.items(): output[k] = {} - output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params(k, sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param(sub_param_dict[k], DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + sub_param_dict[k], DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) return output @@ -315,7 +274,8 @@ def get_row_pruning(param_dict): output[SHARED_PARAMETERS] = get_row_pruning_shared_parameters(sub_param_dict) # each sub-groups if output[SHARED_PARAMETERS][ROW_PRUNING_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Row Pruning is enabled, {DIFFERENT_GROUPS} must be specified" + assert DIFFERENT_GROUPS in sub_param_dict.keys( + ), f"Row Pruning is enabled, {DIFFERENT_GROUPS} must be specified" output[DIFFERENT_GROUPS] = get_row_pruning_different_groups(sub_param_dict) return output @@ -324,17 +284,14 @@ def get_row_pruning_shared_parameters(param_dict): output = {} if SHARED_PARAMETERS in param_dict.keys(): sub_param_dict = param_dict[SHARED_PARAMETERS] - output[ROW_PRUNING_ENABLED] = get_scalar_param(sub_param_dict, - ROW_PRUNING_ENABLED, + output[ROW_PRUNING_ENABLED] = get_scalar_param(sub_param_dict, ROW_PRUNING_ENABLED, ROW_PRUNING_ENABLED_DEFAULT) - output[ROW_PRUNING_METHOD] = get_scalar_param(sub_param_dict, - ROW_PRUNING_METHOD, - ROW_PRUNING_METHOD_DEFAULT) - assert output[ROW_PRUNING_METHOD] in [ROW_PRUNING_METHOD_L1, ROW_PRUNING_METHOD_TOPK], f"Invalid row pruning method. 
Supported types: [{ROW_PRUNING_METHOD_L1}, {ROW_PRUNING_METHOD_TOPK}]" - output[ROW_PRUNING_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - ROW_PRUNING_SCHEDULE_OFFSET, - ROW_PRUNING_SCHEDULE_OFFSET_DEFAULT) + output[ROW_PRUNING_METHOD] = get_scalar_param(sub_param_dict, ROW_PRUNING_METHOD, ROW_PRUNING_METHOD_DEFAULT) + assert output[ROW_PRUNING_METHOD] in [ + ROW_PRUNING_METHOD_L1, ROW_PRUNING_METHOD_TOPK + ], f"Invalid row pruning method. Supported types: [{ROW_PRUNING_METHOD_L1}, {ROW_PRUNING_METHOD_TOPK}]" + output[ROW_PRUNING_SCHEDULE_OFFSET] = get_scalar_param(sub_param_dict, ROW_PRUNING_SCHEDULE_OFFSET, + ROW_PRUNING_SCHEDULE_OFFSET_DEFAULT) else: output[ROW_PRUNING_ENABLED] = ROW_PRUNING_ENABLED_DEFAULT output[ROW_PRUNING_METHOD] = ROW_PRUNING_METHOD_DEFAULT @@ -347,22 +304,17 @@ def get_row_pruning_different_groups(param_dict): sub_param_dict = param_dict[DIFFERENT_GROUPS] def get_params(name, group_dict): - assert ROW_PRUNING_DENSE_RATIO in group_dict.keys(), f"{ROW_PRUNING_DENSE_RATIO} must be specified for row pruning group {name}" + assert ROW_PRUNING_DENSE_RATIO in group_dict.keys( + ), f"{ROW_PRUNING_DENSE_RATIO} must be specified for row pruning group {name}" return group_dict for k, v in sub_param_dict.items(): output[k] = {} - output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params(k, sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param(sub_param_dict[k], DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + sub_param_dict[k], DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) return output @@ -375,7 +327,8 @@ def get_head_pruning(param_dict): output[SHARED_PARAMETERS] = get_head_pruning_shared_parameters(sub_param_dict) # each sub-groups if output[SHARED_PARAMETERS][HEAD_PRUNING_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Head Pruning is enabled, {DIFFERENT_GROUPS} must be specified" + assert DIFFERENT_GROUPS in sub_param_dict.keys( + ), f"Head Pruning is enabled, {DIFFERENT_GROUPS} must be specified" output[DIFFERENT_GROUPS] = get_head_pruning_different_groups(sub_param_dict) return output @@ -384,19 +337,18 @@ def get_head_pruning_shared_parameters(param_dict): output = {} if SHARED_PARAMETERS in param_dict.keys(): sub_param_dict = param_dict[SHARED_PARAMETERS] - output[HEAD_PRUNING_ENABLED] = get_scalar_param(sub_param_dict, - HEAD_PRUNING_ENABLED, + output[HEAD_PRUNING_ENABLED] = get_scalar_param(sub_param_dict, HEAD_PRUNING_ENABLED, HEAD_PRUNING_ENABLED_DEFAULT) - output[HEAD_PRUNING_METHOD] = get_scalar_param(sub_param_dict, - HEAD_PRUNING_METHOD, + output[HEAD_PRUNING_METHOD] = get_scalar_param(sub_param_dict, HEAD_PRUNING_METHOD, HEAD_PRUNING_METHOD_DEFAULT) - assert output[HEAD_PRUNING_METHOD] in [HEAD_PRUNING_METHOD_L1, HEAD_PRUNING_METHOD_TOPK], f"Invalid head pruning method. 
Supported types: [{HEAD_PRUNING_METHOD_L1}, {HEAD_PRUNING_METHOD_TOPK}]" - output[HEAD_PRUNING_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - HEAD_PRUNING_SCHEDULE_OFFSET, - HEAD_PRUNING_SCHEDULE_OFFSET_DEFAULT) + assert output[HEAD_PRUNING_METHOD] in [ + HEAD_PRUNING_METHOD_L1, HEAD_PRUNING_METHOD_TOPK + ], f"Invalid head pruning method. Supported types: [{HEAD_PRUNING_METHOD_L1}, {HEAD_PRUNING_METHOD_TOPK}]" + output[HEAD_PRUNING_SCHEDULE_OFFSET] = get_scalar_param(sub_param_dict, HEAD_PRUNING_SCHEDULE_OFFSET, + HEAD_PRUNING_SCHEDULE_OFFSET_DEFAULT) if output[HEAD_PRUNING_ENABLED]: - assert HEAD_PRUNING_NUM_HEADS in sub_param_dict.keys(), f"{HEAD_PRUNING_NUM_HEADS} must be specified for head pruning" + assert HEAD_PRUNING_NUM_HEADS in sub_param_dict.keys( + ), f"{HEAD_PRUNING_NUM_HEADS} must be specified for head pruning" output[HEAD_PRUNING_NUM_HEADS] = sub_param_dict[HEAD_PRUNING_NUM_HEADS] else: output[HEAD_PRUNING_ENABLED] = HEAD_PRUNING_ENABLED_DEFAULT @@ -410,22 +362,17 @@ def get_head_pruning_different_groups(param_dict): sub_param_dict = param_dict[DIFFERENT_GROUPS] def get_params(name, group_dict): - assert HEAD_PRUNING_DENSE_RATIO in group_dict.keys(), f"dense_ratio must be specified for head pruning group {name}" + assert HEAD_PRUNING_DENSE_RATIO in group_dict.keys( + ), f"dense_ratio must be specified for head pruning group {name}" return group_dict for k, v in sub_param_dict.items(): output[k] = {} - output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params(k, sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param(sub_param_dict[k], DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + sub_param_dict[k], DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) return output @@ -438,7 +385,8 @@ def get_channel_pruning(param_dict): output[SHARED_PARAMETERS] = get_channel_pruning_shared_parameters(sub_param_dict) # each sub-groups if output[SHARED_PARAMETERS][CHANNEL_PRUNING_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Sparse Pruning is enabled, {DIFFERENT_GROUPS} must be specified" + assert DIFFERENT_GROUPS in sub_param_dict.keys( + ), f"Sparse Pruning is enabled, {DIFFERENT_GROUPS} must be specified" output[DIFFERENT_GROUPS] = get_channel_pruning_different_groups(sub_param_dict) return output @@ -447,19 +395,15 @@ def get_channel_pruning_shared_parameters(param_dict): output = {} if SHARED_PARAMETERS in param_dict.keys(): sub_param_dict = param_dict[SHARED_PARAMETERS] - output[CHANNEL_PRUNING_ENABLED] = get_scalar_param( - sub_param_dict, - CHANNEL_PRUNING_ENABLED, - CHANNEL_PRUNING_ENABLED_DEFAULT) - output[CHANNEL_PRUNING_METHOD] = get_scalar_param( - sub_param_dict, - CHANNEL_PRUNING_METHOD, - CHANNEL_PRUNING_METHOD_DEFAULT) - assert output[CHANNEL_PRUNING_METHOD] in [CHANNEL_PRUNING_METHOD_L1, CHANNEL_PRUNING_METHOD_TOPK], f"Invalid channel pruning method. 
Supported types: [{CHANNEL_PRUNING_METHOD_L1}, {CHANNEL_PRUNING_METHOD_TOPK}]" - output[CHANNEL_PRUNING_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - CHANNEL_PRUNING_SCHEDULE_OFFSET, - CHANNEL_PRUNING_SCHEDULE_OFFSET_DEFAULT) + output[CHANNEL_PRUNING_ENABLED] = get_scalar_param(sub_param_dict, CHANNEL_PRUNING_ENABLED, + CHANNEL_PRUNING_ENABLED_DEFAULT) + output[CHANNEL_PRUNING_METHOD] = get_scalar_param(sub_param_dict, CHANNEL_PRUNING_METHOD, + CHANNEL_PRUNING_METHOD_DEFAULT) + assert output[CHANNEL_PRUNING_METHOD] in [ + CHANNEL_PRUNING_METHOD_L1, CHANNEL_PRUNING_METHOD_TOPK + ], f"Invalid channel pruning method. Supported types: [{CHANNEL_PRUNING_METHOD_L1}, {CHANNEL_PRUNING_METHOD_TOPK}]" + output[CHANNEL_PRUNING_SCHEDULE_OFFSET] = get_scalar_param(sub_param_dict, CHANNEL_PRUNING_SCHEDULE_OFFSET, + CHANNEL_PRUNING_SCHEDULE_OFFSET_DEFAULT) else: output[CHANNEL_PRUNING_ENABLED] = CHANNEL_PRUNING_ENABLED_DEFAULT output[CHANNEL_PRUNING_METHOD] = CHANNEL_PRUNING_METHOD_DEFAULT @@ -472,21 +416,16 @@ def get_channel_pruning_different_groups(param_dict): sub_param_dict = param_dict[DIFFERENT_GROUPS] def get_params(name, group_dict): - assert CHANNEL_PRUNING_DENSE_RATIO in group_dict.keys(), f"{CHANNEL_PRUNING_DENSE_RATIO} must be specified for channel pruning group {name}" + assert CHANNEL_PRUNING_DENSE_RATIO in group_dict.keys( + ), f"{CHANNEL_PRUNING_DENSE_RATIO} must be specified for channel pruning group {name}" return group_dict for k, v in sub_param_dict.items(): output[k] = {} - output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params(k, sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param(sub_param_dict[k], DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + sub_param_dict[k], DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) return output diff --git a/deepspeed/compression/helper.py b/deepspeed/compression/helper.py index e839a5d03..12052e239 100644 --- a/deepspeed/compression/helper.py +++ b/deepspeed/compression/helper.py @@ -59,23 +59,17 @@ def module_replacement(model, module_name, compression_technique=None, mpu=None) need_bias = True # Initialize the new module - if isinstance(old_module, - LinearLayer_Compress) or isinstance(old_module, - torch.nn.Linear): + if isinstance(old_module, LinearLayer_Compress) or isinstance(old_module, torch.nn.Linear): if isinstance(old_module, LinearLayer_Compress): new_module = old_module else: - new_module = LinearLayer_Compress(old_module.in_features, - old_module.out_features, - bias=need_bias).to( - device=old_module.weight.device, - dtype=old_module.weight.dtype) + new_module = LinearLayer_Compress(old_module.in_features, old_module.out_features, + bias=need_bias).to(device=old_module.weight.device, + dtype=old_module.weight.dtype) new_module.weight.data = old_module.weight.data if need_bias: new_module.bias.data = old_module.bias.data - elif isinstance(old_module, - Conv2dLayer_Compress) or isinstance(old_module, - torch.nn.Conv2d): + elif isinstance(old_module, Conv2dLayer_Compress) 
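# A minimal sketch of the "compression_training" sub-config that the parsing helpers
# above consume: each technique carries "shared_parameters" plus per-group settings
# under "different_groups". Key names follow the DeepSpeed compression tutorial and
# deepspeed/compression/constants.py; the module names and ratios are illustrative only.
example_compression_config = {
    "sparse_pruning": {
        "shared_parameters": {
            "enabled": True,
            "method": "l1",            # or "topk", matching the assert above
            "schedule_offset": 1000
        },
        "different_groups": {
            "sp_group_1": {
                "params": {"dense_ratio": 0.5},     # required per-group parameter
                "modules": ["attention.self"],      # module scope (illustrative)
                "related_modules": None
            }
        }
    }
}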
or isinstance(old_module, torch.nn.Conv2d): if isinstance(old_module, Conv2dLayer_Compress): new_module = old_module else: @@ -86,60 +80,48 @@ def module_replacement(model, module_name, compression_technique=None, mpu=None) if need_bias: new_module.bias.data = old_module.bias.data elif isinstance(old_module, torch.nn.BatchNorm2d): - new_module = BNLayer_Compress(old_module.num_features, - old_module.eps, - old_module.momentum, - old_module.affine, - old_module.track_running_stats).to( - old_module.weight.device, - old_module.weight.dtype) + new_module = BNLayer_Compress(old_module.num_features, old_module.eps, old_module.momentum, old_module.affine, + old_module.track_running_stats).to(old_module.weight.device, + old_module.weight.dtype) new_module.weight.data = old_module.weight.data if need_bias: new_module.bias.data = old_module.bias.data new_module.running_mean.data = old_module.running_mean.data new_module.running_var.data = old_module.running_var.data - elif isinstance(old_module, - Embedding_Compress) or isinstance(old_module, - torch.nn.Embedding): + elif isinstance(old_module, Embedding_Compress) or isinstance(old_module, torch.nn.Embedding): if isinstance(old_module, Embedding_Compress): new_module = old_module else: new_module = Embedding_Compress(old_module.num_embeddings, old_module.embedding_dim, old_module.padding_idx, old_module.max_norm, old_module.norm_type, \ old_module.scale_grad_by_freq, old_module.sparse).to(device=old_module.weight.device, dtype=old_module.weight.dtype) new_module.weight.data = old_module.weight.data - elif mpu is not None and (isinstance(old_module, - ColumnParallelLinear_Compress) - or isinstance(old_module, - mpu.ColumnParallelLinear)): + elif mpu is not None and (isinstance(old_module, ColumnParallelLinear_Compress) + or isinstance(old_module, mpu.ColumnParallelLinear)): if isinstance(old_module, ColumnParallelLinear_Compress): new_module = old_module else: - new_module = ColumnParallelLinear_Compress( - mpu, - old_module.input_size, - old_module.output_size, - gather_output=old_module.gather_output, - skip_bias_add=old_module.skip_bias_add, - bias=need_bias).to(device=old_module.weight.device, - dtype=old_module.weight.dtype) + new_module = ColumnParallelLinear_Compress(mpu, + old_module.input_size, + old_module.output_size, + gather_output=old_module.gather_output, + skip_bias_add=old_module.skip_bias_add, + bias=need_bias).to(device=old_module.weight.device, + dtype=old_module.weight.dtype) new_module.weight.data = old_module.weight.data if need_bias: new_module.bias.data = old_module.bias.data - elif mpu is not None and (isinstance(old_module, - RowParallelLinear_Compress) - or isinstance(old_module, - mpu.RowParallelLinear)): + elif mpu is not None and (isinstance(old_module, RowParallelLinear_Compress) + or isinstance(old_module, mpu.RowParallelLinear)): if isinstance(old_module, RowParallelLinear_Compress): new_module = old_module else: - new_module = RowParallelLinear_Compress( - mpu, - old_module.input_size, - old_module.output_size, - input_is_parallel=old_module.input_is_parallel, - skip_bias_add=old_module.skip_bias_add, - bias=need_bias).to(device=old_module.weight.device, - dtype=old_module.weight.dtype) + new_module = RowParallelLinear_Compress(mpu, + old_module.input_size, + old_module.output_size, + input_is_parallel=old_module.input_is_parallel, + skip_bias_add=old_module.skip_bias_add, + bias=need_bias).to(device=old_module.weight.device, + dtype=old_module.weight.dtype) new_module.weight.data = old_module.weight.data if 
need_bias: new_module.bias.data = old_module.bias.data @@ -150,39 +132,30 @@ def module_replacement(model, module_name, compression_technique=None, mpu=None) for k, v in compression_technique.items(): if k == SPARSE_PRUNING: if v[SPARSE_PRUNING_ENABLED]: - new_module.enable_sparse_pruning(v[SPARSE_PRUNING_DENSE_RATIO], - v[SPARSE_PRUNING_METHOD]) + new_module.enable_sparse_pruning(v[SPARSE_PRUNING_DENSE_RATIO], v[SPARSE_PRUNING_METHOD]) elif k == ROW_PRUNING: if v[ROW_PRUNING_ENABLED]: - new_module.enable_row_pruning(v[ROW_PRUNING_DENSE_RATIO], - v[ROW_PRUNING_METHOD]) + new_module.enable_row_pruning(v[ROW_PRUNING_DENSE_RATIO], v[ROW_PRUNING_METHOD]) elif k == HEAD_PRUNING: if v[HEAD_PRUNING_ENABLED]: - new_module.enable_head_pruning(v[HEAD_PRUNING_DENSE_RATIO], - v[HEAD_PRUNING_METHOD], + new_module.enable_head_pruning(v[HEAD_PRUNING_DENSE_RATIO], v[HEAD_PRUNING_METHOD], v[HEAD_PRUNING_NUM_HEADS]) elif k == ACTIVATION_QUANTIZATION: if v[ACTIVATION_QUANTIZATION_ENABLED]: - new_module.enable_activation_quantization( - v[ACTIVATION_QUANTIZE_BITS], - v[ACTIVATION_QUANTIZE_TYPE], - v[ACTIVATION_QUANTIZE_RANGE]) + new_module.enable_activation_quantization(v[ACTIVATION_QUANTIZE_BITS], v[ACTIVATION_QUANTIZE_TYPE], + v[ACTIVATION_QUANTIZE_RANGE]) elif k == WEIGHT_QUANTIZATION: if v[WEIGHT_QUANTIZE_ENABLED]: - new_module.enable_weight_quantization( - v[WEIGHT_QUANTIZE_START_BITS], - v[WEIGHT_QUANTIZE_TARGET_BITS], - v[WEIGHT_QUANTIZATION_PERIOD], - v[WEIGHT_QUANTIZE_IN_FORWARD_ENABLED], - v[WEIGHT_QUANTIZE_TYPE], - v[WEIGHT_QUANTIZE_GROUPS]) + new_module.enable_weight_quantization(v[WEIGHT_QUANTIZE_START_BITS], + v[WEIGHT_QUANTIZE_TARGET_BITS], + v[WEIGHT_QUANTIZATION_PERIOD], + v[WEIGHT_QUANTIZE_IN_FORWARD_ENABLED], + v[WEIGHT_QUANTIZE_TYPE], v[WEIGHT_QUANTIZE_GROUPS]) elif k == CHANNEL_PRUNING: if v[CHANNEL_PRUNING_ENABLED]: - new_module.enable_channel_pruning(v[CHANNEL_PRUNING_DENSE_RATIO], - v[CHANNEL_PRUNING_METHOD]) + new_module.enable_channel_pruning(v[CHANNEL_PRUNING_DENSE_RATIO], v[CHANNEL_PRUNING_METHOD]) else: - raise NotImplementedError( - 'Compression technique {} is not implemented'.format(k)) + raise NotImplementedError('Compression technique {} is not implemented'.format(k)) # Replace the old module with the new one recursive_setattr(model, module_name, new_module) @@ -195,10 +168,7 @@ def is_module_compressible(module, mpu=None): isinstance(module, torch.nn.BatchNorm2d) if mpu is not None: - ret = ret or isinstance(module, - mpu.RowParallelLinear) or isinstance( - module, - mpu.ColumnParallelLinear) + ret = ret or isinstance(module, mpu.RowParallelLinear) or isinstance(module, mpu.ColumnParallelLinear) return ret @@ -225,11 +195,7 @@ def compression_preparation(model, compression_techinique_list, mpu): return model -def fix_compression(model, - module_name, - compression_technique, - mask=None, - dim_reduction=False): +def fix_compression(model, module_name, compression_technique, mask=None, dim_reduction=False): """ Fix the compression technique of a module. 
Args: @@ -243,17 +209,14 @@ def fix_compression(model, # Here we can make things much simpler by just replacing the module module = recursive_getattr(model, module_name) for k, v in compression_technique.items(): - if k == WEIGHT_QUANTIZATION and v[WEIGHT_QUANTIZE_IN_FORWARD_ENABLED] and v[ - WEIGHT_QUANTIZE_ENABLED]: + if k == WEIGHT_QUANTIZATION and v[WEIGHT_QUANTIZE_IN_FORWARD_ENABLED] and v[WEIGHT_QUANTIZE_ENABLED]: return module.fix_weight_quantization() elif k == SPARSE_PRUNING and v[SPARSE_PRUNING_ENABLED]: return module.fix_sparse_pruning_helper() elif k == ROW_PRUNING and (v[ROW_PRUNING_ENABLED] or mask is not None): return module.fix_row_col_pruning_helper(mask, dim_reduction=dim_reduction) elif k == HEAD_PRUNING and (v[HEAD_PRUNING_ENABLED] or mask is not None): - return module.fix_head_pruning_helper(mask, - v[HEAD_PRUNING_NUM_HEADS], - dim_reduction=dim_reduction) + return module.fix_head_pruning_helper(mask, v[HEAD_PRUNING_NUM_HEADS], dim_reduction=dim_reduction) elif k == CHANNEL_PRUNING and (v[CHANNEL_PRUNING_ENABLED] or mask is not None): return module.fix_channel_pruning_helper(mask, dim_reduction=dim_reduction) @@ -270,10 +233,9 @@ def convert_conv1d_to_linear(model, convert_type): for name, module in c_model.named_modules(): if isinstance(module, convert_type): old_module = recursive_getattr(c_model, name) - new_module = torch.nn.Linear( - old_module.weight.data.size(0), - old_module.weight.data.size(1), - bias=True if old_module.bias is not None else False) + new_module = torch.nn.Linear(old_module.weight.data.size(0), + old_module.weight.data.size(1), + bias=True if old_module.bias is not None else False) new_module.weight.data = old_module.weight.data.t().contiguous() if new_module.bias is not None: new_module.bias.data = old_module.bias.data.view(-1) diff --git a/deepspeed/compression/scheduler.py b/deepspeed/compression/scheduler.py index 67955a825..7755e94c3 100644 --- a/deepspeed/compression/scheduler.py +++ b/deepspeed/compression/scheduler.py @@ -10,6 +10,7 @@ class compression_scheduler(): ''' Used to schedule different compression methods ''' + def __init__(self, model, compression_config): self.model = model self.compression_config = compression_config @@ -38,22 +39,22 @@ class compression_scheduler(): } exist_module_name = set() shared_parameters = method_content[SHARED_PARAMETERS] - self.different_compression_methods[method][ - TECHNIQUE_ENABLED] = shared_parameters[TECHNIQUE_ENABLED] - self.different_compression_methods[method][ - SHARED_PARAMETERS] = shared_parameters + self.different_compression_methods[method][TECHNIQUE_ENABLED] = shared_parameters[TECHNIQUE_ENABLED] + self.different_compression_methods[method][SHARED_PARAMETERS] = shared_parameters for group_name, method_parameters in method_content[DIFFERENT_GROUPS].items(): module_name_list = [] for key_word in method_parameters[DIFFERENT_GROUPS_MODULE_SCOPE]: - module_name, exist_module_name = get_module_name(group_name, self.model, key_word, exist_module_name, verbose=False) + module_name, exist_module_name = get_module_name(group_name, + self.model, + key_word, + exist_module_name, + verbose=False) module_name_list.extend(module_name) if module_name_list: - self.different_compression_methods[method][DIFFERENT_GROUPS].append([ - group_name, - module_name_list, - method_parameters.copy().pop('params') - ]) + self.different_compression_methods[method][DIFFERENT_GROUPS].append( + [group_name, module_name_list, + method_parameters.copy().pop('params')]) def check_weight_quantization(self): # check 
weight quantization @@ -69,8 +70,7 @@ class compression_scheduler(): module.weight_quantization_enabled = True if not self.verbose[WEIGHT_QUANTIZATION]: - logger.info( - f'Weight quantization is enabled at step {self.training_steps}') + logger.info(f'Weight quantization is enabled at step {self.training_steps}') self.weight_quantization_enabled = True self.verbose[WEIGHT_QUANTIZATION] = True @@ -87,9 +87,7 @@ class compression_scheduler(): module = recursive_getattr(self.model, module_name) module.activation_quantization_enabled = True if not self.verbose[ACTIVATION_QUANTIZATION]: - logger.info( - f'Activation quantization is enabled at step {self.training_steps}' - ) + logger.info(f'Activation quantization is enabled at step {self.training_steps}') self.verbose[ACTIVATION_QUANTIZATION] = True def check_sparse_pruning(self): @@ -105,8 +103,7 @@ class compression_scheduler(): module = recursive_getattr(self.model, module_name) module.sparse_pruning_enabled = True if not self.verbose[SPARSE_PRUNING]: - logger.info( - f'Sparse pruning is enabled at step {self.training_steps}') + logger.info(f'Sparse pruning is enabled at step {self.training_steps}') self.verbose[SPARSE_PRUNING] = True def check_head_pruning(self): @@ -154,8 +151,7 @@ class compression_scheduler(): module = recursive_getattr(self.model, module_name) module.channel_pruning_enabled = True if not self.verbose[CHANNEL_PRUNING]: - logger.info( - f'Channel pruning is enabled at step {self.training_steps}') + logger.info(f'Channel pruning is enabled at step {self.training_steps}') self.verbose[CHANNEL_PRUNING] = True def check_all_modules(self): diff --git a/deepspeed/compression/utils.py b/deepspeed/compression/utils.py index 2492c8ce9..9f6d50b45 100644 --- a/deepspeed/compression/utils.py +++ b/deepspeed/compression/utils.py @@ -13,6 +13,7 @@ class TopKBinarizer(autograd.Function): Implementation is inspired from: https://github.com/yaozhewei/MLPruning """ + @staticmethod def forward(ctx, inputs: torch.tensor, threshold: float, sigmoid: bool): """ @@ -59,6 +60,7 @@ class SymQuantizer(torch.autograd.Function): """ Symmetric quantization """ + @staticmethod def forward(ctx, input, num_bits, min_value=None, max_value=None, num_groups=1): """ @@ -75,9 +77,8 @@ class SymQuantizer(torch.autograd.Function): quantized_input (`torch.FloatTensor`) Quantized input """ - assert (min_value is None - and max_value is None) or (min_value is not None - and max_value is not None and num_groups == 1) + assert (min_value is None and max_value is None) or (min_value is not None and max_value is not None + and num_groups == 1) q_range = 2**num_bits input_shape = input.shape if min_value is None: @@ -101,6 +102,7 @@ class AsymQuantizer(torch.autograd.Function): """ Asymmetric quantization """ + @staticmethod def forward(ctx, input, num_bits, min_value=None, max_value=None, num_groups=1): """ @@ -118,9 +120,8 @@ class AsymQuantizer(torch.autograd.Function): Quantized input """ - assert (min_value is None - and max_value is None) or (min_value is not None - and max_value is not None and num_groups == 1) + assert (min_value is None and max_value is None) or (min_value is not None and max_value is not None + and num_groups == 1) q_range = 2**num_bits input_shape = input.shape if min_value is None: @@ -131,9 +132,7 @@ class AsymQuantizer(torch.autograd.Function): scale = (max_value - min_value) / q_range zero_point = (min_value / scale).round() * scale - output = ( - (input - zero_point) / scale).round().clamp(0, - q_range - 1) * scale + zero_point + output 
= ((input - zero_point) / scale).round().clamp(0, q_range - 1) * scale + zero_point output = output.reshape(input_shape).contiguous() return output @@ -147,6 +146,7 @@ class TernaryQuantizer(torch.autograd.Function): """ Ternary quantization """ + @staticmethod def forward(ctx, input, num_bits, min_value=None, max_value=None, num_groups=1): """ @@ -187,6 +187,7 @@ class BinaryQuantizer(torch.autograd.Function): """ Binary quantization """ + @staticmethod def forward(ctx, input, num_bits, min_value=None, max_value=None, num_groups=1): """ diff --git a/deepspeed/elasticity/config.py b/deepspeed/elasticity/config.py index ffbce7028..ff4312271 100644 --- a/deepspeed/elasticity/config.py +++ b/deepspeed/elasticity/config.py @@ -43,77 +43,64 @@ class ElasticityConfig: "version": 0.1 } """ + def __init__(self, param_dict): self.enabled = param_dict.get(ENABLED, ENABLED_DEFAULT) if self.enabled: if MAX_ACCEPTABLE_BATCH_SIZE in param_dict: self.max_acceptable_batch_size = param_dict[MAX_ACCEPTABLE_BATCH_SIZE] else: - raise ElasticityConfigError( - f"Elasticity config missing {MAX_ACCEPTABLE_BATCH_SIZE}") + raise ElasticityConfigError(f"Elasticity config missing {MAX_ACCEPTABLE_BATCH_SIZE}") if MICRO_BATCHES in param_dict: self.micro_batches = param_dict[MICRO_BATCHES] else: raise ElasticityConfigError(f"Elasticity config missing {MICRO_BATCHES}") else: - self.max_acceptable_batch_size = param_dict.get( - MAX_ACCEPTABLE_BATCH_SIZE, - MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT) + self.max_acceptable_batch_size = param_dict.get(MAX_ACCEPTABLE_BATCH_SIZE, + MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT) self.micro_batches = param_dict.get(MICRO_BATCHES, MICRO_BATCHES_DEFAULT) if not isinstance(self.micro_batches, list): raise ElasticityConfigError( f"Elasticity expected value of {MICRO_BATCHES} to be a " - f"list of micro batches, instead is: {type(self.micro_batches)}, containing: {self.micro_batches}" - ) + f"list of micro batches, instead is: {type(self.micro_batches)}, containing: {self.micro_batches}") if not all(map(lambda m: isinstance(m, int), self.micro_batches)): - raise ElasticityConfigError( - f"Elasticity expected {MICRO_BATCHES} to only contain a list of integers, " - f"instead contains: f{self.micro_batches}") + raise ElasticityConfigError(f"Elasticity expected {MICRO_BATCHES} to only contain a list of integers, " + f"instead contains: f{self.micro_batches}") if not all(map(lambda m: m > 0, self.micro_batches)): - raise ElasticityConfigError( - f"Elasticity expected {MICRO_BATCHES} to only contain positive integers, " - f"instead contains: f{self.micro_batches}") + raise ElasticityConfigError(f"Elasticity expected {MICRO_BATCHES} to only contain positive integers, " + f"instead contains: f{self.micro_batches}") self.min_gpus = param_dict.get(MIN_GPUS, MIN_GPUS_DEFAULT) self.max_gpus = param_dict.get(MAX_GPUS, MAX_GPUS_DEFAULT) if self.min_gpus < 1 or self.max_gpus < 1: - raise ElasticityConfigError( - "Elasticity min/max gpus must be > 0, " - f"given min_gpus: {self.min_gpus}, max_gpus: {self.max_gpus}") + raise ElasticityConfigError("Elasticity min/max gpus must be > 0, " + f"given min_gpus: {self.min_gpus}, max_gpus: {self.max_gpus}") if self.max_gpus < self.min_gpus: - raise ElasticityConfigError( - "Elasticity min_gpus cannot be greater than max_gpus, " - f"given min_gpus: {self.min_gpus}, max_gpus: {self.max_gpus}") + raise ElasticityConfigError("Elasticity min_gpus cannot be greater than max_gpus, " + f"given min_gpus: {self.min_gpus}, max_gpus: {self.max_gpus}") - self.model_parallel_size = 
param_dict.get(MODEL_PARLLEL_SIZE, - MODEL_PARLLEL_SIZE_DEFAULT) + self.model_parallel_size = param_dict.get(MODEL_PARLLEL_SIZE, MODEL_PARLLEL_SIZE_DEFAULT) if self.model_parallel_size < 1: - raise ElasticityConfigError( - "Model-Parallel size cannot be less than 1, " - f"given model-parallel size: {self.model_parallel_size}") + raise ElasticityConfigError("Model-Parallel size cannot be less than 1, " + f"given model-parallel size: {self.model_parallel_size}") - self.num_gpus_per_node = param_dict.get(NUM_GPUS_PER_NODE, - NUM_GPUS_PER_NODE_DEFAULT) + self.num_gpus_per_node = param_dict.get(NUM_GPUS_PER_NODE, NUM_GPUS_PER_NODE_DEFAULT) if self.num_gpus_per_node < 1: - raise ElasticityConfigError( - "Number of GPUs per node cannot be less than 1, " - f"given number of GPUs per node: {self.num_gpus_per_node}") + raise ElasticityConfigError("Number of GPUs per node cannot be less than 1, " + f"given number of GPUs per node: {self.num_gpus_per_node}") self.min_time = param_dict.get(MIN_TIME, MIN_TIME_DEFAULT) if self.min_time < 0: - raise ElasticityConfigError( - f"Elasticity min time needs to be >= 0: given {self.min_time}") + raise ElasticityConfigError(f"Elasticity min time needs to be >= 0: given {self.min_time}") self.version = param_dict.get(VERSION, VERSION_DEFAULT) - self.prefer_larger_batch_size = param_dict.get(PREFER_LARGER_BATCH, - PREFER_LARGER_BATCH_DEFAULT) - self.ignore_non_elastic_batch_info = param_dict.get( - IGNORE_NON_ELASTIC_BATCH_INFO, - IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT) + self.prefer_larger_batch_size = param_dict.get(PREFER_LARGER_BATCH, PREFER_LARGER_BATCH_DEFAULT) + self.ignore_non_elastic_batch_info = param_dict.get(IGNORE_NON_ELASTIC_BATCH_INFO, + IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT) def repr(self): return self.__dict__ diff --git a/deepspeed/elasticity/elastic_agent.py b/deepspeed/elasticity/elastic_agent.py index db0a61b4f..feeb4565a 100644 --- a/deepspeed/elasticity/elastic_agent.py +++ b/deepspeed/elasticity/elastic_agent.py @@ -23,6 +23,7 @@ import subprocess class DSElasticAgent(LocalElasticAgent): + def __init__( self, spec: WorkerSpec, @@ -35,9 +36,7 @@ class DSElasticAgent(LocalElasticAgent): self.ds_env = env @staticmethod - def _set_master_addr_port(store: Store, - master_addr: Optional[str], - master_port: Optional[int]): + def _set_master_addr_port(store: Store, master_addr: Optional[str], master_port: Optional[int]): if master_port is None: sock = _get_socket_with_port() with closing(sock): @@ -82,8 +81,7 @@ class DSElasticAgent(LocalElasticAgent): "TORCHELASTIC_MAX_RESTARTS": str(spec.max_restarts), "TORCHELASTIC_RUN_ID": spec.rdzv_handler.get_run_id(), "TORCHELASTIC_USE_AGENT_STORE": str(use_agent_store), - "NCCL_ASYNC_ERROR_HANDLING": os.getenv("NCCL_ASYNC_ERROR_HANDLING", - str(1)), + "NCCL_ASYNC_ERROR_HANDLING": os.getenv("NCCL_ASYNC_ERROR_HANDLING", str(1)), } worker_env_ds.update(worker_env_elastic) if "OMP_NUM_THREADS" in os.environ: @@ -120,8 +118,7 @@ class DSElasticAgent(LocalElasticAgent): spec = self._worker_group.spec role = spec.role - log.info( - f"[{role}] starting workers for entrypoint: {spec.get_entrypoint_name()}") + log.info(f"[{role}] starting workers for entrypoint: {spec.get_entrypoint_name()}") self._initialize_workers(self._worker_group) monitor_interval = spec.monitor_interval @@ -136,13 +133,10 @@ class DSElasticAgent(LocalElasticAgent): state = run_result.state self._worker_group.state = state - expire_time = datetime.utcnow() - ( - rdzv_handler._settings.keep_alive_interval * - 
rdzv_handler._settings.keep_alive_max_attempt) + expire_time = datetime.utcnow() - (rdzv_handler._settings.keep_alive_interval * + rdzv_handler._settings.keep_alive_max_attempt) _dead_nodes = [ - node for node, - last_heartbeat in - rdzv_handler._state_holder.state.last_heartbeats.items() + node for node, last_heartbeat in rdzv_handler._state_holder.state.last_heartbeats.items() if last_heartbeat < expire_time ] @@ -150,21 +144,16 @@ class DSElasticAgent(LocalElasticAgent): put_metric(f"workers.{role}.{state.name.lower()}", 1) if state == WorkerState.SUCCEEDED: - log.info( - f"[{role}] worker group successfully finished." - f" Waiting {self._exit_barrier_timeout} seconds for other agents to finish." - ) + log.info(f"[{role}] worker group successfully finished." + f" Waiting {self._exit_barrier_timeout} seconds for other agents to finish.") self._exit_barrier() return run_result - elif state in { - WorkerState.UNHEALTHY, - WorkerState.FAILED - } or len(participants) > len(rdzv_handler._state_holder.state.participants): + elif state in {WorkerState.UNHEALTHY, WorkerState.FAILED + } or len(participants) > len(rdzv_handler._state_holder.state.participants): if self._remaining_restarts > 0: - log.info( - f"[{role}] Worker group {state.name}. " - f"{self._remaining_restarts}/{spec.max_restarts} attempts left;" - f" will restart worker group") + log.info(f"[{role}] Worker group {state.name}. " + f"{self._remaining_restarts}/{spec.max_restarts} attempts left;" + f" will restart worker group") self._remaining_restarts -= 1 # rdzv_handler._state_holder.state.restart = False self._restart_workers(self._worker_group) diff --git a/deepspeed/elasticity/elasticity.py b/deepspeed/elasticity/elasticity.py index 17a8b6ecf..0f8f2ba58 100644 --- a/deepspeed/elasticity/elasticity.py +++ b/deepspeed/elasticity/elasticity.py @@ -17,44 +17,8 @@ from ..utils import logger # Thirty eight smallest highly composite numbers. The list should # be enough to support up to 720K batch size. 
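# Highly composite numbers are used because elasticity wants a total batch size with
# as many divisors as possible, so that many different GPU counts divide it evenly.
# A small self-contained sketch of that property (helper name is illustrative only):
def _count_divisors(n: int) -> int:
    # number of world sizes that split a global batch of size n evenly
    return sum(1 for d in range(1, n + 1) if n % d == 0)

assert _count_divisors(720) == 30  # 720 = 2**4 * 3**2 * 5, so (4+1)*(2+1)*(1+1) = 30 divisors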
HCN_LIST = [ - 1, - 2, - 4, - 6, - 12, - 24, - 36, - 48, - 60, - 120, - 180, - 240, - 360, - 720, - 840, - 1260, - 1680, - 2520, - 5040, - 7560, - 10080, - 15120, - 20160, - 25200, - 27720, - 45360, - 50400, - 55440, - 83160, - 110880, - 166320, - 221760, - 277200, - 332640, - 498960, - 554400, - 665280, - 720720 + 1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240, 360, 720, 840, 1260, 1680, 2520, 5040, 7560, 10080, 15120, 20160, + 25200, 27720, 45360, 50400, 55440, 83160, 110880, 166320, 221760, 277200, 332640, 498960, 554400, 665280, 720720 ] @@ -94,11 +58,7 @@ def get_valid_gpus(batch_size, micro_batches, min_valid_gpus, max_valid_gpus): return valid_gpus -def get_best_candidates(candidate_batch_sizes, - micro_batches, - min_gpus, - max_gpus, - prefer_larger): +def get_best_candidates(candidate_batch_sizes, micro_batches, min_gpus, max_gpus, prefer_larger): max_valid_gpus = 0 valid_gpus = None @@ -106,15 +66,11 @@ def get_best_candidates(candidate_batch_sizes, for batch_size in candidate_batch_sizes: - current_valid_gpus = get_valid_gpus(batch_size, - micro_batches, - min_gpus, - max_gpus) + current_valid_gpus = get_valid_gpus(batch_size, micro_batches, min_gpus, max_gpus) - if (len(current_valid_gpus) > max_valid_gpus - or (len(current_valid_gpus) == max_valid_gpus and - ((prefer_larger and batch_size > final_batch_size) or - (not prefer_larger and batch_size < final_batch_size)))): + if (len(current_valid_gpus) > max_valid_gpus or (len(current_valid_gpus) == max_valid_gpus and + ((prefer_larger and batch_size > final_batch_size) or + (not prefer_larger and batch_size < final_batch_size)))): max_valid_gpus = len(current_valid_gpus) valid_gpus = current_valid_gpus final_batch_size = batch_size @@ -157,15 +113,10 @@ def _get_compatible_gpus_v01(micro_batches, base_list.extend(micro_batches) base_list.append(lcm) - candidate_batch_sizes = get_candidate_batch_sizes(base_list, - max_acceptable_batch_size) + candidate_batch_sizes = get_candidate_batch_sizes(base_list, max_acceptable_batch_size) - final_batch_size, valid_gpus = get_best_candidates( - candidate_batch_sizes, - micro_batches, - min_gpus, - max_gpus, - prefer_larger) + final_batch_size, valid_gpus = get_best_candidates(candidate_batch_sizes, micro_batches, min_gpus, max_gpus, + prefer_larger) return final_batch_size, valid_gpus @@ -203,11 +154,12 @@ def _get_compatible_gpus_v02(micro_batches, dp_size_per_node = num_gpus_per_node // model_parallel_size - final_batch_size, valid_world_size = _get_compatible_gpus_v01(micro_batches, - int(max_acceptable_batch_size/dp_size_per_node), - int(min_gpus/num_gpus_per_node), - int(max_gpus/num_gpus_per_node), # Passing number of max nodes as Elasticity v2 works at node level - prefer_larger=prefer_larger) + final_batch_size, valid_world_size = _get_compatible_gpus_v01( + micro_batches, + int(max_acceptable_batch_size / dp_size_per_node), + int(min_gpus / num_gpus_per_node), + int(max_gpus / num_gpus_per_node), # Passing number of max nodes as Elasticity v2 works at node level + prefer_larger=prefer_larger) final_batch_size = int(final_batch_size) * dp_size_per_node valid_dp_world_size = [i * dp_size_per_node for i in valid_world_size] @@ -256,38 +208,27 @@ def ensure_immutable_elastic_config(runtime_elastic_config_dict: dict): Ensure the resource scheduler saw the same elastic config we are using at runtime """ if DEEPSPEED_ELASTICITY_CONFIG in os.environ: - scheduler_elastic_config_dict = json.loads( - os.environ[DEEPSPEED_ELASTICITY_CONFIG]) + scheduler_elastic_config_dict = 
json.loads(os.environ[DEEPSPEED_ELASTICITY_CONFIG]) scheduler_elastic_config = ElasticityConfig(scheduler_elastic_config_dict) runtime_elastic_config = ElasticityConfig(runtime_elastic_config_dict) err_str = "Elastic config '{}={}' seen by resource scheduler does not match config passed to runtime {}={}" if runtime_elastic_config.max_acceptable_batch_size != scheduler_elastic_config.max_acceptable_batch_size: raise ElasticityConfigError( - err_str.format('max_acceptable_batch_size', - scheduler_elastic_config.max_acceptable_batch_size, - 'max_acceptable_batch_size', - runtime_elastic_config.max_acceptable_batch_size)) + err_str.format('max_acceptable_batch_size', scheduler_elastic_config.max_acceptable_batch_size, + 'max_acceptable_batch_size', runtime_elastic_config.max_acceptable_batch_size)) if runtime_elastic_config.micro_batches != scheduler_elastic_config.micro_batches: raise ElasticityConfigError( - err_str.format('micro_batches', - scheduler_elastic_config.micro_batches, - 'micro_batches', + err_str.format('micro_batches', scheduler_elastic_config.micro_batches, 'micro_batches', runtime_elastic_config.micro_batches)) if runtime_elastic_config.version != scheduler_elastic_config.version: raise ElasticityConfigError( - err_str.format('version', - scheduler_elastic_config.version, - 'version', - runtime_elastic_config.version)) + err_str.format('version', scheduler_elastic_config.version, 'version', runtime_elastic_config.version)) else: logger.warning("Unable to find DEEPSPEED_ELASTICITY_CONFIG environment variable, cannot " \ "guarantee resource scheduler will scale this job using compatible GPU counts.") -def compute_elastic_config(ds_config: dict, - target_deepspeed_version: str, - world_size=0, - return_microbatch=False): +def compute_elastic_config(ds_config: dict, target_deepspeed_version: str, world_size=0, return_microbatch=False): """Core deepspeed elasticity API. Given an elastic config (similar to the example below) DeepSpeed will compute a total train batch size corresponding valid GPU count list that provides a high level of elasticity. Elasticity in this case means we are safe to scale @@ -397,8 +338,7 @@ def compute_elastic_config(ds_config: dict, # ensure batch size is int dtype final_batch_size = int(final_batch_size) else: - raise NotImplementedError( - f"Unable to find elastic logic for version: {elastic_config.version}") + raise NotImplementedError(f"Unable to find elastic logic for version: {elastic_config.version}") logger.info(f"Valid World Size (GPUs / Model Parallel Size): {valid_gpus}") diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py index e8be40a05..04a8d1bf6 100644 --- a/deepspeed/env_report.py +++ b/deepspeed/env_report.py @@ -48,8 +48,7 @@ def op_report(verbose=True): dots = "." * (max_dots - len(op_name)) is_compatible = OKAY if builder.is_compatible(verbose) else no is_installed = installed if installed_ops[op_name] else no - dots2 = '.' * ((len(h[1]) + (max_dots2 - len(h[1]))) - - (len(is_installed) - color_len)) + dots2 = '.' 
* ((len(h[1]) + (max_dots2 - len(h[1]))) - (len(is_installed) - color_len)) print(op_name, dots, is_installed, dots2, is_compatible) print("-" * (max_dots + max_dots2 + len(h[0]) + len(h[1]))) @@ -68,9 +67,7 @@ def nvcc_version(): if cuda_home is None: return f"{RED} [FAIL] cannot find CUDA_HOME via torch.utils.cpp_extension.CUDA_HOME={torch.utils.cpp_extension.CUDA_HOME} {END}" try: - output = subprocess.check_output([cuda_home + "/bin/nvcc", - "-V"], - universal_newlines=True) + output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], universal_newlines=True) except FileNotFoundError: return f"{RED} [FAIL] nvcc missing {END}" output_split = output.split() @@ -82,32 +79,18 @@ def nvcc_version(): def debug_report(): max_dots = 33 - report = [ - ("torch install path", - torch.__path__), - ("torch version", - torch.__version__), - ("deepspeed install path", - deepspeed.__path__), - ("deepspeed info", - f"{deepspeed.__version__}, {deepspeed.__git_hash__}, {deepspeed.__git_branch__}" - ) - ] + report = [("torch install path", torch.__path__), ("torch version", torch.__version__), + ("deepspeed install path", deepspeed.__path__), + ("deepspeed info", f"{deepspeed.__version__}, {deepspeed.__git_hash__}, {deepspeed.__git_branch__}")] if get_accelerator().device_name() == 'cuda': hip_version = getattr(torch.version, "hip", None) - report.extend([("torch cuda version", - torch.version.cuda), - ("torch hip version", - hip_version), - ("nvcc version", - (None if hip_version else nvcc_version())), - ("deepspeed wheel compiled w.", - f"torch {torch_info['version']}, " + - (f"hip {torch_info['hip_version']}" - if hip_version else f"cuda {torch_info['cuda_version']}"))]) + report.extend([("torch cuda version", torch.version.cuda), ("torch hip version", hip_version), + ("nvcc version", (None if hip_version else nvcc_version())), + ("deepspeed wheel compiled w.", f"torch {torch_info['version']}, " + + (f"hip {torch_info['hip_version']}" if hip_version else f"cuda {torch_info['cuda_version']}")) + ]) else: - report.extend([("deepspeed wheel compiled w.", - f"torch {torch_info['version']} ")]) + report.extend([("deepspeed wheel compiled w.", f"torch {torch_info['version']} ")]) print("DeepSpeed general environment info:") for name, value in report: @@ -116,15 +99,10 @@ def debug_report(): def parse_arguments(): parser = argparse.ArgumentParser() - parser.add_argument( - '--hide_operator_status', - action='store_true', - help= - 'Suppress display of installation and compatibility statuses of DeepSpeed operators. ' - ) - parser.add_argument('--hide_errors_and_warnings', + parser.add_argument('--hide_operator_status', action='store_true', - help='Suppress warning and error messages.') + help='Suppress display of installation and compatibility statuses of DeepSpeed operators. 
') + parser.add_argument('--hide_errors_and_warnings', action='store_true', help='Suppress warning and error messages.') args = parser.parse_args() return args @@ -137,8 +115,7 @@ def main(hide_operator_status=False, hide_errors_and_warnings=False): def cli_main(): args = parse_arguments() - main(hide_operator_status=args.hide_operator_status, - hide_errors_and_warnings=args.hide_errors_and_warnings) + main(hide_operator_status=args.hide_operator_status, hide_errors_and_warnings=args.hide_errors_and_warnings) if __name__ == "__main__": diff --git a/deepspeed/inference/config.py b/deepspeed/inference/config.py index 57733996c..94719d1ff 100644 --- a/deepspeed/inference/config.py +++ b/deepspeed/inference/config.py @@ -224,9 +224,7 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): replace_method: str = Field( "auto", deprecated=True, - deprecated_msg= - "This parameter is no longer needed, please remove from your call to DeepSpeed-inference" - ) + deprecated_msg="This parameter is no longer needed, please remove from your call to DeepSpeed-inference") injection_policy: Dict = Field(None, alias="injection_dict") """ @@ -237,9 +235,7 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): injection_policy_tuple: tuple = None """ TODO: Add docs """ - config: Dict = Field( - None, - alias="args") # todo: really no need for this field if we can refactor + config: Dict = Field(None, alias="args") # todo: really no need for this field if we can refactor max_out_tokens: int = Field(1024, alias="max_tokens") """ @@ -256,18 +252,10 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): """ mpu: object = Field(None, deprecated=True, new_param="tensor_parallel.mpu") ep_size: int = Field(1, deprecated=True, new_param="moe.ep_size") - ep_group: object = Field(None, - alias="expert_group", - deprecated=True, - new_param="moe.ep_group") - ep_mp_group: object = Field(None, - alias="expert_mp_group", - deprecated=True, - new_param="moe.ep_mp_group") + ep_group: object = Field(None, alias="expert_group", deprecated=True, new_param="moe.ep_group") + ep_mp_group: object = Field(None, alias="expert_mp_group", deprecated=True, new_param="moe.ep_mp_group") moe_experts: list = Field([1], deprecated=True, new_param="moe.moe_experts") - moe_type: MoETypeEnum = Field(MoETypeEnum.standard, - deprecated=True, - new_param="moe.type") + moe_type: MoETypeEnum = Field(MoETypeEnum.standard, deprecated=True, new_param="moe.type") @validator("moe") def moe_backward_compat(cls, field_value, values): diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py index 123a028ce..d15f73a1b 100755 --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -80,8 +80,7 @@ class InferenceEngine(Module): self.checkpoint_engine = TorchCheckpointEngine() quantization_setting = None self._init_quantization_setting( - quantization_setting - ) # todo: update with the new quant config for weight quant + quantization_setting) # todo: update with the new quant config for weight quant self.model_profile_enabled = False self._model_times = [] @@ -100,8 +99,7 @@ class InferenceEngine(Module): self._convert_to_dtype(config) if self.mpu: - config.tensor_parallel.tp_size = dist.get_world_size( - group=self.mpu.get_model_parallel_group()) + config.tensor_parallel.tp_size = dist.get_world_size(group=self.mpu.get_model_parallel_group()) self.mp_group = self.mpu.get_model_parallel_group() elif config.tensor_parallel.tp_size > 1: self._create_model_parallel_group(config) @@ -149,8 +147,7 @@ class 
InferenceEngine(Module): self.module.to(device) if config.tensor_parallel.tp_size > 1: - _rng_state = get_accelerator().get_rng_state().to( - get_accelerator().current_device_name()) + _rng_state = get_accelerator().get_rng_state().to(get_accelerator().current_device_name()) dist.broadcast(_rng_state, 0) get_accelerator().set_rng_state(_rng_state.cpu()) @@ -172,9 +169,7 @@ class InferenceEngine(Module): # todo: remove this once all the config dicts are centralized from top level pydantic config def _get_model_config_generate(self, config): # this is being passed to replace_transformer_layer(config=self.user_model_config_dict) - self.config = getattr(self.module, - 'config', - None) if config.config is None else config.config + self.config = getattr(self.module, 'config', None) if config.config is None else config.config def remove_mask_prepare_for_bloom(self): if hasattr(self.module, 'transformer'): @@ -223,8 +218,7 @@ class InferenceEngine(Module): num_ep_groups = dist.get_world_size() // moe_ep_size for i in range(num_ep_groups): ep_cnt = i * moe_ep_size - size = dist.get_world_size( - ) if moe_ep_size > dist.get_world_size() else moe_ep_size + size = dist.get_world_size() if moe_ep_size > dist.get_world_size() else moe_ep_size ranks = list(range(ep_cnt, ep_cnt + size)) _ep_group = dist.new_group(ranks) if dist.get_rank() in ranks: @@ -234,9 +228,7 @@ class InferenceEngine(Module): num_expert_mp_groups = dist.get_world_size() // num_ep_groups expert_mp_size = dist.get_world_size() // moe_ep_size for i in range(num_expert_mp_groups): - expert_mp_comm_ranks = [ - i + nr * moe_ep_size for nr in range(expert_mp_size) - ] + expert_mp_comm_ranks = [i + nr * moe_ep_size for nr in range(expert_mp_size)] _expert_mp_group = dist.new_group(expert_mp_comm_ranks) if dist.get_rank() in expert_mp_comm_ranks: self.expert_mp_group.update({moe_ep_size: _expert_mp_group}) @@ -253,65 +245,48 @@ class InferenceEngine(Module): log_dist( f"quantize_bits = {self.quantize_bits} " f"mlp_extra_grouping = {self.mlp_extra_grouping}, " - f"quantize_groups = {self.quantize_groups}", - [0]) + f"quantize_groups = {self.quantize_groups}", [0]) # TODO: remove this function and add this functionality to pydantic config checking def _validate_args(self, mpu, replace_with_kernel_inject): # TODO: to support SD pipeline we need to avoid this check for now if replace_with_kernel_inject and not isinstance(self.module, Module): raise ValueError(f"model must be a torch.nn.Module, got {type(self.module)}") - if not isinstance(self._config.tensor_parallel.tp_size, - int) or self._config.tensor_parallel.tp_size < 1: - raise ValueError( - f"mp_size must be an int >= 1, got {self._config.tensor_parallel.tp_size}" - ) + if not isinstance(self._config.tensor_parallel.tp_size, int) or self._config.tensor_parallel.tp_size < 1: + raise ValueError(f"mp_size must be an int >= 1, got {self._config.tensor_parallel.tp_size}") if mpu: methods = ["get_model_parallel_group", "get_data_parallel_group"] for method in methods: if not hasattr(mpu, method): raise ValueError(f"mpu is missing {method}") - if self._config.checkpoint is not None and not isinstance( - self._config.checkpoint, - (str, - dict)): - raise ValueError( - f"checkpoint must be None, str or dict, got {type(self._config.checkpoint)}" - ) + if self._config.checkpoint is not None and not isinstance(self._config.checkpoint, (str, dict)): + raise ValueError(f"checkpoint must be None, str or dict, got {type(self._config.checkpoint)}") supported_dtypes = [None, torch.half, torch.int8, 
torch.float] if self._config.dtype not in supported_dtypes: - raise ValueError( - f"{self._config.dtype} not supported, valid dtype: {supported_dtypes}") + raise ValueError(f"{self._config.dtype} not supported, valid dtype: {supported_dtypes}") if self.injection_dict is not None and not isinstance(self.injection_dict, dict): - raise ValueError( - f"injection_dict must be None or a dict, got: {self.injection_dict}") + raise ValueError(f"injection_dict must be None or a dict, got: {self.injection_dict}") def load_model_with_checkpoint(self, r_module): self.mp_replace = ReplaceWithTensorSlicing( - mp_group=self.mp_group, - mp_size=self._config.tensor_parallel.tp_size) #, out_dim=0, in_dim=1) + mp_group=self.mp_group, mp_size=self._config.tensor_parallel.tp_size) #, out_dim=0, in_dim=1) error_msgs = [] def load(module, state_dict, prefix): args = (state_dict, prefix, {}, True, [], [], error_msgs) if hasattr(module, 'weight'): if 'query_key_value' in prefix: - module.weight = self.mp_replace.qkv_copy( - module.weight.data, - state_dict[prefix + 'weight']) + module.weight = self.mp_replace.qkv_copy(module.weight.data, state_dict[prefix + 'weight']) else: - module.weight = self.mp_replace.copy(module.weight.data, - state_dict[prefix + 'weight']) + module.weight = self.mp_replace.copy(module.weight.data, state_dict[prefix + 'weight']) else: - module.norm.weight = self.mp_replace.copy(module.norm.weight.data, - state_dict[prefix + 'weight']) + module.norm.weight = self.mp_replace.copy(module.norm.weight.data, state_dict[prefix + 'weight']) if prefix + 'bias' in self.key_list: if hasattr(module, 'norm'): - module.norm.bias = self.mp_replace.copy(module.norm.bias, - state_dict[prefix + 'bias']) + module.norm.bias = self.mp_replace.copy(module.norm.bias, state_dict[prefix + 'bias']) else: data = state_dict[prefix + 'bias'] data = data.to(get_accelerator().current_device_name()) @@ -331,45 +306,32 @@ class InferenceEngine(Module): checking_key = prefix + name + '.' if not any(checking_key in item for item in self.key_list): continue - if len(list(child.parameters())) > 0 and list( - child.parameters())[0].numel() == 0: + if len(list(child.parameters())) > 0 and list(child.parameters())[0].numel() == 0: if len(child.weight.ds_shape) == 1: - child = Normalize(dim=child.weight.ds_shape[-1], - dtype=child.weight.dtype, - eps=child.eps) + child = Normalize(dim=child.weight.ds_shape[-1], dtype=child.weight.dtype, eps=child.eps) setattr(module, name, child) load(child, self.sd, prefix + name + '.') else: - load_module_recursive(child, - prefix if level == 0 else prefix + name + '.', - level + 1) + load_module_recursive(child, prefix if level == 0 else prefix + name + '.', level + 1) load_module_recursive(r_module) def _apply_injection_policy(self, config, client_module=None): # client_module is only passed when using the injection_dict method. 
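# The injection-policy path below is normally reached through deepspeed.init_inference.
# A hedged usage sketch (the wrapper name, client module, and attribute names are
# illustrative; see the DeepSpeed inference tutorial for per-architecture policies):
def _example_init_inference(model, mp_size=2):
    import torch
    import deepspeed
    from transformers.models.t5.modeling_t5 import T5Block  # illustrative client module
    return deepspeed.init_inference(
        model,
        mp_size=mp_size,
        dtype=torch.half,
        # map a client transformer block to the linear layers whose outputs need all-reduce
        injection_policy={T5Block: ('SelfAttention.o', 'EncDecAttention.o', 'DenseReluDense.wo')})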
checkpoint_dir = config.checkpoint - checkpoint = SDLoaderFactory.get_sd_loader_json( - checkpoint_dir, - self.checkpoint_engine) if checkpoint_dir is not None else None + checkpoint = SDLoaderFactory.get_sd_loader_json(checkpoint_dir, + self.checkpoint_engine) if checkpoint_dir is not None else None generic_injection(self.module, - fp16=(config.dtype == torch.half) - or (config.dtype == torch.int8), + fp16=(config.dtype == torch.half) or (config.dtype == torch.int8), enable_cuda_graph=config.enable_cuda_graph) if isinstance(self.module, torch.nn.Module): # config is our DeepSpeedInferenceConfig and self.config is the HF model config - replace_transformer_layer(client_module, - self.module, - checkpoint, - config, - self.config) + replace_transformer_layer(client_module, self.module, checkpoint, config, self.config) def _get_all_ckpt_names(self, checkpoints_path, tag): - ckpt_file_pattern = self._get_ckpt_name(checkpoints_path, - tag, - mp_placeholder="*") + ckpt_file_pattern = self._get_ckpt_name(checkpoints_path, tag, mp_placeholder="*") import glob ckpt_files = glob.glob(ckpt_file_pattern) @@ -392,8 +354,7 @@ class InferenceEngine(Module): def _load_checkpoint(self, load_dir, load_module_strict=True, tag=None): is_pipe_parallel = isinstance(self.module, PipelineModule) if is_pipe_parallel: - raise RuntimeError( - 'pipeline parallelism is currently not supported in inference.') + raise RuntimeError('pipeline parallelism is currently not supported in inference.') if not isinstance(load_dir, dict) and os.path.isdir(load_dir): if tag is None: latest_path = os.path.join(load_dir, "latest") @@ -404,8 +365,7 @@ class InferenceEngine(Module): ckpt_list = self._get_all_ckpt_names(load_dir, tag) sd_loader = SDLoaderFactory.get_sd_loader(ckpt_list, self.checkpoint_engine) else: - sd_loader = SDLoaderFactory.get_sd_loader_json(load_dir, - self.checkpoint_engine) + sd_loader = SDLoaderFactory.get_sd_loader_json(load_dir, self.checkpoint_engine) if type(sd_loader) is list: self.sd = torch.load(sd_loader[0], map_location='cpu') @@ -416,19 +376,18 @@ class InferenceEngine(Module): for i in range(1, len(sd_loader)): if not dist.is_initialized() or dist.get_rank() == 0: print(f"loading checkpoint ({i})") - self.sd = torch.load(sd_loader[i], - map_location=get_accelerator().device_name()) + self.sd = torch.load(sd_loader[i], map_location=get_accelerator().device_name()) self.key_list = list(self.sd.keys()) self.load_model_with_checkpoint(self.module) else: mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() load_path, checkpoint, quantize_config = sd_loader.load(self._config.tensor_parallel.tp_size, - mp_rank, - is_pipe_parallel=is_pipe_parallel, - quantize=(self._config.dtype is torch.int8), - quantize_groups=self.quantize_groups, - mlp_extra_grouping=self.mlp_extra_grouping) + mp_rank, + is_pipe_parallel=is_pipe_parallel, + quantize=(self._config.dtype is torch.int8), + quantize_groups=self.quantize_groups, + mlp_extra_grouping=self.mlp_extra_grouping) self.quantization_scales, self.quantize_merge_count = quantize_config @@ -438,21 +397,20 @@ class InferenceEngine(Module): old_moe_load = False if not isinstance(checkpoint['num_experts'], list): old_moe_load = True - DeepSpeedEngine.load_moe_state_dict( - load_dir, - tag, - state_dict=checkpoint[self._choose_module_key(checkpoint)], - old_moe_load=old_moe_load, - model=self.module, - mpu=self.mpu, - checkpoint_engine=self.checkpoint_engine) + DeepSpeedEngine.load_moe_state_dict(load_dir, + tag, + 
state_dict=checkpoint[self._choose_module_key(checkpoint)], + old_moe_load=old_moe_load, + model=self.module, + mpu=self.mpu, + checkpoint_engine=self.checkpoint_engine) - self.module.load_state_dict( - state_dict=checkpoint[self._choose_module_key(checkpoint)], - strict=load_module_strict) + self.module.load_state_dict(state_dict=checkpoint[self._choose_module_key(checkpoint)], + strict=load_module_strict) def _choose_module_key(self, sd): - assert not ('module' in sd and 'model' in sd), "checkpoint has both 'model' and 'module' keys, not sure how to proceed" + assert not ('module' in sd + and 'model' in sd), "checkpoint has both 'model' and 'module' keys, not sure how to proceed" assert 'module' in sd or 'model' in sd, "checkpoint contains neither 'model' or 'module' keys, not sure how to proceed" if 'module' in sd: return 'module' @@ -465,10 +423,8 @@ class InferenceEngine(Module): if False: #config.dtype is torch.int8 and self.quantization_scales is None: quantizer = WeightQuantization(mlp_extra_grouping=self.mlp_extra_grouping) - model, self.quantization_scales = quantizer.model_quantize(self.module, - self.injection_dict, - self.quantize_bits, - self.quantize_groups) + model, self.quantization_scales = quantizer.model_quantize(self.module, self.injection_dict, + self.quantize_bits, self.quantize_groups) elif config.dtype == torch.half: self.module.half() elif config.dtype == torch.bfloat16: @@ -509,11 +465,10 @@ class InferenceEngine(Module): assert self.model_profile_enabled, "model profiling is not enabled" model_times = self._model_times if self._config.enable_cuda_graph and len(self._model_times) == 0: - raise ValueError( - "Model times are empty and cuda graph is enabled. If " - "this is a GPT-style model this combo is not supported. If this is a " - "BERT-style model this is a bug, please report it. " - f"Model type is: {type(self.module)}") + raise ValueError("Model times are empty and cuda graph is enabled. If " + "this is a GPT-style model this combo is not supported. If this is a " + "BERT-style model this is a bug, please report it. 
" + f"Model type is: {type(self.module)}") self._model_times = [] return model_times @@ -532,8 +487,7 @@ class InferenceEngine(Module): for name in module.__dict__.keys(): sub_module = getattr(module, name) - if self._module_match(sub_module) and hasattr(sub_module, - "enable_cuda_graph"): + if self._module_match(sub_module) and hasattr(sub_module, "enable_cuda_graph"): sub_module_cuda_graph = True return sub_module_cuda_graph @@ -546,13 +500,11 @@ class InferenceEngine(Module): **kwargs: variable length keyword arguments """ start = None - if self.model_profile_enabled and get_accelerator().device_name( - ) == 'cuda' and self._config.enable_cuda_graph: + if self.model_profile_enabled and get_accelerator().device_name() == 'cuda' and self._config.enable_cuda_graph: get_accelerator().synchronize() start = time.time() - if get_accelerator().device_name( - ) == 'cuda' and self._config.enable_cuda_graph and not self.local_cuda_graph: + if get_accelerator().device_name() == 'cuda' and self._config.enable_cuda_graph and not self.local_cuda_graph: if self.cuda_graph_created: outputs = self._graph_replay(*inputs, **kwargs) else: @@ -580,9 +532,7 @@ class InferenceEngine(Module): num_beams = kwargs["num_beams"] if num_beams > 1: - raise NotImplementedError( - "DeepSpeed does not support `num_beams` > 1, if this is important to you please " - "add your request to: https://github.com/microsoft/DeepSpeed/issues/2506" - ) + raise NotImplementedError("DeepSpeed does not support `num_beams` > 1, if this is important to you please " + "add your request to: https://github.com/microsoft/DeepSpeed/issues/2506") return self.module.generate(*inputs, **kwargs) diff --git a/deepspeed/launcher/launch.py b/deepspeed/launcher/launch.py index d9e6fda50..12b1d790b 100755 --- a/deepspeed/launcher/launch.py +++ b/deepspeed/launcher/launch.py @@ -52,10 +52,7 @@ def parse_args(): help="Master node (rank 0)'s free port that needs to " "be used for communication during distributed " "training") - parser.add_argument("--world_info", - default="None", - type=str, - help="world info base64 encoded dictionary") + parser.add_argument("--world_info", default="None", type=str, help="world info base64 encoded dictionary") parser.add_argument("--module", action="store_true", @@ -68,19 +65,11 @@ def parse_args(): help="Skip prepending the training script with " "'python' - just execute it directly.") - parser.add_argument("--enable_elastic_training", - action="store_true", - help="Enable elastic training support.") + parser.add_argument("--enable_elastic_training", action="store_true", help="Enable elastic training support.") - parser.add_argument("--min_elastic_nodes", - type=int, - default=-1, - help="Min number of nodes in elastic training.") + parser.add_argument("--min_elastic_nodes", type=int, default=-1, help="Min number of nodes in elastic training.") - parser.add_argument("--max_elastic_nodes", - type=int, - default=-1, - help="Max number of nodes in elastic training.") + parser.add_argument("--max_elastic_nodes", type=int, default=-1, help="Max number of nodes in elastic training.") parser.add_argument("--no_local_rank", action="store_true", @@ -92,11 +81,10 @@ def parse_args(): default=0, help="main launching process pid, for internal pid tracking") - parser.add_argument( - "--enable_each_rank_log", - default="None", - type=str, - help="redirect the stdout and stderr from each rank into different log files") + parser.add_argument("--enable_each_rank_log", + default="None", + type=str, + help="redirect the stdout and 
stderr from each rank into different log files") # positional parser.add_argument("training_script", @@ -145,9 +133,7 @@ def main(): local_node = node_list[args.node_rank] local_gpu_ids = world_info[local_node] num_local_procs = len(local_gpu_ids) - logger.info( - f"nnodes={args.nnodes}, num_local_procs={num_local_procs}, node_rank={args.node_rank}" - ) + logger.info(f"nnodes={args.nnodes}, num_local_procs={num_local_procs}, node_rank={args.node_rank}") global_rank_mapping = defaultdict(list) curr_global_rank = 0 @@ -193,8 +179,7 @@ def main(): lines = file.readlines() lines = [line.rstrip() for line in lines] for line in lines: - if line.startswith('export FC_TASKROLE_NAME') or line.startswith( - 'export FC_TASK_INDEX'): + if line.startswith('export FC_TASKROLE_NAME') or line.startswith('export FC_TASK_INDEX'): key_val = line.split()[1] key, val = key_val.split('=') current_env[key] = val @@ -206,17 +191,13 @@ def main(): if args.enable_each_rank_log != "None": # prepare the log path and the file name prefix if os.path.isfile(args.enable_each_rank_log): - raise ValueError( - f"{args.enable_each_rank_log} should not be a file, it should be a directory." - ) + raise ValueError(f"{args.enable_each_rank_log} should not be a file, it should be a directory.") if not os.path.exists(args.enable_each_rank_log): try: os.makedirs(args.enable_each_rank_log) except Exception as e: print(e) - raise ValueError( - f"unable to create directory {args.enable_each_rank_log} for each rank log." - ) + raise ValueError(f"unable to create directory {args.enable_each_rank_log} for each rank log.") log_name_prefix = time.strftime("%Y%m%d%H%M%S", time.localtime()) for local_rank in range(0, num_local_procs): @@ -242,13 +223,9 @@ def main(): cmd += args.training_script_args if args.enable_each_rank_log != "None": - log_file = os.path.join(args.enable_each_rank_log, - f"{log_name_prefix}_rank{dist_rank}.log") + log_file = os.path.join(args.enable_each_rank_log, f"{log_name_prefix}_rank{dist_rank}.log") log_fd = open(log_file, 'w') - process = subprocess.Popen(cmd, - env=current_env, - stdout=log_fd, - stderr=log_fd) + process = subprocess.Popen(cmd, env=current_env, stdout=log_fd, stderr=log_fd) else: process = subprocess.Popen(cmd, env=current_env) @@ -264,7 +241,7 @@ def main(): args.min_elastic_nodes = 1 if args.max_elastic_nodes == -1: args.max_elastic_nodes = args.nnodes - assert args.max_elastic_nodes > 0 and args.min_elastic_nodes > 0 , "Max and Min nodes should be positive" + assert args.max_elastic_nodes > 0 and args.min_elastic_nodes > 0, "Max and Min nodes should be positive" current_env["NCCL_ASYNC_ERROR_HANDLING"] = str(1) @@ -287,8 +264,7 @@ def main(): # Creating config for rendezvous class rdzv_parameters = RendezvousParameters(backend='c10d', - endpoint=args.master_addr + ":" + - str(args.master_port), + endpoint=args.master_addr + ":" + str(args.master_port), run_id=run_id, min_nodes=args.min_elastic_nodes, max_nodes=args.max_elastic_nodes, diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py index 6027d1076..28a7d9bd9 100644 --- a/deepspeed/launcher/multinode_runner.py +++ b/deepspeed/launcher/multinode_runner.py @@ -13,6 +13,7 @@ from .constants import PDSH_MAX_FAN_OUT, MVAPICH_TMP_HOSTFILE class MultiNodeRunner(ABC): + def __init__(self, args, world_info_base64): self.args = args self.validate_args() @@ -45,6 +46,7 @@ class MultiNodeRunner(ABC): class PDSHRunner(MultiNodeRunner): + def __init__(self, args, world_info_base64): super().__init__(args, 
world_info_base64) @@ -56,9 +58,7 @@ class PDSHRunner(MultiNodeRunner): return "pdsh" def parse_user_args(self): - return list( - map(lambda x: x if x.startswith("-") else f"'{x}'", - self.args.user_args)) + return list(map(lambda x: x if x.startswith("-") else f"'{x}'", self.args.user_args)) def get_cmd(self, environment, active_resources): environment['PDSH_RCMD_TYPE'] = 'ssh' @@ -68,14 +68,8 @@ class PDSHRunner(MultiNodeRunner): # PDSH flags for max node fan out and specific hosts to launch on # See https://linux.die.net/man/1/pdsh for flag details - pdsh_cmd_args = [ - 'pdsh', - '-S', - '-f', - str(PDSH_MAX_FAN_OUT), - '-w', - active_workers - ] + split(self.args.launcher_args) + pdsh_cmd_args = ['pdsh', '-S', '-f', str(PDSH_MAX_FAN_OUT), '-w', active_workers] + split( + self.args.launcher_args) exports = "" for key, val in self.exports.items(): @@ -84,15 +78,8 @@ class PDSHRunner(MultiNodeRunner): # https://linux.die.net/man/1/pdsh # %n will be replaced by pdsh command deepspeed_launch = [ - exports, - f"cd {os.path.abspath('.')};", - sys.executable, - "-u", - "-m", - "deepspeed.launcher.launch", - f'--world_info={self.world_info_base64}', - "--node_rank=%n", - f"--master_addr={self.args.master_addr}", + exports, f"cd {os.path.abspath('.')};", sys.executable, "-u", "-m", "deepspeed.launcher.launch", + f'--world_info={self.world_info_base64}', "--node_rank=%n", f"--master_addr={self.args.master_addr}", f"--master_port={self.args.master_port}" ] if self.args.no_python: @@ -111,11 +98,11 @@ class PDSHRunner(MultiNodeRunner): cmd_to_search = [i + "\\" for i in deepspeed_launch[2:6]] kill_command = pdsh_cmd_args + ["pkill -f ", " ".join(cmd_to_search)[:-2]] - return pdsh_cmd_args + deepspeed_launch + [self.user_script - ] + self.user_arguments, kill_command + return pdsh_cmd_args + deepspeed_launch + [self.user_script] + self.user_arguments, kill_command class OpenMPIRunner(MultiNodeRunner): + def __init__(self, args, world_info_base64, resource_pool): super().__init__(args, world_info_base64) self.resource_pool = resource_pool @@ -133,11 +120,9 @@ class OpenMPIRunner(MultiNodeRunner): super().validate_args() #TODO: Allow for include/exclude at node-level but not gpu-level if self.args.include != "" or self.args.exclude != "": - raise ValueError( - f"{self.name} backend does not support worker include/exclusion") + raise ValueError(f"{self.name} backend does not support worker include/exclusion") if self.args.num_nodes != -1 or self.args.num_gpus != -1: - raise ValueError( - f"{self.name} backend does not support limiting num nodes/gpus") + raise ValueError(f"{self.name} backend does not support limiting num nodes/gpus") def get_cmd(self, environment, active_resources): total_process_count = sum(self.resource_pool.values()) @@ -166,11 +151,11 @@ class OpenMPIRunner(MultiNodeRunner): if self.args.module: python_exec.append("-m") - return mpirun_cmd + export_cmd + python_exec + [self.user_script - ] + self.user_arguments + return mpirun_cmd + export_cmd + python_exec + [self.user_script] + self.user_arguments class MPICHRunner(MultiNodeRunner): + def __init__(self, args, world_info_base64, resource_pool): super().__init__(args, world_info_base64) self.resource_pool = resource_pool @@ -187,12 +172,10 @@ class MPICHRunner(MultiNodeRunner): super().validate_args() #TODO: Allow for include/exclude at node-level but not gpu-level if self.args.include != "" or self.args.exclude != "": - raise ValueError( - f"{self.name} backend does not support worker include/exclusion") + raise 
ValueError(f"{self.name} backend does not support worker include/exclusion") if self.args.num_nodes != -1 or self.args.num_gpus != -1: - raise ValueError( - f"{self.name} backend does not support limiting num nodes/gpus") + raise ValueError(f"{self.name} backend does not support limiting num nodes/gpus") def get_cmd(self, environment, active_resources): devices_per_node = self.resource_pool.values() @@ -220,6 +203,7 @@ class MPICHRunner(MultiNodeRunner): class SlurmRunner(MultiNodeRunner): + def __init__(self, args, world_info_base64, resource_pool): super().__init__(args, world_info_base64) self.resource_pool = resource_pool @@ -232,7 +216,8 @@ class SlurmRunner(MultiNodeRunner): return 'slurm' def get_cmd(self, environment, active_resources): - assert not getattr(self.args, 'detect_nvlink_pairs', False), "slurm backend does not support remapping visible devices" + assert not getattr(self.args, 'detect_nvlink_pairs', + False), "slurm backend does not support remapping visible devices" total_process_count = sum(self.resource_pool.values()) srun_cmd = [ 'srun', @@ -261,12 +246,12 @@ class SlurmRunner(MultiNodeRunner): exports += f",{key}={val}" python_exec = [sys.executable, "-u"] - command = srun_cmd + [exports] + python_exec + [self.user_script - ] + self.user_arguments + command = srun_cmd + [exports] + python_exec + [self.user_script] + self.user_arguments return command class MVAPICHRunner(MultiNodeRunner): + def __init__(self, args, world_info_base64, resource_pool): super().__init__(args, world_info_base64) self.resource_pool = resource_pool @@ -303,9 +288,7 @@ class MVAPICHRunner(MultiNodeRunner): if "MVAPICH2-GDR" in mpiname_results: exists = True else: - warnings.warn( - f"Expected MVAPICH2-GDR as return for mpiname but received {mpiname_results}" - ) + warnings.warn(f"Expected MVAPICH2-GDR as return for mpiname but received {mpiname_results}") return exists @property @@ -316,11 +299,9 @@ class MVAPICHRunner(MultiNodeRunner): super().validate_args() #TODO: Allow for include/exclude at node-level but not gpu-level if self.args.include != "" or self.args.exclude != "": - raise ValueError( - f"{self.name} backend does not support worker include/exclusion") + raise ValueError(f"{self.name} backend does not support worker include/exclusion") if self.args.num_nodes != -1 or self.args.num_gpus != -1: - raise ValueError( - f"{self.name} backend does not support limiting num nodes/gpus") + raise ValueError(f"{self.name} backend does not support limiting num nodes/gpus") def get_cmd(self, environment, active_resources): devices_per_node = self.resource_pool.values() @@ -353,5 +334,4 @@ class MVAPICHRunner(MultiNodeRunner): if self.args.module: python_exec.append("-m") - return mpirun_cmd + export_cmd + python_exec + [self.user_script - ] + self.user_arguments + return mpirun_cmd + export_cmd + python_exec + [self.user_script] + self.user_arguments diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index 044f6ff03..4f3a7ac6f 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -36,9 +36,8 @@ PDSH_MAX_FAN_OUT = 1024 def parse_args(args=None): - parser = argparse.ArgumentParser( - description="DeepSpeed runner to help launch distributed " - "multi-node/multi-gpu training jobs.") + parser = argparse.ArgumentParser(description="DeepSpeed runner to help launch distributed " + "multi-node/multi-gpu training jobs.") parser.add_argument("-H", "--hostfile", @@ -109,12 +108,11 @@ def parse_args(args=None): help="(optional) IP address of node 0, will be 
" "inferred via 'hostname -I' if not specified.") - parser.add_argument( - "--launcher", - default=PDSH_LAUNCHER, - type=str, - help="(optional) choose launcher backend for multi-node " - "training. Options currently include PDSH, OpenMPI, MVAPICH, SLURM, MPICH.") + parser.add_argument("--launcher", + default=PDSH_LAUNCHER, + type=str, + help="(optional) choose launcher backend for multi-node " + "training. Options currently include PDSH, OpenMPI, MVAPICH, SLURM, MPICH.") parser.add_argument("--launcher_args", default="", @@ -147,35 +145,29 @@ def parse_args(args=None): help="Force multi-node launcher mode, helps in cases where user " "wants to launch on single remote node.") - parser.add_argument( - "--save_pid", - action="store_true", - help="Save file containing launcher process id (pid) at /tmp/.ds, " - "where is the pid of the first process that invoked `deepspeed`. " - "Useful when launching deepspeed processes programmatically.") + parser.add_argument("--save_pid", + action="store_true", + help="Save file containing launcher process id (pid) at /tmp/.ds, " + "where is the pid of the first process that invoked `deepspeed`. " + "Useful when launching deepspeed processes programmatically.") - parser.add_argument( - "--enable_each_rank_log", - default="None", - type=str, - help="redirect the stdout and stderr from each rank into different log files") + parser.add_argument("--enable_each_rank_log", + default="None", + type=str, + help="redirect the stdout and stderr from each rank into different log files") - parser.add_argument( - "--autotuning", - default="", - choices=["tune", - "run"], - type=str, - help="Run DeepSpeed autotuner to discover optimal configuration parameters " - "before running job.") + parser.add_argument("--autotuning", + default="", + choices=["tune", "run"], + type=str, + help="Run DeepSpeed autotuner to discover optimal configuration parameters " + "before running job.") parser.add_argument("--elastic_training", action="store_true", help="Enable elastic training support in DeepSpeed.") - parser.add_argument("user_script", - type=str, - help="User script to launch, followed by any required " + parser.add_argument("user_script", type=str, help="User script to launch, followed by any required " "arguments.") parser.add_argument('user_args', nargs=argparse.REMAINDER) return parser.parse_args(args=args) @@ -213,21 +205,15 @@ def _parse_hostfile(hostfile_lines): num_slots = int(match.group(2)) if host in resource_pool: logger.error(f"Bad hostfile text: {hostfile_lines}") - raise ValueError( - f"Hostfile contains multiple entries for {host}, unable to proceed with launching" - ) + raise ValueError(f"Hostfile contains multiple entries for {host}, unable to proceed with launching") resource_pool[host] = num_slots else: logger.error(f"Bad hostfile text: {hostfile_lines}") - raise ValueError( - "Hostfile contains a bad entry: {line}, unable to proceed with launching" - ) + raise ValueError("Hostfile contains a bad entry: {line}, unable to proceed with launching") if len(resource_pool) == 0: logger.error(f"Bad hostfile text: {hostfile_lines}") - raise ValueError( - "Hostfile is empty or not formatted correctly, unable to proceed with launching." 
- ) + raise ValueError("Hostfile is empty or not formatted correctly, unable to proceed with launching.") return resource_pool @@ -337,9 +323,7 @@ def parse_inclusion_exclusion(resource_pool, inclusion, exclusion): for hostname, slots in resource_pool.items(): active_resources[hostname] = list(range(slots)) - return parse_resource_filter(active_resources, - include_str=inclusion, - exclude_str=exclusion) + return parse_resource_filter(active_resources, include_str=inclusion, exclude_str=exclusion) def encode_world_info(world_info): @@ -389,8 +373,7 @@ def main(args=None): cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "") if not resource_pool and len(cuda_visible_devices): detected_str = f"Detected CUDA_VISIBLE_DEVICES={cuda_visible_devices}" - if len(args.include) or len( - args.exclude) or args.num_nodes > 1 or args.num_gpus > 0: + if len(args.include) or len(args.exclude) or args.num_nodes > 1 or args.num_gpus > 0: print( f"{detected_str} but ignoring it because one or several of --include/--exclude/--num_gpus/--num_nodes cl args were used. If you want to use CUDA_VISIBLE_DEVICES don't pass any of these arguments to deepspeed." ) @@ -416,20 +399,17 @@ def main(args=None): if not multi_node_exec and args.num_nodes > 1: raise ValueError("Num nodes is >1 but no extra nodes available via hostfile") - active_resources = parse_inclusion_exclusion(resource_pool, - args.include, - args.exclude) + active_resources = parse_inclusion_exclusion(resource_pool, args.include, args.exclude) env = os.environ.copy() # validate that passwordless-ssh is working properly with this hostfile if multi_node_exec and not args.no_ssh_check: first_host = list(active_resources.keys())[0] try: - subprocess.check_call( - f'ssh -o PasswordAuthentication=no {first_host} hostname', - stderr=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, - shell=True) + subprocess.check_call(f'ssh -o PasswordAuthentication=no {first_host} hostname', - stderr=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, - shell=True) except subprocess.CalledProcessError: raise RuntimeError( f"Using hostfile at {args.hostfile} but host={first_host} was not reachable via ssh. If you are running with a single node please remove {args.hostfile} or setup passwordless ssh." 
@@ -481,13 +461,8 @@ def main(args=None): if not multi_node_exec: deepspeed_launch = [ - sys.executable, - "-u", - "-m", - "deepspeed.launcher.launch", - f"--world_info={world_info_base64}", - f"--master_addr={args.master_addr}", - f"--master_port={args.master_port}" + sys.executable, "-u", "-m", "deepspeed.launcher.launch", f"--world_info={world_info_base64}", + f"--master_addr={args.master_addr}", f"--master_port={args.master_port}" ] if args.no_python: deepspeed_launch.append("--no_python") @@ -498,8 +473,7 @@ def main(args=None): if args.save_pid: deepspeed_launch += ["--save_pid", f"{os.getpid()}"] if args.enable_each_rank_log: - deepspeed_launch.append( - f"--enable_each_rank_log={args.enable_each_rank_log}") + deepspeed_launch.append(f"--enable_each_rank_log={args.enable_each_rank_log}") if args.elastic_training: deepspeed_launch.append("--enable_elastic_training") deepspeed_launch.append(f"--max_elastic_nodes={args.max_elastic_nodes}") diff --git a/deepspeed/model_implementations/diffusers/unet.py b/deepspeed/model_implementations/diffusers/unet.py index 8ac97945c..2d6ba1a5d 100644 --- a/deepspeed/model_implementations/diffusers/unet.py +++ b/deepspeed/model_implementations/diffusers/unet.py @@ -6,6 +6,7 @@ from ..features.cuda_graph import CUDAGraph class DSUNet(CUDAGraph, torch.nn.Module): + def __init__(self, unet, enable_cuda_graph=True): super().__init__(enable_cuda_graph=enable_cuda_graph) self.unet = unet diff --git a/deepspeed/model_implementations/diffusers/vae.py b/deepspeed/model_implementations/diffusers/vae.py index 8f8d13cb2..a6b0dc28f 100644 --- a/deepspeed/model_implementations/diffusers/vae.py +++ b/deepspeed/model_implementations/diffusers/vae.py @@ -6,6 +6,7 @@ from ..features.cuda_graph import CUDAGraph class DSVAE(CUDAGraph, torch.nn.Module): + def __init__(self, vae, enable_cuda_graph=True): super().__init__(enable_cuda_graph=enable_cuda_graph) self.vae = vae @@ -44,8 +45,7 @@ class DSVAE(CUDAGraph, torch.nn.Module): self.static_decoder_kwargs = kwargs with torch.cuda.graph(self._decoder_cuda_graph): - self.static_decoder_output = self._decode(*self.static_decoder_inputs, - **self.static_decoder_kwargs) + self.static_decoder_output = self._decode(*self.static_decoder_inputs, **self.static_decoder_kwargs) self.decoder_cuda_graph_created = True @@ -88,8 +88,7 @@ class DSVAE(CUDAGraph, torch.nn.Module): self.static_encoder_kwargs = kwargs with torch.cuda.graph(self._encoder_cuda_graph): - self.static_encoder_output = self._encode(*self.static_encoder_inputs, - **self.static_encoder_kwargs) + self.static_encoder_output = self._encode(*self.static_encoder_inputs, **self.static_encoder_kwargs) self.encoder_cuda_graph_created = True diff --git a/deepspeed/model_implementations/features/cuda_graph.py b/deepspeed/model_implementations/features/cuda_graph.py index 3224f625c..cdb8e4f57 100644 --- a/deepspeed/model_implementations/features/cuda_graph.py +++ b/deepspeed/model_implementations/features/cuda_graph.py @@ -5,6 +5,7 @@ from abc import ABC, abstractmethod class CUDAGraph(ABC): + def __init__(self, enable_cuda_graph=False): super().__init__() self.enable_cuda_graph = enable_cuda_graph diff --git a/deepspeed/model_implementations/transformers/clip_encoder.py b/deepspeed/model_implementations/transformers/clip_encoder.py index efa282c44..e10ca8980 100644 --- a/deepspeed/model_implementations/transformers/clip_encoder.py +++ b/deepspeed/model_implementations/transformers/clip_encoder.py @@ -7,6 +7,7 @@ from ..features.cuda_graph import CUDAGraph class 
DSClipEncoder(CUDAGraph, torch.nn.Module): + def __init__(self, enc, enable_cuda_graph=False): super().__init__(enable_cuda_graph=enable_cuda_graph) enc.text_model._build_causal_attention_mask = self._build_causal_attention_mask @@ -22,11 +23,7 @@ class DSClipEncoder(CUDAGraph, torch.nn.Module): self.config = self.enc.config def _build_causal_attention_mask(self, bsz, seq_len, dtype): - mask = torch.empty(bsz, - seq_len, - seq_len, - dtype=dtype, - device=get_accelerator().current_device_name()) + mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype, device=get_accelerator().current_device_name()) mask.fill_(torch.tensor(torch.finfo(dtype).min)) mask.triu_(1) mask = mask.unsqueeze(1) @@ -69,9 +66,8 @@ class DSClipEncoder(CUDAGraph, torch.nn.Module): self.static_kwargs[self.iter] = kwargs with torch.cuda.graph(self._cuda_graphs[self.iter]): - self.static_output[self.iter] = self._forward( - *self.static_inputs[self.iter], - **self.static_kwargs[self.iter]) + self.static_output[self.iter] = self._forward(*self.static_inputs[self.iter], + **self.static_kwargs[self.iter]) self.cuda_graph_created[self.iter] = True diff --git a/deepspeed/model_implementations/transformers/ds_base.py b/deepspeed/model_implementations/transformers/ds_base.py index 9a848eaae..4ba96e052 100644 --- a/deepspeed/model_implementations/transformers/ds_base.py +++ b/deepspeed/model_implementations/transformers/ds_base.py @@ -4,6 +4,7 @@ import torch.nn as nn class DeepSpeedTransformerBase(nn.module): + def __init__(self): pass diff --git a/deepspeed/model_implementations/transformers/ds_bert.py b/deepspeed/model_implementations/transformers/ds_bert.py index 3e7a7b77f..42f9ec955 100644 --- a/deepspeed/model_implementations/transformers/ds_bert.py +++ b/deepspeed/model_implementations/transformers/ds_bert.py @@ -8,6 +8,7 @@ from deepspeed.model_implementations.transformers.ds_transformer import DeepSpee class DeepSpeedBERTInference(DeepSpeedTransformerInference): """Initialize the DeepSpeed BERT Transformer Layer. """ + def __init__(self, config, mp_group=None, @@ -15,9 +16,4 @@ class DeepSpeedBERTInference(DeepSpeedTransformerInference): quantize_groups=1, merge_count=1, mlp_extra_grouping=False): - super().__init__(config, - mp_group, - quantize_scales, - quantize_groups, - merge_count, - mlp_extra_grouping) + super().__init__(config, mp_group, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping) diff --git a/deepspeed/model_implementations/transformers/ds_bloom.py b/deepspeed/model_implementations/transformers/ds_bloom.py index 386352f28..675df872d 100644 --- a/deepspeed/model_implementations/transformers/ds_bloom.py +++ b/deepspeed/model_implementations/transformers/ds_bloom.py @@ -8,6 +8,7 @@ from deepspeed.model_implementations.transformers.ds_transformer import DeepSpee class DeepSpeedBloomInference(DeepSpeedTransformerInference): """Initialize the DeepSpeed Bloom Transformer Layer. 
""" + def __init__(self, config, mp_group=None, @@ -15,9 +16,4 @@ class DeepSpeedBloomInference(DeepSpeedTransformerInference): quantize_groups=1, merge_count=1, mlp_extra_grouping=False): - super().__init__(config, - mp_group, - quantize_scales, - quantize_groups, - merge_count, - mlp_extra_grouping) + super().__init__(config, mp_group, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping) diff --git a/deepspeed/model_implementations/transformers/ds_gpt.py b/deepspeed/model_implementations/transformers/ds_gpt.py index 86cc9fdc6..d18cc6d89 100644 --- a/deepspeed/model_implementations/transformers/ds_gpt.py +++ b/deepspeed/model_implementations/transformers/ds_gpt.py @@ -8,6 +8,7 @@ from deepspeed.model_implementations.transformers.ds_transformer import DeepSpee class DeepSpeedGPTInference(DeepSpeedTransformerInference): """Initialize the DeepSpeed GPT Transformer Layer. """ + def __init__(self, config, mp_group=None, @@ -15,9 +16,4 @@ class DeepSpeedGPTInference(DeepSpeedTransformerInference): quantize_groups=1, merge_count=1, mlp_extra_grouping=False): - super().__init__(config, - mp_group, - quantize_scales, - quantize_groups, - merge_count, - mlp_extra_grouping) + super().__init__(config, mp_group, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping) diff --git a/deepspeed/model_implementations/transformers/ds_megatron_gpt.py b/deepspeed/model_implementations/transformers/ds_megatron_gpt.py index aca6b809e..9c6e8b705 100644 --- a/deepspeed/model_implementations/transformers/ds_megatron_gpt.py +++ b/deepspeed/model_implementations/transformers/ds_megatron_gpt.py @@ -8,6 +8,7 @@ from deepspeed.model_implementations.transformers.ds_transformer import DeepSpee class DeepSpeedMegatronGPTInference(DeepSpeedTransformerInference): """Initialize the DeepSpeed Megatron GPT Transformer Layer. """ + def __init__(self, config, mp_group=None, @@ -15,9 +16,4 @@ class DeepSpeedMegatronGPTInference(DeepSpeedTransformerInference): quantize_groups=1, merge_count=1, mlp_extra_grouping=False): - super().__init__(config, - mp_group, - quantize_scales, - quantize_groups, - merge_count, - mlp_extra_grouping) + super().__init__(config, mp_group, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping) diff --git a/deepspeed/model_implementations/transformers/ds_opt.py b/deepspeed/model_implementations/transformers/ds_opt.py index a5209a30f..bdce3ca2d 100644 --- a/deepspeed/model_implementations/transformers/ds_opt.py +++ b/deepspeed/model_implementations/transformers/ds_opt.py @@ -8,6 +8,7 @@ from deepspeed.model_implementations.transformers.ds_transformer import DeepSpee class DeepSpeedOPTInference(DeepSpeedTransformerInference): """Initialize the DeepSpeed OPT Transformer Layer. 
""" + def __init__(self, config, mp_group=None, @@ -15,9 +16,4 @@ class DeepSpeedOPTInference(DeepSpeedTransformerInference): quantize_groups=1, merge_count=1, mlp_extra_grouping=False): - super().__init__(config, - mp_group, - quantize_scales, - quantize_groups, - merge_count, - mlp_extra_grouping) + super().__init__(config, mp_group, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping) diff --git a/deepspeed/model_implementations/transformers/ds_transformer.py b/deepspeed/model_implementations/transformers/ds_transformer.py index 1a18c175b..2a5f7b191 100644 --- a/deepspeed/model_implementations/transformers/ds_transformer.py +++ b/deepspeed/model_implementations/transformers/ds_transformer.py @@ -56,33 +56,17 @@ class DeepSpeedTransformerInference(nn.Module): log_dist(f"DeepSpeed-Inference config: {self.config.__dict__}", [0]) if self.config.bigscience_bloom: - self.attention = BloomSelfAttention(self.config, - mp_group, - quantize_scales, - quantize_groups, - merge_count) + self.attention = BloomSelfAttention(self.config, mp_group, quantize_scales, quantize_groups, merge_count) else: - self.attention = DeepSpeedSelfAttention(self.config, - mp_group, - quantize_scales, - quantize_groups, + self.attention = DeepSpeedSelfAttention(self.config, mp_group, quantize_scales, quantize_groups, merge_count) - self.mlp = DeepSpeedMLP(self.config, - mp_group, - quantize_scales, - quantize_groups, - merge_count, + self.mlp = DeepSpeedMLP(self.config, mp_group, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping) - device = get_accelerator().current_device_name( - ) # if config.bigscience_bloom else 'cpu' - self.norm_w = nn.Parameter(torch.empty(self.config.hidden_size, - dtype=data_type, - device=device), + device = get_accelerator().current_device_name() # if config.bigscience_bloom else 'cpu' + self.norm_w = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type, device=device), requires_grad=False) - self.norm_b = nn.Parameter(torch.empty(self.config.hidden_size, - dtype=data_type, - device=device), + self.norm_b = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type, device=device), requires_grad=False) self.layer_past = None self.allocate_workspace = inference_cuda_module.allocate_workspace_fp32 if (not config.fp16) else \ @@ -122,20 +106,15 @@ class DeepSpeedTransformerInference(nn.Module): if "hidden_states" in kwargs: input = kwargs["hidden_states"] - input_mask = (input_mask if attn_mask is None else - attn_mask) if attention_mask is None else attention_mask + input_mask = (input_mask if attn_mask is None else attn_mask) if attention_mask is None else attention_mask # Allocate memory only on first layer forward if self.config.layer_id == 0: - self.allocate_workspace(self.config.hidden_size, - self.config.heads, + self.allocate_workspace(self.config.hidden_size, self.config.heads, input.size()[1], - input.size()[0], - DeepSpeedTransformerInference.layer_id, - self.config.mp_size, + input.size()[0], DeepSpeedTransformerInference.layer_id, self.config.mp_size, self.config.bigscience_bloom, - dist.get_rank() if dist.is_initialized() else 0, - self.config.max_out_tokens) + dist.get_rank() if dist.is_initialized() else 0, self.config.max_out_tokens) get_present = (get_present or get_key_value or use_cache) input_mask = input_mask if attention_mask is None else attention_mask @@ -174,10 +153,7 @@ class DeepSpeedTransformerInference(nn.Module): output = self.mlp(attention_output, input, inp_norm, self.attention.attn_ob) if not 
self.config.pre_layer_norm: - output = inference_cuda_module.layer_norm(output, - self.norm_w, - self.norm_b, - self.config.epsilon) + output = inference_cuda_module.layer_norm(output, self.norm_w, self.norm_b, self.config.epsilon) output = output.to(input_type) if get_present: diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index a2c570f57..b6907081d 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -8,6 +8,7 @@ from .replace_policy import replace_policies class AutoTP(): + def in_module_list(module, module_list): for item in module_list: if type(item).__name__ == type(module).__name__: @@ -28,18 +29,7 @@ class AutoTP(): return mlist def supported(model): - unsupported = [ - 'bloom', - 'codegen', - 'deberta', - 'flaubert', - 'fsmt', - 'gpt2', - 'led', - 'longformer', - 'xlm', - 'xlnet' - ] + unsupported = ['bloom', 'codegen', 'deberta', 'flaubert', 'fsmt', 'gpt2', 'led', 'longformer', 'xlm', 'xlnet'] model = str(model) key = re.search(r": (.*?)Model", model) if key is None: @@ -56,8 +46,7 @@ class AutoTP(): for key, submodule in module._modules.items(): if isinstance(submodule, nn.Linear): layer_list = layer_list + [parent + "." + key] - elif isinstance(submodule, - nn.LayerNorm) or key == 'LayerNorm' or key == 'layer_norm': + elif isinstance(submodule, nn.LayerNorm) or key == 'LayerNorm' or key == 'layer_norm': layer_list = layer_list + ["ln"] else: layer_list = layer_list + AutoTP.get_layers(key, submodule) @@ -102,9 +91,7 @@ class AutoTP(): for key, submodule in module._modules.items(): if isinstance(submodule, nn.Linear): layer_list = layer_list + ["." + key] - elif isinstance( - submodule, - nn.LayerNorm) or key == 'LayerNorm' or key == 'layer_norm': + elif isinstance(submodule, nn.LayerNorm) or key == 'LayerNorm' or key == 'layer_norm': layer_list = layer_list + ["ln"] else: layer_list = layer_list + AutoTP.get_layers(key, submodule) diff --git a/deepspeed/module_inject/containers/base.py b/deepspeed/module_inject/containers/base.py index 45faeb477..aa077dd0d 100644 --- a/deepspeed/module_inject/containers/base.py +++ b/deepspeed/module_inject/containers/base.py @@ -15,6 +15,7 @@ class BaseConvolutionContainer(ABC): class BaseTransformerContainer(ABC): + def __init__(self, policy, config, model_config, layer_id, child): self.policy = policy self.config = config @@ -40,18 +41,14 @@ class BaseTransformerContainer(ABC): hasattr(self.model_config, 'layernorm_epsilon') else 1.0e-12) self.return_tuple = self.config.return_tuple self.triangular_masking = True - self.local_attention = ((self.model_config.attention_layers[self.layer_id] - == "local") if hasattr(self.model_config, - 'attention_layers') else False) + self.local_attention = ((self.model_config.attention_layers[self.layer_id] == "local") if hasattr( + self.model_config, 'attention_layers') else False) self.window_size = getattr(self.model_config, "window_size", 1) self.mlp_act_func_type = self.policy.mlp_act_func_type self.training_mp_size = self.config.training_mp_size self.bigscience_bloom = False self.max_out_tokens = self.config.max_out_tokens - self.scale_attn_by_inverse_layer_idx = getattr( - self.config, - "scale_attn_by_inverse_layer_idx", - False) + self.scale_attn_by_inverse_layer_idx = getattr(self.config, "scale_attn_by_inverse_layer_idx", False) self.use_mup = self.policy.use_mup self.return_single_tuple = False self.rotary_dim = self.model_config.rotary_dim if hasattr(self.model_config, 'rotary_dim') \ @@ -168,10 +165,8 @@ class 
BaseTransformerContainer(ABC): self.mlp_quantization() def attention_quantization(self): - self.module.attention.attn_qkvw = self.quantizer.quantize( - self.module.attention.attn_qkvw) - self.module.attention.attn_ow = self.quantizer.quantize( - self.module.attention.attn_ow) + self.module.attention.attn_qkvw = self.quantizer.quantize(self.module.attention.attn_qkvw) + self.module.attention.attn_ow = self.quantizer.quantize(self.module.attention.attn_ow) def mlp_quantization(self): self.module.mlp.inter_w = self.quantizer.quantize(self.module.mlp.inter_w) @@ -190,18 +185,12 @@ class BaseTransformerContainer(ABC): self.apply_weight_quantization() def attention_qkv_mp(self, mp_replace): - self.module.attention.attn_qkvw = mp_replace.qkv_copy( - self.module.attention.attn_qkvw, - self.qkvw) - self.module.attention.attn_qkvb = mp_replace.qkv_copy( - self.module.attention.attn_qkvb, - self.qkvb) + self.module.attention.attn_qkvw = mp_replace.qkv_copy(self.module.attention.attn_qkvw, self.qkvw) + self.module.attention.attn_qkvb = mp_replace.qkv_copy(self.module.attention.attn_qkvb, self.qkvb) def attention_o_mp(self, mp_replace): - self.module.attention.attn_ow = mp_replace.copy(self.module.attention.attn_ow, - self.dense_w) - self.module.attention.attn_ob = mp_replace.copy(self.module.attention.attn_ob, - self.dense_b) + self.module.attention.attn_ow = mp_replace.copy(self.module.attention.attn_ow, self.dense_w) + self.module.attention.attn_ob = mp_replace.copy(self.module.attention.attn_ob, self.dense_b) def mlp_inter_mp(self, mp_replace): self.module.mlp.inter_w = mp_replace.copy(self.module.mlp.inter_w, self._h4h_w) @@ -216,15 +205,11 @@ class BaseTransformerContainer(ABC): self.module.mlp.attn_nw = self.attn_nw self.module.mlp.attn_nb = self.attn_nb else: - self.module.mlp.attn_nw.data.copy_( - self.attn_nw.to(get_accelerator().current_device_name())) - self.module.mlp.attn_nb.data.copy_( - self.attn_nb.to(get_accelerator().current_device_name())) + self.module.mlp.attn_nw.data.copy_(self.attn_nw.to(get_accelerator().current_device_name())) + self.module.mlp.attn_nb.data.copy_(self.attn_nb.to(get_accelerator().current_device_name())) - self.module.norm_w.data.copy_( - self.input_nw.to(get_accelerator().current_device_name())) - self.module.norm_b.data.copy_( - self.input_nb.to(get_accelerator().current_device_name())) + self.module.norm_w.data.copy_(self.input_nw.to(get_accelerator().current_device_name())) + self.module.norm_b.data.copy_(self.input_nb.to(get_accelerator().current_device_name())) def transpose(self): self.transpose_attention() diff --git a/deepspeed/module_inject/containers/base_moe.py b/deepspeed/module_inject/containers/base_moe.py index 4139b08d9..3abde6729 100644 --- a/deepspeed/module_inject/containers/base_moe.py +++ b/deepspeed/module_inject/containers/base_moe.py @@ -8,6 +8,7 @@ from deepspeed.accelerator import get_accelerator class BaseTransformerMoEContainer(BaseTransformerContainer): + def __init__(self, **kwargs): # Call the init function of the parent class to initialize the tensors and configs from parent class super().__init__(**kwargs) @@ -16,9 +17,7 @@ class BaseTransformerMoEContainer(BaseTransformerContainer): self.ep_world_size = dist.get_world_size() self.local_ep_size = 1 if self.num_experts < self.ep_world_size else self.num_experts // self.ep_world_size - self.layer_norm_eps = self.config.layer_norm_eps if hasattr( - self.config, - 'layer_norm_eps') else 1e-12, + self.layer_norm_eps = self.config.layer_norm_eps if hasattr(self.config, 
'layer_norm_eps') else 1e-12, # MoE models will have a list of mlp related tensors self._h4h_w = [] @@ -102,40 +101,27 @@ class BaseTransformerMoEContainer(BaseTransformerContainer): gpu_index = dist.get_rank() for ep_index in range(self.local_ep_size): # mlp inter - self.module.mlp[ep_index].inter_w.data = self._h4h_w[ - gpu_index * self.local_ep_size + ep_index].to( - get_accelerator().current_device_name()) - self.module.mlp[ep_index].inter_b.data = self._h4h_b[ - gpu_index * self.local_ep_size + ep_index].to( - get_accelerator().current_device_name()) + self.module.mlp[ep_index].inter_w.data = self._h4h_w[gpu_index * self.local_ep_size + ep_index].to( + get_accelerator().current_device_name()) + self.module.mlp[ep_index].inter_b.data = self._h4h_b[gpu_index * self.local_ep_size + ep_index].to( + get_accelerator().current_device_name()) # mlp output - self.module.mlp[ep_index].output_w.data = self._4hh_w[ - gpu_index * self.local_ep_size + ep_index].to( - get_accelerator().current_device_name()) - self.module.mlp[ep_index].output_b.data = self._4hh_b[ - gpu_index * self.local_ep_size + ep_index].to( - get_accelerator().current_device_name()) + self.module.mlp[ep_index].output_w.data = self._4hh_w[gpu_index * self.local_ep_size + ep_index].to( + get_accelerator().current_device_name()) + self.module.mlp[ep_index].output_b.data = self._4hh_b[gpu_index * self.local_ep_size + ep_index].to( + get_accelerator().current_device_name()) def copy_data_to_new_module(self): - self.module.attn_nw.data = self.attn_nw.to( - get_accelerator().current_device_name()) - self.module.attn_nb.data = self.attn_nb.to( - get_accelerator().current_device_name()) + self.module.attn_nw.data = self.attn_nw.to(get_accelerator().current_device_name()) + self.module.attn_nb.data = self.attn_nb.to(get_accelerator().current_device_name()) - self.module.norm_w.data.copy_( - self.input_nw.to(get_accelerator().current_device_name())) - self.module.norm_b.data.copy_( - self.input_nb.to(get_accelerator().current_device_name())) + self.module.norm_w.data.copy_(self.input_nw.to(get_accelerator().current_device_name())) + self.module.norm_b.data.copy_(self.input_nb.to(get_accelerator().current_device_name())) if self.config.moe.type == 'residual': - self.module.res_mlp.inter_w.data = self._res_h4h_w.to( - get_accelerator().current_device_name()) - self.module.res_mlp.inter_b.data = self._res_h4h_b.to( - get_accelerator().current_device_name()) - self.module.res_mlp.output_w.data = self._res_4hh_w.to( - get_accelerator().current_device_name()) - self.module.res_mlp.output_b.data = self._res_4hh_b.to( - get_accelerator().current_device_name()) - self.module.res_coef.data = self._res_coef.to( - get_accelerator().current_device_name()) + self.module.res_mlp.inter_w.data = self._res_h4h_w.to(get_accelerator().current_device_name()) + self.module.res_mlp.inter_b.data = self._res_h4h_b.to(get_accelerator().current_device_name()) + self.module.res_mlp.output_w.data = self._res_4hh_w.to(get_accelerator().current_device_name()) + self.module.res_mlp.output_b.data = self._res_4hh_b.to(get_accelerator().current_device_name()) + self.module.res_coef.data = self._res_coef.to(get_accelerator().current_device_name()) diff --git a/deepspeed/module_inject/containers/bert.py b/deepspeed/module_inject/containers/bert.py index 95d8b485a..8820c478a 100644 --- a/deepspeed/module_inject/containers/bert.py +++ b/deepspeed/module_inject/containers/bert.py @@ -8,6 +8,7 @@ from ..policy import TransformerPolicy class 
DS_BERTContainer(BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -23,6 +24,7 @@ class DS_BERTContainer(BaseTransformerContainer): class HFBertLayerPolicy(TransformerPolicy): + def __init__(self, client_module, inference=False): super().__init__(inference, pre_attn_norm=False) self.client_module = client_module diff --git a/deepspeed/module_inject/containers/bloom.py b/deepspeed/module_inject/containers/bloom.py index eedf85144..d24349590 100644 --- a/deepspeed/module_inject/containers/bloom.py +++ b/deepspeed/module_inject/containers/bloom.py @@ -11,6 +11,7 @@ supported_models = {None} class DS_BloomContainer(MetaTensorContainer, BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -25,12 +26,8 @@ class DS_BloomContainer(MetaTensorContainer, BaseTransformerContainer): return self.module def attention_qkv_mp(self, mp_replace): - self.module.attention.attn_qkvw = mp_replace.copy( - self.module.attention.attn_qkvw, - self.qkvw) - self.module.attention.attn_qkvb = mp_replace.copy( - self.module.attention.attn_qkvb, - self.qkvb) + self.module.attention.attn_qkvw = mp_replace.copy(self.module.attention.attn_qkvw, self.qkvw) + self.module.attention.attn_qkvb = mp_replace.copy(self.module.attention.attn_qkvb, self.qkvb) def load_params(self, module, sd, weight_quantizer, mp_replace, prefix): param_names = ( @@ -58,51 +55,28 @@ class DS_BloomContainer(MetaTensorContainer, BaseTransformerContainer): megatron_v2=self.policy.is_megatron_v2, split_qkv=self.policy.split_qkv) for i in range(2, 4): - maybe_copy(module.attention, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i], + maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[i], prefix + param_names[i]) for i in range(4, 10): - maybe_copy(module.mlp, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i], + maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[i], prefix + param_names[i]) for i in range(10, 12): - maybe_copy(module, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i], - prefix + param_names[i]) + maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[i], prefix + param_names[i]) class BLOOMLayerPolicy(TransformerPolicy): _orig_layer_class = None - def __init__(self, - client_module, - inference=True, - use_load_prefix=True, - split_qkv=False): - super().__init__(inference, - linear_layer=True, - use_load_prefix=use_load_prefix, - split_qkv=split_qkv) + def __init__(self, client_module, inference=True, use_load_prefix=True, split_qkv=False): + super().__init__(inference, linear_layer=True, use_load_prefix=use_load_prefix, split_qkv=split_qkv) self.client_module = client_module try: import transformers BLOOMLayerPolicy._orig_layer_class = transformers.models.bloom.modeling_bloom.BloomBlock global supported_models - supported_models.update( - {transformers.models.bloom.modeling_bloom.BloomModel}) + supported_models.update({transformers.models.bloom.modeling_bloom.BloomModel}) except Exception as e: - print( - f"WARNING! Setting BLOOMLayerPolicy._orig_layer_class to None due to Exception: {e}" - ) + print(f"WARNING! 
Setting BLOOMLayerPolicy._orig_layer_class to None due to Exception: {e}") BLOOMLayerPolicy._orig_layer_class = None def get_hidden_heads(self): diff --git a/deepspeed/module_inject/containers/clip.py b/deepspeed/module_inject/containers/clip.py index 8e6975091..40bd0d1f1 100644 --- a/deepspeed/module_inject/containers/clip.py +++ b/deepspeed/module_inject/containers/clip.py @@ -8,6 +8,7 @@ from ..policy import TransformerPolicy class DS_CLIPContainer(BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -21,6 +22,7 @@ class DS_CLIPContainer(BaseTransformerContainer): class HFCLIPLayerPolicy(TransformerPolicy): + def __init__(self, client_module, inference=False): super().__init__(inference, pre_attn_norm=True, scale_attention=True) self.client_module = client_module diff --git a/deepspeed/module_inject/containers/distil_bert.py b/deepspeed/module_inject/containers/distil_bert.py index 71f46dc8f..aa568c90c 100644 --- a/deepspeed/module_inject/containers/distil_bert.py +++ b/deepspeed/module_inject/containers/distil_bert.py @@ -8,6 +8,7 @@ from ..policy import TransformerPolicy class DS_DistilBERTContainer(BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) diff --git a/deepspeed/module_inject/containers/features/megatron.py b/deepspeed/module_inject/containers/features/megatron.py index 45a013be9..453948d27 100644 --- a/deepspeed/module_inject/containers/features/megatron.py +++ b/deepspeed/module_inject/containers/features/megatron.py @@ -5,6 +5,7 @@ from abc import ABC class MegatronContainer(ABC): + def __init__(self, **kwargs): super().__init__(**kwargs) self.megatron_v2 = self.policy.is_megatron_v2 @@ -15,23 +16,13 @@ class MegatronContainer(ABC): x_1 = x.view(*new_x_shape) (q, k, v) = torch.split(x_1, (x_1.shape[-1] // 3), dim=(x_1.dim() - 1)) if len(q.shape) > 2: - return torch.cat((q.reshape(q.shape[0], - -1), - k.reshape(q.shape[0], - -1), - v.reshape(q.shape[0], - -1)), + return torch.cat((q.reshape(q.shape[0], -1), k.reshape(q.shape[0], -1), v.reshape(q.shape[0], -1)), dim=-1).reshape(x.shape) else: - return torch.cat((q.reshape(-1), - k.reshape(-1), - v.reshape(-1)), - dim=-1).reshape(x.shape) + return torch.cat((q.reshape(-1), k.reshape(-1), v.reshape(-1)), dim=-1).reshape(x.shape) def transpose(self): super().transpose() if self.megatron_v2: - self.qkvw = torch.nn.parameter.Parameter( - self.transpose_qkv_alignment(self.qkvw).contiguous()) - self.qkvb = torch.nn.parameter.Parameter( - self.transpose_qkv_alignment(self.qkvb).contiguous()) + self.qkvw = torch.nn.parameter.Parameter(self.transpose_qkv_alignment(self.qkvw).contiguous()) + self.qkvb = torch.nn.parameter.Parameter(self.transpose_qkv_alignment(self.qkvb).contiguous()) diff --git a/deepspeed/module_inject/containers/features/meta_tensor.py b/deepspeed/module_inject/containers/features/meta_tensor.py index 5b63c5cf5..a0756bcea 100644 --- a/deepspeed/module_inject/containers/features/meta_tensor.py +++ b/deepspeed/module_inject/containers/features/meta_tensor.py @@ -4,6 +4,7 @@ from abc import ABC, abstractmethod class MetaTensorContainer(ABC): + def __init__(self, **kwargs): super().__init__(**kwargs) self.is_meta = False @@ -53,6 +54,5 @@ class MetaTensorContainer(ABC): of q, k, and v are stored together and needs to split in the DeepSpeed-Inference API. 
""" - raise NotImplementedError( - "A load_params() function must be defined in the model container \ + raise NotImplementedError("A load_params() function must be defined in the model container \ when inheriting the MetaTensorContainer feature") diff --git a/deepspeed/module_inject/containers/gpt2.py b/deepspeed/module_inject/containers/gpt2.py index dc194d71d..1c9e43534 100644 --- a/deepspeed/module_inject/containers/gpt2.py +++ b/deepspeed/module_inject/containers/gpt2.py @@ -6,6 +6,7 @@ from ..policy import TransformerPolicy class DS_GPT2Container(BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) diff --git a/deepspeed/module_inject/containers/gptj.py b/deepspeed/module_inject/containers/gptj.py index 35472c100..2a76cb2e0 100644 --- a/deepspeed/module_inject/containers/gptj.py +++ b/deepspeed/module_inject/containers/gptj.py @@ -12,6 +12,7 @@ from ..policy import maybe_copy_qkv class DS_GPTJContainer(MetaTensorContainer, BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -36,36 +37,20 @@ class DS_GPTJContainer(MetaTensorContainer, BaseTransformerContainer): 'ln_1.weight', \ 'ln_1.bias' ) - maybe_copy_qkv( - module.attention, - sd, - weight_quantizer, - mp_replace, - 'attn_qkvw', - [prefix + param_names[0], - prefix + param_names[1], - prefix + param_names[2]], - split_qkv=self.policy.split_qkv) - for i in range(3, 4): - maybe_copy(module.attention, + maybe_copy_qkv(module.attention, sd, weight_quantizer, mp_replace, - transformer_param_names[i - 1], + 'attn_qkvw', [prefix + param_names[0], prefix + param_names[1], prefix + param_names[2]], + split_qkv=self.policy.split_qkv) + for i in range(3, 4): + maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[i - 1], prefix + param_names[i]) for i in range(4, 8): - maybe_copy(module.mlp, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i], + maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[i], prefix + param_names[i]) for i in range(8, 10): - maybe_copy(module, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i + 2], + maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[i + 2], prefix + param_names[i]) diff --git a/deepspeed/module_inject/containers/gptneo.py b/deepspeed/module_inject/containers/gptneo.py index a8f206f51..80e422104 100644 --- a/deepspeed/module_inject/containers/gptneo.py +++ b/deepspeed/module_inject/containers/gptneo.py @@ -12,6 +12,7 @@ from ..policy import maybe_copy_qkv class DS_GPTNEOContainer(MetaTensorContainer, BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -39,40 +40,25 @@ class DS_GPTNEOContainer(MetaTensorContainer, BaseTransformerContainer): 'ln_1.weight', \ 'ln_1.bias' ) - maybe_copy_qkv( - module.attention, - sd, - weight_quantizer, - mp_replace, - 'attn_qkvw', - [prefix + param_names[0], - prefix + param_names[1], - prefix + param_names[2]], - split_qkv=self.policy.split_qkv) - for i in range(3, 5): - maybe_copy(module.attention, + maybe_copy_qkv(module.attention, sd, weight_quantizer, mp_replace, - transformer_param_names[i - 1], + 'attn_qkvw', [prefix + param_names[0], prefix + param_names[1], prefix + param_names[2]], + split_qkv=self.policy.split_qkv) + for i in range(3, 5): + maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[i - 1], prefix + param_names[i]) for i in range(5, 11): - maybe_copy(module.mlp, - sd, - weight_quantizer, - 
mp_replace, - transformer_param_names[i - 1], + maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[i - 1], prefix + param_names[i]) for i in range(11, 13): - maybe_copy(module, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i - 1], + maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[i - 1], prefix + param_names[i]) class HFGPTNEOLayerPolicy(TransformerPolicy): + def __init__(self, client_module, inference=True): super().__init__(inference, scale_attention=False) self.client_module = client_module diff --git a/deepspeed/module_inject/containers/gptneox.py b/deepspeed/module_inject/containers/gptneox.py index ebf2db025..ad2b88297 100644 --- a/deepspeed/module_inject/containers/gptneox.py +++ b/deepspeed/module_inject/containers/gptneox.py @@ -11,9 +11,8 @@ from ..policy import maybe_copy from packaging import version as pkg_version -class DS_GPTNEOXContainer(MetaTensorContainer, - MegatronContainer, - BaseTransformerContainer): +class DS_GPTNEOXContainer(MetaTensorContainer, MegatronContainer, BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -57,26 +56,13 @@ class DS_GPTNEOXContainer(MetaTensorContainer, split_qkv=self.policy.split_qkv, heads=self.policy.client_module.attention.num_attention_heads) for i in range(2, 4): - maybe_copy(module.attention, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i], + maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[i], prefix + param_names[i]) for i in range(4, 10): - maybe_copy(module.mlp, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i], + maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[i], prefix + param_names[i]) for i in range(10, 12): - maybe_copy(module, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i], - prefix + param_names[i]) + maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[i], prefix + param_names[i]) class GPTNEOXLayerPolicy(TransformerPolicy): diff --git a/deepspeed/module_inject/containers/megatron_gpt.py b/deepspeed/module_inject/containers/megatron_gpt.py index 7a8db9108..3b5c3bc43 100644 --- a/deepspeed/module_inject/containers/megatron_gpt.py +++ b/deepspeed/module_inject/containers/megatron_gpt.py @@ -9,6 +9,7 @@ from packaging import version as pkg_version class DS_MegatronGPTContainer(MegatronContainer, BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -36,9 +37,7 @@ class MegatronLayerPolicy(TransformerPolicy): use_mup = False def __init__(self, client_module, inference=True): - super().__init__(inference, - megatron_v2=MegatronLayerPolicy.megatron_v2, - use_mup=MegatronLayerPolicy.use_mup) + super().__init__(inference, megatron_v2=MegatronLayerPolicy.megatron_v2, use_mup=MegatronLayerPolicy.use_mup) self.client_module = client_module # we use megatron version to differentiate between the old and new # megatron-lm source code diff --git a/deepspeed/module_inject/containers/megatron_gpt_moe.py b/deepspeed/module_inject/containers/megatron_gpt_moe.py index 296816130..d6d189b4f 100644 --- a/deepspeed/module_inject/containers/megatron_gpt_moe.py +++ b/deepspeed/module_inject/containers/megatron_gpt_moe.py @@ -10,6 +10,7 @@ from packaging import version as pkg_version class DS_MegatronGPTMoEContainer(MegatronContainer, BaseTransformerMoEContainer): + def __init__(self, policy, config, model_config, layer_id): 
super().__init__(policy, config, model_config, layer_id) diff --git a/deepspeed/module_inject/containers/opt.py b/deepspeed/module_inject/containers/opt.py index 8f9c30bca..25c2bcdcb 100644 --- a/deepspeed/module_inject/containers/opt.py +++ b/deepspeed/module_inject/containers/opt.py @@ -13,6 +13,7 @@ from deepspeed.utils.types import ActivationFuncType class DS_OPTContainer(MetaTensorContainer, BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -50,32 +51,16 @@ class DS_OPTContainer(MetaTensorContainer, BaseTransformerContainer): weight_quantizer, mp_replace, transformer_param_names[i // 3], - [ - prefix + param_names[i], - prefix + param_names[i + 1], - prefix + param_names[i + 2] - ], + [prefix + param_names[i], prefix + param_names[i + 1], prefix + param_names[i + 2]], split_qkv=self.policy.split_qkv) for i in range(6, 8): - maybe_copy(module.attention, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i - 4], + maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[i - 4], prefix + param_names[i]) for i in range(8, 14): - maybe_copy(module.mlp, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i - 4], + maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[i - 4], prefix + param_names[i]) for i in range(14, 16): - maybe_copy(module, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i - 4], + maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[i - 4], prefix + param_names[i]) @@ -93,8 +78,7 @@ class HFOPTLayerPolicy(TransformerPolicy): try: import transformers HFOPTLayerPolicy._orig_layer_class = transformers.models.opt.modeling_opt.OPTDecoderLayer - if isinstance(TransformerPolicy.hf_model_config, - transformers.models.opt.configuration_opt.OPTConfig): + if isinstance(TransformerPolicy.hf_model_config, transformers.models.opt.configuration_opt.OPTConfig): self.pre_attn_norm = TransformerPolicy.hf_model_config.do_layer_norm_before except: HFOPTLayerPolicy._orig_layer_class = None diff --git a/deepspeed/module_inject/containers/unet.py b/deepspeed/module_inject/containers/unet.py index 461ca12bf..cc979db0a 100644 --- a/deepspeed/module_inject/containers/unet.py +++ b/deepspeed/module_inject/containers/unet.py @@ -9,6 +9,7 @@ from ...model_implementations.diffusers.unet import DSUNet class UNetPolicy(DSPolicy): + def __init__(self): super().__init__() try: diff --git a/deepspeed/module_inject/containers/vae.py b/deepspeed/module_inject/containers/vae.py index c873f9768..1a9f03453 100644 --- a/deepspeed/module_inject/containers/vae.py +++ b/deepspeed/module_inject/containers/vae.py @@ -6,6 +6,7 @@ from ...model_implementations.diffusers.vae import DSVAE class VAEPolicy(DSPolicy): + def __init__(self): super().__init__() try: diff --git a/deepspeed/module_inject/inject.py b/deepspeed/module_inject/inject.py index 384bb7279..5d31490e4 100755 --- a/deepspeed/module_inject/inject.py +++ b/deepspeed/module_inject/inject.py @@ -5,30 +5,22 @@ import torch from deepspeed.ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig -def module_inject(layer_obj, - model, - config, - micro_batch_size, - max_seq_length, - seed, - preln, - fp16=True): +def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, seed, preln, fp16=True): for name, child in model.named_children(): if isinstance(child, layer_obj): print('REPLACING BertLayer') - cuda_config = DeepSpeedTransformerConfig( - 
batch_size=micro_batch_size, - max_seq_length=max_seq_length, - hidden_size=config.hidden_size, - heads=config.num_attention_heads, - attn_dropout_ratio=config.attention_probs_dropout_prob, - hidden_dropout_ratio=config.hidden_dropout_prob, - num_hidden_layers=config.num_hidden_layers, - initializer_range=config.initializer_range, - seed=seed, - fp16=fp16, - pre_layer_norm=preln) + cuda_config = DeepSpeedTransformerConfig(batch_size=micro_batch_size, + max_seq_length=max_seq_length, + hidden_size=config.hidden_size, + heads=config.num_attention_heads, + attn_dropout_ratio=config.attention_probs_dropout_prob, + hidden_dropout_ratio=config.hidden_dropout_prob, + num_hidden_layers=config.num_hidden_layers, + initializer_range=config.initializer_range, + seed=seed, + fp16=fp16, + pre_layer_norm=preln) new_module = DeepSpeedTransformerLayer(cuda_config) @@ -71,14 +63,7 @@ def module_inject(layer_obj, setattr(model, name, copy.deepcopy(new_module)) else: - module_inject(layer_obj, - child, - config, - micro_batch_size, - max_seq_length, - seed, - preln, - fp16) + module_inject(layer_obj, child, config, micro_batch_size, max_seq_length, seed, preln, fp16) return model diff --git a/deepspeed/module_inject/layers.py b/deepspeed/module_inject/layers.py index 0636e8928..4e1e482d3 100644 --- a/deepspeed/module_inject/layers.py +++ b/deepspeed/module_inject/layers.py @@ -10,6 +10,7 @@ from deepspeed.accelerator import get_accelerator class LinearAllreduce(nn.Module): + def __init__(self, weight, bias=None, mp_group=None): super(LinearAllreduce, self).__init__() self.weight = weight @@ -26,6 +27,7 @@ class LinearAllreduce(nn.Module): class LinearLayer(nn.Module): + def __init__(self, weight_shape=None, dtype=torch.half, weight=None, bias=None): super(LinearLayer, self).__init__() if weight is not None: @@ -33,9 +35,7 @@ class LinearLayer(nn.Module): self.bias = bias else: self.weight = Parameter( - torch.empty(weight_shape, - dtype=dtype, - device=get_accelerator().current_device_name())) + torch.empty(weight_shape, dtype=dtype, device=get_accelerator().current_device_name())) self.bias = Parameter( torch.empty(weight_shape[0], @@ -51,11 +51,10 @@ class LinearLayer(nn.Module): class Normalize(nn.Module): + def __init__(self, dim, dtype=torch.float, eps=1e-5): super(Normalize, self).__init__() - self.norm = nn.LayerNorm(dim, - eps=eps).to(dtype).to( - get_accelerator().current_device_name()) + self.norm = nn.LayerNorm(dim, eps=eps).to(dtype).to(get_accelerator().current_device_name()) self.weight = self.norm.weight self.bias = self.norm.bias @@ -64,13 +63,11 @@ class Normalize(nn.Module): class EmbeddingLayer(nn.Module): + def __init__(self, weight_shape, dtype=torch.half): super(EmbeddingLayer, self).__init__() self.weight = Parameter( - torch.empty(weight_shape[0], - weight_shape[1], - dtype=dtype, - device=get_accelerator().current_device_name())) + torch.empty(weight_shape[0], weight_shape[1], dtype=dtype, device=get_accelerator().current_device_name())) def forward(self, input): return F.embedding(input, self.weight) @@ -80,6 +77,7 @@ class OPTEmbedding(EmbeddingLayer): """ This module learns positional embeddings up to a fixed maximum size. """ + def __init__(self, weight_shape): # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2 # and adjust num_embeddings appropriately. 
Other models don't have this hack @@ -91,9 +89,7 @@ class OPTEmbedding(EmbeddingLayer): attention_mask = attention_mask.long() # create positions depending on attention_mask - positions = (torch.cumsum(attention_mask, - dim=1).type_as(attention_mask) * - attention_mask).long() - 1 + positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1 # cut positions if `past_key_values_length` is > 0 positions = positions[:, past_key_values_length:] diff --git a/deepspeed/module_inject/load_checkpoint.py b/deepspeed/module_inject/load_checkpoint.py index 4fc094fb9..565ac895d 100644 --- a/deepspeed/module_inject/load_checkpoint.py +++ b/deepspeed/module_inject/load_checkpoint.py @@ -50,10 +50,8 @@ def load_model_with_checkpoint(r_module, if prefix + 'bias' in sd[0].keys(): if module.bias.data.is_meta: # meta tensor cannot be casted or copied to, so we need to replace it with a normal tensor here - module.bias = torch.nn.parameter.Parameter( - data=torch.empty_like(module.bias.data, - device="cpu"), - requires_grad=module.bias.data.requires_grad) + module.bias = torch.nn.parameter.Parameter(data=torch.empty_like(module.bias.data, device="cpu"), + requires_grad=module.bias.data.requires_grad) module.bias = mp_replace.copy(module.bias.data, sd[0][prefix + 'bias']) args = None gc.collect() @@ -71,86 +69,62 @@ def load_model_with_checkpoint(r_module, # set the quantizer number of groups using the checkpoint scale shape weight_quantizer.num_groups = scale.shape[0] else: - tmp_data = sd[0][prefix + n].to( - get_accelerator().current_device_name()) + tmp_data = sd[0][prefix + n].to(get_accelerator().current_device_name()) scale = None src_shape = tmp_data.shape dst_shape = p.shape inner_dim = 1 if tmp_data.dtype == torch.int8 else 0 outer_dim = 0 if tmp_data.dtype == torch.int8 else 1 if (len(src_shape) == 2 and len(dst_shape) == 2): - if (src_shape[inner_dim] == dst_shape[0] - and src_shape[outer_dim] == dst_shape[1]): + if (src_shape[inner_dim] == dst_shape[0] and src_shape[outer_dim] == dst_shape[1]): if tmp_data.dtype != torch.int8: p = weight_quantizer.quantize( - transpose(tmp_data) if weight_quantizer. 
- q_int8 else tmp_data) + transpose(tmp_data) if weight_quantizer.q_int8 else tmp_data) else: - p = torch.nn.parameter.Parameter(tmp_data, - requires_grad=False) + p = torch.nn.parameter.Parameter(tmp_data, requires_grad=False) p.scale = scale setattr(module, n, p) else: - dim = inner_dim if src_shape[inner_dim] != dst_shape[ - 0] else outer_dim + dim = inner_dim if src_shape[inner_dim] != dst_shape[0] else outer_dim dim1 = 0 if src_shape[inner_dim] != dst_shape[0] else 1 if src_shape[dim] > dst_shape[dim1]: - weight_partition = torch.split( - tmp_data, - dst_shape[dim1], - dim=dim)[rank].to( - get_accelerator().current_device_name()) + weight_partition = torch.split(tmp_data, dst_shape[dim1], dim=dim)[rank].to( + get_accelerator().current_device_name()) assert tmp_data.dtype != torch.int8 or scale.numel() > weight_quantizer.num_groups * (rank+1), \ '''ERROR: We require the quantization scales for larger TP-size when loading INT8 checkpoint!\ Please use the FP16 checkpoint to generate INT8 checkpoint with the sharding parameters!''' - scale = scale.view( - -1)[weight_quantizer.num_groups * - (rank + 1):].reshape( - weight_quantizer.num_groups, - -1).contiguous() + scale = scale.view(-1)[weight_quantizer.num_groups * (rank + 1):].reshape( + weight_quantizer.num_groups, -1).contiguous() else: assert tmp_data.dtype != torch.int8, \ '''Merging of the checkpoints are not supported when using INT8 checkpoint! \ Please use a as many GPUs as TP-size for the checkpoint''' all_data = [ - sd[j][prefix + - n] if type(sd[j][prefix + n]) is list else - sd[j][prefix + n].to( - get_accelerator().current_device_name()) - for j in range(len(sd)) + sd[j][prefix + n] if type(sd[j][prefix + n]) is list else sd[j][prefix + n].to( + get_accelerator().current_device_name()) for j in range(len(sd)) ] # Check if the weight tensor is for the QKV parameter - if src_shape[1] == (3 * - src_shape[0]) // ckpt_mp_size: + if src_shape[1] == (3 * src_shape[0]) // ckpt_mp_size: qkv_size = src_shape[outer_dim] // 3 src_split = [ - torch.split(src[0].data, - qkv_size, - dim=outer_dim) - for src in all_data + torch.split(src[0].data, qkv_size, dim=outer_dim) for src in all_data ] weight_partition = torch.cat([ - torch.cat([qkv_s[i] for qkv_s in src_split], - axis=outer_dim) + torch.cat([qkv_s[i] for qkv_s in src_split], axis=outer_dim) for i in range(len(src_split[0])) ], dim=dim) else: weight_partition = torch.cat([ - ad[0].to( - get_accelerator().current_device_name()) - if type(ad) is list else ad - for ad in all_data + ad[0].to(get_accelerator().current_device_name()) + if type(ad) is list else ad for ad in all_data ], dim=dim) if tmp_data.dtype == torch.int8: - scale = torch.cat([ - ad[1].to( - get_accelerator().current_device_name()) - for ad in all_data - ], - dim=dim) + scale = torch.cat( + [ad[1].to(get_accelerator().current_device_name()) for ad in all_data], + dim=dim) if tmp_data.dtype != torch.int8: weight_partition = weight_quantizer.quantize( @@ -158,9 +132,8 @@ def load_model_with_checkpoint(r_module, parallel_dim=(0 if dim == 1 else 1)) if weight_quantizer.q_int8 else \ weight_quantizer.quantize(weight_partition) else: - weight_partition = torch.nn.parameter.Parameter( - weight_partition, - requires_grad=False) + weight_partition = torch.nn.parameter.Parameter(weight_partition, + requires_grad=False) weight_partition.scale = scale setattr(module, n, weight_partition) else: @@ -168,42 +141,27 @@ def load_model_with_checkpoint(r_module, p.data.copy_(tmp_data) else: if src_shape[0] > dst_shape[0]: - bias_split = 
torch.split( - tmp_data, - dst_shape[-1])[rank].to(get_accelerator( - ).current_device_name()).contiguous() + bias_split = torch.split(tmp_data, dst_shape[-1])[rank].to( + get_accelerator().current_device_name()).contiguous() p.data.copy_(bias_split) else: # Check if the weight tensor is for the QKV parameter - if src_shape[0] == (3 * r_module.config.hidden_size - ) // ckpt_mp_size: + if src_shape[0] == (3 * r_module.config.hidden_size) // ckpt_mp_size: qkv_size = src_shape[0] // 3 src_split = [ - torch.split(sd[j][prefix + n], - qkv_size, - dim=0) for j in range(len(sd)) + torch.split(sd[j][prefix + n], qkv_size, dim=0) for j in range(len(sd)) ] p.data.copy_( - torch.cat( - [ - torch.cat([ - qkv_s[i] for qkv_s in src_split - ], - axis=0) - for i in range(len(src_split[0])) - ], - dim=0).to(get_accelerator( - ).current_device_name()).contiguous()) + torch.cat([ + torch.cat([qkv_s[i] for qkv_s in src_split], axis=0) + for i in range(len(src_split[0])) + ], + dim=0).to(get_accelerator().current_device_name()).contiguous()) else: p.data.copy_( - torch.cat( - [ - sd[j][prefix + n] - for j in range(len(sd)) - ], - dim=0).to(get_accelerator( - ).current_device_name()).contiguous()) + torch.cat([sd[j][prefix + n] for j in range(len(sd))], + dim=0).to(get_accelerator().current_device_name()).contiguous()) load_parameters(module, prefix) for n, child in module.named_children(): @@ -249,20 +207,16 @@ def load_model_with_checkpoint(r_module, setattr(module, name, child) continue child_params = list(child.parameters()) - if len(child_params) > 0 and (child_params[0].numel() == 0 - or child_params[0].is_meta): + if len(child_params) > 0 and (child_params[0].numel() == 0 or child_params[0].is_meta): if child.weight.is_meta: ds_shape = child.weight.shape else: ds_shape = child.weight.ds_shape if child.__class__ is nn.LayerNorm: - child = Normalize(dim=ds_shape[-1], - dtype=child.weight.dtype, - eps=child.eps) + child = Normalize(dim=ds_shape[-1], dtype=child.weight.dtype, eps=child.eps) setattr(module, name, child) elif child.__class__ is nn.Linear: - child = LinearLayer(weight_shape=child.weight.shape, - bias=child.bias) + child = LinearLayer(weight_shape=child.weight.shape, bias=child.bias) setattr(module, name, child) elif child.__class__ is OPTLearnedPositionalEmbedding: child = OPTEmbedding(weight_shape=ds_shape) @@ -271,8 +225,7 @@ def load_model_with_checkpoint(r_module, ds_id = None if hasattr(child.weight, 'ds_id'): ds_id = child.weight.ds_id - child = EmbeddingLayer(weight_shape=ds_shape, - dtype=child.weight.dtype) + child = EmbeddingLayer(weight_shape=ds_shape, dtype=child.weight.dtype) if ds_id is not None: all_ds_ids[ds_id] = child.weight setattr(module, name, child) diff --git a/deepspeed/module_inject/module_quantize.py b/deepspeed/module_inject/module_quantize.py index 4123a1214..90d928a68 100755 --- a/deepspeed/module_inject/module_quantize.py +++ b/deepspeed/module_inject/module_quantize.py @@ -18,34 +18,25 @@ def quantize_transformer_layer(orig_layer_impl, model, megatron=False, preln=Fal Returns: Updated nn.module with quantized transformer layers """ + def quantize_weight(weight): return weight.to(torch.int8) def megatron_layer_quantize(layer): - layer.attention.query_key_value.weight.data = quantize_weight( - layer.attention.query_key_value.weight.data) - layer.attention.dense.weight.data = quantize_weight( - layer.attention.dense.weight.data) - layer.mlp.dense_h_to_4h.weight.data = quantize_weight( - layer.mlp.dense_h_to_4h.weight.data) - layer.mlp.dense_4h_to_h.weight.data = 
quantize_weight( - layer.mlp.dense_4h_to_h.weight.data) + layer.attention.query_key_value.weight.data = quantize_weight(layer.attention.query_key_value.weight.data) + layer.attention.dense.weight.data = quantize_weight(layer.attention.dense.weight.data) + layer.mlp.dense_h_to_4h.weight.data = quantize_weight(layer.mlp.dense_h_to_4h.weight.data) + layer.mlp.dense_4h_to_h.weight.data = quantize_weight(layer.mlp.dense_4h_to_h.weight.data) def bert_layer_quantize(layer): - layer.attention.self.query.weight.data = quantize_weight( - layer.attention.self.query.weight.data) - layer.attention.self.key.weight.data = quantize_weight( - layer.attention.self.key.weight.data) - layer.attention.self.value.weight.data = quantize_weight( - layer.attention.self.value.weight.data) - layer.attention.output.dense.weight.data = quantize_weight( - layer.attention.output.dense.weight.data) + layer.attention.self.query.weight.data = quantize_weight(layer.attention.self.query.weight.data) + layer.attention.self.key.weight.data = quantize_weight(layer.attention.self.key.weight.data) + layer.attention.self.value.weight.data = quantize_weight(layer.attention.self.value.weight.data) + layer.attention.output.dense.weight.data = quantize_weight(layer.attention.output.dense.weight.data) if preln: - layer.intermediate.dense_act.weight.data = quantize_weight( - layer.intermediate.dense_act.weight.data) + layer.intermediate.dense_act.weight.data = quantize_weight(layer.intermediate.dense_act.weight.data) else: - layer.intermediate.dense.weight.data = quantize_weight( - layer.intermediate.dense.weight.data) + layer.intermediate.dense.weight.data = quantize_weight(layer.intermediate.dense.weight.data) layer.output.dense.weight.data = quantize_weight(layer.output.dense.weight.data) def quantize_fn(child): @@ -58,9 +49,7 @@ def quantize_transformer_layer(orig_layer_impl, model, megatron=False, preln=Fal return child - return quantize_module(model=model, - orig_class=orig_layer_impl, - quantize_fn=quantize_fn) + return quantize_module(model=model, orig_class=orig_layer_impl, quantize_fn=quantize_fn) def quantize_module(model, orig_class, quantize_fn): diff --git a/deepspeed/module_inject/policy.py b/deepspeed/module_inject/policy.py index dfd3343e1..ca11a5933 100644 --- a/deepspeed/module_inject/policy.py +++ b/deepspeed/module_inject/policy.py @@ -124,15 +124,10 @@ def _transpose(x, heads=1, mp_replace=None): (q, k, v) = torch.split(x_1, (x_1.shape[-1] // 3), dim=-1) if len(q.shape) > 2: new_shape = (q.shape[0], ) + (-1, ) - return torch.cat((q.reshape(new_shape), - k.reshape(new_shape), - v.reshape(new_shape)), + return torch.cat((q.reshape(new_shape), k.reshape(new_shape), v.reshape(new_shape)), dim=outer_dim).reshape(x.shape) else: - return torch.cat((q.reshape(-1), - k.reshape(-1), - v.reshape(-1)), - dim=-1).reshape(x.shape) + return torch.cat((q.reshape(-1), k.reshape(-1), v.reshape(-1)), dim=-1).reshape(x.shape) # This checks if the parameter exits in the checkpoint file and maybe copies it into the corresponding destination tensor. 
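For readers skimming the reformatted loops above: the maybe_copy-style helpers reduce to "look up an optional key in the checkpoint state dict and, if present, copy it into the destination parameter". The sketch below illustrates only that core pattern under assumed names (copy_if_present, _TinyBlock); the real helper additionally handles quantization, transposition and tensor-parallel sharding via mp_replace.

import torch


def copy_if_present(module, sd, dst_name, src_name):
    # Only copy when the checkpoint actually contains the source tensor.
    if src_name in sd:
        dst = getattr(module, dst_name)
        with torch.no_grad():
            dst.copy_(sd[src_name].reshape(dst.shape))


class _TinyBlock(torch.nn.Module):

    def __init__(self):
        super().__init__()
        self.attn_ow = torch.nn.Parameter(torch.zeros(4, 4))


sd = {'layer.0.attention.dense.weight': torch.ones(4, 4)}
block = _TinyBlock()
copy_if_present(block, sd, 'attn_ow', 'layer.0.attention.dense.weight')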
@@ -156,19 +151,14 @@ def maybe_copy(module, else: dst = mp_replace.copy(dst, tmp) if qkv and megatron_v2: - dst = torch.nn.parameter.Parameter( - _transpose(dst, - heads=heads, - mp_replace=mp_replace).contiguous()) + dst = torch.nn.parameter.Parameter(_transpose(dst, heads=heads, mp_replace=mp_replace).contiguous()) else: if split_qkv: dst = mp_replace.qkv_copy(dst, weight_quantizer.quantize(tmp if weight_quantizer.q_int8 else \ (transpose(tmp).contiguous())), int8=weight_quantizer.q_int8) else: if qkv and megatron_v2: - tmp = _transpose(transpose(tmp), - heads=heads, - mp_replace=mp_replace).contiguous() + tmp = _transpose(transpose(tmp), heads=heads, mp_replace=mp_replace).contiguous() if weight_quantizer.q_int8: tmp = transpose(tmp) dst = mp_replace.copy(dst, weight_quantizer.quantize(tmp if weight_quantizer.q_int8 else \ @@ -177,13 +167,7 @@ def maybe_copy(module, # Extending the maybe_copy function for when the q, k, and v are in separate parameters! -def maybe_copy_qkv(module, - sd, - weight_quantizer, - mp_replace, - dst_name, - src_names, - split_qkv=False): +def maybe_copy_qkv(module, sd, weight_quantizer, mp_replace, dst_name, src_names, split_qkv=False): if src_names[0] in sd: q = sd[src_names[0]] k = sd[src_names[1]] diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index fd67bfa95..9b038f398 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -23,6 +23,7 @@ from .utils import policy_to_ds_container class ReplaceWithTensorSlicing: + def __init__(self, mp_group=None, mp_size=1, out_dim=1, in_dim=0): if mp_group is not None: self.gpu_index = dist.get_rank(group=mp_group) @@ -58,32 +59,22 @@ class ReplaceWithTensorSlicing: if self.out_dim == 1: self.merge_assert(src_shape[outer_dim], dst_shape[self.out_dim]) qkv_size = dst_shape[self.out_dim] // 3 - qkv_split = [ - torch.split(src_s, - qkv_size, - dim=outer_dim) for src_s in src_split - ] + qkv_split = [torch.split(src_s, qkv_size, dim=outer_dim) for src_s in src_split] weight_split = [ - torch.cat([qkv_s[i] for qkv_s in qkv_split], - axis=outer_dim) for i in range(len(qkv_split[0])) + torch.cat([qkv_s[i] for qkv_s in qkv_split], axis=outer_dim) for i in range(len(qkv_split[0])) ] - dst = dst.reshape(-1).data.copy_( - weight_split[self.gpu_index].contiguous().reshape(-1)).reshape( - weight_split[self.gpu_index].shape) + dst = dst.reshape(-1).data.copy_(weight_split[self.gpu_index].contiguous().reshape(-1)).reshape( + weight_split[self.gpu_index].shape) else: - dst.data.copy_(src_split[self.gpu_index].to( - get_accelerator().current_device_name()).contiguous()) + dst.data.copy_(src_split[self.gpu_index].to(get_accelerator().current_device_name()).contiguous()) else: if src_shape[0] == dst_shape[0]: return torch.nn.parameter.Parameter(src) if self.out_dim == 1: qkv_size = dst_shape[0] // 3 qkv_split = [torch.split(src_s, qkv_size, dim=0) for src_s in src_split] - bias_split = [ - torch.cat([qkv_s[i] for qkv_s in qkv_split], - axis=0) for i in range(len(qkv_split[0])) - ] + bias_split = [torch.cat([qkv_s[i] for qkv_s in qkv_split], axis=0) for i in range(len(qkv_split[0]))] dst.data.copy_(bias_split[self.gpu_index].contiguous()) else: dst.data.copy_(src_split[self.gpu_index].contiguous()) @@ -103,30 +94,22 @@ class ReplaceWithTensorSlicing: dst_shape = dst.shape if (len(src_shape) == 2 and len(dst_shape) == 2): - if src_shape[inner_dim] == dst_shape[ - self.in_dim] and src_shape[outer_dim] == dst_shape[self.out_dim]: + if 
src_shape[inner_dim] == dst_shape[self.in_dim] and src_shape[outer_dim] == dst_shape[self.out_dim]: dst = dst.reshape(-1).data.copy_(src.data.reshape(-1)).reshape(src.shape) else: if src_shape[inner_dim] != dst_shape[self.in_dim]: self.merge_assert(src_shape[inner_dim], dst_shape[self.in_dim]) - weight_split = torch.split( - src, - dst_shape[self.in_dim], - dim=inner_dim)[self.gpu_index].contiguous() + weight_split = torch.split(src, dst_shape[self.in_dim], dim=inner_dim)[self.gpu_index].contiguous() else: self.merge_assert(src_shape[outer_dim], dst_shape[self.out_dim]) - weight_split = torch.split( - src.data, - dst_shape[self.out_dim], - dim=outer_dim)[self.gpu_index].contiguous() - dst = dst.reshape(-1).data.copy_(weight_split.reshape(-1)).reshape( - weight_split.shape) + weight_split = torch.split(src.data, dst_shape[self.out_dim], + dim=outer_dim)[self.gpu_index].contiguous() + dst = dst.reshape(-1).data.copy_(weight_split.reshape(-1)).reshape(weight_split.shape) else: if src_shape[0] == dst_shape[0]: dst.data.copy_(src) else: - bias_split = torch.split(src.data, - dst_shape[-1])[self.gpu_index].contiguous() + bias_split = torch.split(src.data, dst_shape[-1])[self.gpu_index].contiguous() dst.data.copy_(bias_split) dst = torch.nn.parameter.Parameter(dst, requires_grad=False) if hasattr(src, 'scale'): @@ -150,6 +133,7 @@ def get_transformer_name(replaced_module): class GroupQuantizer: + def __init__(self, q_int8=True, group_size=1, num_bits=8, num_groups=0): self.group_size = group_size self.num_bits = num_bits @@ -163,8 +147,7 @@ class GroupQuantizer: inputs.scale = torch.empty(1) return inputs q_range = 2**self.num_bits - num_groups = self.num_groups if self.num_groups > 0 else inputs.shape[ - 0] // self.group_size + num_groups = self.num_groups if self.num_groups > 0 else inputs.shape[0] // self.group_size inputs = inputs.to(get_accelerator().current_device_name()) input_flat = inputs.reshape(num_groups, -1).contiguous() input_min = torch.min(input_flat, dim=1, keepdim=True)[0].float() @@ -174,31 +157,14 @@ class GroupQuantizer: inputs_q = input_flat.reshape(inputs.shape).to(torch.int8).contiguous() out = torch.nn.Parameter(inputs_q, requires_grad=False) inputs_split = inputs.split(inputs.shape[parallel_dim] // 2, dim=parallel_dim) - input_flat = [ - inputs_split[i].reshape(num_groups, - -1).contiguous() for i in range(2) - ] - input_min = [ - torch.min(input_flat[i], - dim=1, - keepdim=True)[0].float() for i in range(2) - ] - input_max = [ - torch.max(input_flat[i], - dim=1, - keepdim=True)[0].float() for i in range(2) - ] - scale1 = [ - (torch.max(input_min[i].abs(), - input_max[i].abs()) * 2.0 / (q_range)).squeeze().unsqueeze(0) - for i in range(2) - ] + input_flat = [inputs_split[i].reshape(num_groups, -1).contiguous() for i in range(2)] + input_min = [torch.min(input_flat[i], dim=1, keepdim=True)[0].float() for i in range(2)] + input_max = [torch.max(input_flat[i], dim=1, keepdim=True)[0].float() for i in range(2)] + scale1 = [(torch.max(input_min[i].abs(), input_max[i].abs()) * 2.0 / (q_range)).squeeze().unsqueeze(0) + for i in range(2)] - out.scale = torch.cat([scale.squeeze().unsqueeze(0), - scale1[0], - scale1[1]], - dim=0).reshape(num_groups, - -1).contiguous() + out.scale = torch.cat([scale.squeeze().unsqueeze(0), scale1[0], scale1[1]], dim=0).reshape(num_groups, + -1).contiguous() return out @@ -211,6 +177,7 @@ def _module_match(module): def generic_injection(module, fp16=False, enable_cuda_graph=True): + def replace_attn(child, policy): policy_attn = 
policy.attention(child) if policy_attn is None: @@ -246,8 +213,7 @@ def generic_injection(module, fp16=False, enable_cuda_graph=True): attn_module.attn_qkvb = None attn_module.attn_ow.data = transpose(attn_ow.data) - attn_module.attn_ob.data.copy_( - attn_ob.data.to(get_accelerator().current_device_name())) + attn_module.attn_ob.data.copy_(attn_ob.data.to(get_accelerator().current_device_name())) return attn_module def replace_attn_block(child, policy): @@ -278,8 +244,7 @@ def generic_injection(module, fp16=False, enable_cuda_graph=True): # triangular_masking=True, # max_out_tokens=8192) from ..model_implementations.transformers.clip_encoder import DSClipEncoder - cg_encoder = DSClipEncoder(module.text_encoder, - enable_cuda_graph=enable_cuda_graph) + cg_encoder = DSClipEncoder(module.text_encoder, enable_cuda_graph=enable_cuda_graph) setattr(module, 'text_encoder', cg_encoder) for name in module.__dict__.keys(): sub_module = getattr(module, name) @@ -291,13 +256,11 @@ def generic_injection(module, fp16=False, enable_cuda_graph=True): for name, child in module.named_children(): _replace_module(child, policy) if child.__class__ in new_policies: - replaced_module = new_policies[child.__class__](child, - policy) + replaced_module = new_policies[child.__class__](child, policy) setattr(module, name, replaced_module) _replace_module(sub_module, policy) - new_module = policy.apply(sub_module, - enable_cuda_graph=enable_cuda_graph) + new_module = policy.apply(sub_module, enable_cuda_graph=enable_cuda_graph) print(f"**** found and replaced {name} w. {type(new_module)}") setattr(module, name, new_module) @@ -305,11 +268,7 @@ def generic_injection(module, fp16=False, enable_cuda_graph=True): container_g = None -def replace_transformer_layer(orig_layer_impl, - model, - checkpoint_dict, - config, - model_config): +def replace_transformer_layer(orig_layer_impl, model, checkpoint_dict, config, model_config): """ Replace bert-style transformer layers with DeepSpeed's transformer layer Arguments: orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look for, @@ -334,15 +293,10 @@ def replace_transformer_layer(orig_layer_impl, seed = -1 local_rank = -1 - mp_replace = ReplaceWithTensorSlicing( - mp_group=config.tensor_parallel.tp_group, - mp_size=config.tensor_parallel.tp_size) #, out_dim=0, in_dim=1) + mp_replace = ReplaceWithTensorSlicing(mp_group=config.tensor_parallel.tp_group, + mp_size=config.tensor_parallel.tp_size) #, out_dim=0, in_dim=1) - def replace_with_policy(child, - policy_cls, - triangular_masking, - inference=False, - layer_id=0): + def replace_with_policy(child, policy_cls, triangular_masking, inference=False, layer_id=0): policy = policy_cls(child, inference=inference) if not policy.cuda_graph_supported: # policy says cuda graph is not supported raise an error if set @@ -364,8 +318,7 @@ def replace_transformer_layer(orig_layer_impl, _container.set_moe(moe) # 2. Set the tensor parallelism config - _container.set_tensor_parallel_config(config.tensor_parallel.tp_size, - config.tensor_parallel.tp_group) + _container.set_tensor_parallel_config(config.tensor_parallel.tp_size, config.tensor_parallel.tp_group) # 3. 
Initialize tensors _container.initialize_tensors() @@ -411,25 +364,21 @@ def replace_transformer_layer(orig_layer_impl, if name in all_reduce_linears: new_weight = torch.empty(( weight_shape[1] if conv_linear_layer else weight_shape[0], - (weight_shape[0] if conv_linear_layer else weight_shape[1]) // - mp_size, + (weight_shape[0] if conv_linear_layer else weight_shape[1]) // mp_size, ), device=child.weight.device, dtype=child.weight.dtype) if conv_linear_layer: child.weight.data = child.weight.data.transpose(-1, -2).contiguous() data = mp_replace.copy(new_weight, child.weight.data) - new_bias = torch.empty((weight_shape[0]), - device=child.weight.device, - dtype=child.weight.dtype) + new_bias = torch.empty((weight_shape[0]), device=child.weight.device, dtype=child.weight.dtype) if child.bias is not None: new_bias.data.copy_(child.bias.data) return LinearAllreduce(data, child.bias if child.bias is None else \ torch.nn.parameter.Parameter(new_bias.to(get_accelerator().current_device_name())), mp_group) else: new_weight = torch.empty(( - (weight_shape[1] if conv_linear_layer else weight_shape[0]) // - mp_size, + (weight_shape[1] if conv_linear_layer else weight_shape[0]) // mp_size, weight_shape[0] // mp_size if conv_linear_layer else weight_shape[1], ), device=child.weight.device, @@ -441,51 +390,54 @@ def replace_transformer_layer(orig_layer_impl, new_bias = torch.empty((weight_shape[0] // mp_size), device=child.weight.device, dtype=child.weight.dtype) - bias_data = None if child.bias is None else mp_replace.copy( - new_bias, - child.bias.data).to(get_accelerator().current_device_name()) - return LinearLayer(weight=data.to( - get_accelerator().current_device_name()), - bias=bias_data) + bias_data = None if child.bias is None else mp_replace.copy(new_bias, child.bias.data).to( + get_accelerator().current_device_name()) + return LinearLayer(weight=data.to(get_accelerator().current_device_name()), bias=bias_data) def _slice_embedding(child, name, conv_linear_layer): mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group) - new_weight = torch.empty((child.weight.shape[0], - child.weight.shape[1] // mp_size), + new_weight = torch.empty((child.weight.shape[0], child.weight.shape[1] // mp_size), device=child.weight.device, dtype=child.weight.dtype) data = mp_replace.copy(new_weight, child.weight.ds_tensor.data if hasattr(child.weight, 'ds_tensor') else \ child.weight.data) - new_embedding = nn.Embedding(child.weight.shape[0], - child.weight.shape[1] // mp_size) + new_embedding = nn.Embedding(child.weight.shape[0], child.weight.shape[1] // mp_size) new_embedding.weight.data.copy_(data) return new_embedding def update_mp_params(child): if hasattr(child, 'n_heads'): - assert child.n_heads%mp_size == 0, "n_heads ({}) must be divisible by mp_size ({})".format(child.n_heads, mp_size) + assert child.n_heads % mp_size == 0, "n_heads ({}) must be divisible by mp_size ({})".format( + child.n_heads, mp_size) child.n_heads = child.n_heads // mp_size if hasattr(child, 'inner_dim'): - assert child.inner_dim%mp_size == 0, "inner_dim ({}) must be divisible by mp_size ({})".format(child.inner_dim, mp_size) + assert child.inner_dim % mp_size == 0, "inner_dim ({}) must be divisible by mp_size ({})".format( + child.inner_dim, mp_size) child.inner_dim = child.inner_dim // mp_size if hasattr(child, 'num_heads'): - assert child.num_heads%mp_size == 0, "num_heads ({}) must be divisible by mp_size ({})".format(child.num_heads, mp_size) + assert child.num_heads % mp_size == 0, "num_heads ({}) must be divisible by mp_size 
({})".format( + child.num_heads, mp_size) child.num_heads = child.num_heads // mp_size if hasattr(child, 'num_attention_heads'): - assert child.num_attention_heads%mp_size == 0, "num_attention_heads ({}) must be divisible by mp_size ({})".format(child.num_attention_heads, mp_size) + assert child.num_attention_heads % mp_size == 0, "num_attention_heads ({}) must be divisible by mp_size ({})".format( + child.num_attention_heads, mp_size) child.num_attention_heads = child.num_attention_heads // mp_size if hasattr(child, 'num_attn_heads'): - assert child.num_attn_heads%mp_size == 0, "num_attn_heads ({}) must be divisible by mp_size ({})".format(child.num_attn_heads, mp_size) + assert child.num_attn_heads % mp_size == 0, "num_attn_heads ({}) must be divisible by mp_size ({})".format( + child.num_attn_heads, mp_size) child.num_attn_heads = child.num_attn_heads // mp_size if hasattr(child, 'all_head_size'): - assert child.all_head_size%mp_size == 0, "all_head_size ({}) must be divisible by mp_size ({})".format(child.all_head_size, mp_size) + assert child.all_head_size % mp_size == 0, "all_head_size ({}) must be divisible by mp_size ({})".format( + child.all_head_size, mp_size) child.all_head_size = child.all_head_size // mp_size if hasattr(child, 'embed_dim'): - assert child.embed_dim%mp_size == 0, "embed_dim must ({}) be divisible by mp_size ({})".format(child.embed_dim, mp_size) + assert child.embed_dim % mp_size == 0, "embed_dim must ({}) be divisible by mp_size ({})".format( + child.embed_dim, mp_size) child.embed_dim = child.embed_dim // mp_size if hasattr(child, 'hidden_size'): - assert child.hidden_size%mp_size == 0, "hidden_size ({}) must be divisible by mp_size ({})".format(child.hidden_size, mp_size) + assert child.hidden_size % mp_size == 0, "hidden_size ({}) must be divisible by mp_size ({})".format( + child.hidden_size, mp_size) child.hidden_size = child.hidden_size // mp_size conv_linear_layer = False @@ -507,12 +459,8 @@ def replace_transformer_layer(orig_layer_impl, def _replace_module(r_module, prev_name=''): for name, child in r_module.named_children(): if child.__class__ in linear_policies: - setattr( - r_module, - name, - linear_policies[child.__class__](child, - prev_name + '.' + name, - conv_linear_layer)) + setattr(r_module, name, linear_policies[child.__class__](child, prev_name + '.' 
+ name, + conv_linear_layer)) else: update_mp_params(child) _replace_module(child, name) @@ -559,15 +507,10 @@ def replace_transformer_layer(orig_layer_impl, base_dir1 = checkpoint_dict.get('base_dir', config.base_dir) if ckpt_type == 'pp' and type(checkpoint) is list: - pbar = tqdm.tqdm(total=len(checkpoint), - desc=f"Loading {len(checkpoint)} checkpoint shards") + pbar = tqdm.tqdm(total=len(checkpoint), desc=f"Loading {len(checkpoint)} checkpoint shards") for i in range(len(checkpoint)): - sd = [ - torch.load(os.path.join(base_dir1, - checkpoint[i]), - map_location='cpu') - ] + sd = [torch.load(os.path.join(base_dir1, checkpoint[i]), map_location='cpu')] load_model_with_checkpoint(replaced_module, sd, mp_replace, @@ -582,22 +525,15 @@ def replace_transformer_layer(orig_layer_impl, tp_split_size = (world_size / ckpt_mp_size) sd_offset = int(rank / tp_split_size) sd_count = int((rank + max(1, tp_split_size)) / tp_split_size) - sd_offset - pbar = tqdm.tqdm(total=num_checkpoints, - desc=f"Loading {num_checkpoints} checkpoint shards") + pbar = tqdm.tqdm(total=num_checkpoints, desc=f"Loading {num_checkpoints} checkpoint shards") for i in range(num_checkpoints): pbar.update(1) ckpt_index = i * ckpt_mp_size + sd_offset ckpt_files = [ - os.path.join(base_dir1, - ckpt_list[ckpt_index + - j]) if base_dir1 else ckpt_list[ckpt_index + - j] + os.path.join(base_dir1, ckpt_list[ckpt_index + j]) if base_dir1 else ckpt_list[ckpt_index + j] for j in range(sd_count) ] - sds = [ - torch.load(ckpt_file, - map_location='cpu') for ckpt_file in ckpt_files - ] + sds = [torch.load(ckpt_file, map_location='cpu') for ckpt_file in ckpt_files] load_model_with_checkpoint(replaced_module, sds, mp_replace, @@ -610,15 +546,13 @@ def replace_transformer_layer(orig_layer_impl, gc.collect() if "non_tp" in checkpoint: - pbar = tqdm.tqdm( - total=len(checkpoint["non_tp"]), - desc=f"Loading {len(checkpoint['non_tp'])} checkpoint shards") + pbar = tqdm.tqdm(total=len(checkpoint["non_tp"]), + desc=f"Loading {len(checkpoint['non_tp'])} checkpoint shards") for i in range(len(checkpoint["non_tp"])): pbar.update(1) ckpt_file = os.path.join(base_dir1, - checkpoint["non_tp"][i] - ) if base_dir1 else checkpoint["non_tp"][i] + checkpoint["non_tp"][i]) if base_dir1 else checkpoint["non_tp"][i] sds = [torch.load(ckpt_file, map_location='cpu')] load_model_with_checkpoint(replaced_module, sds, @@ -657,37 +591,22 @@ def replace_transformer_layer(orig_layer_impl, if not dist.is_initialized() or dist.get_rank() == 0: print("Saving tp-sharded checkpoints") torch.save( - OrderedDict({ - k: v - for k, - v in dict(replaced_module.state_dict()).items() - if transformer_name not in k - }), - f'{config.save_mp_checkpoint_path}/{non_tp_ckpt_name}') + OrderedDict({k: v + for k, v in dict(replaced_module.state_dict()).items() + if transformer_name not in k}), f'{config.save_mp_checkpoint_path}/{non_tp_ckpt_name}') ckpt_config = json.dumps({ - 'type': - ckpt_name, - 'base_dir': - f'{config.save_mp_checkpoint_path}', + 'type': ckpt_name, + 'base_dir': f'{config.save_mp_checkpoint_path}', 'checkpoints': { - "non_tp": - ckpt_files, - "tp": [ - f'tp_{r:0>2d}_{m:0>2d}.pt' for m in range(num_partitions) - for r in range(world_size) - ] + "non_tp": ckpt_files, + "tp": [f'tp_{r:0>2d}_{m:0>2d}.pt' for m in range(num_partitions) for r in range(world_size)] }, - 'version': - 1.0, - 'parallelization': - 'tp', - 'tp_size': - world_size, - 'dtype': - 'int8' if quantize else ('float16' if fp16 else 'float32') + 'version': 1.0, + 'parallelization': 'tp', + 'tp_size': 
world_size, + 'dtype': 'int8' if quantize else ('float16' if fp16 else 'float32') }) - with open(f"{config.save_mp_checkpoint_path}/ds_inference_config.json", - "w") as cfg: + with open(f"{config.save_mp_checkpoint_path}/ds_inference_config.json", "w") as cfg: cfg.write(ckpt_config) rep_sd = replaced_module.state_dict() @@ -699,13 +618,9 @@ def replace_transformer_layer(orig_layer_impl, for m in range(num_partitions): torch.save( OrderedDict({ - k: [rep_sd[k], - rep_sd[k].scale] if hasattr(rep_sd[k], - 'scale') else rep_sd[k] - for k in keys[m * partition_size:(m + 1) * partition_size] - if transformer_name in k - }), - f'{config.save_mp_checkpoint_path}/tp_{rank:0>2d}_{m:0>2d}.pt') + k: [rep_sd[k], rep_sd[k].scale] if hasattr(rep_sd[k], 'scale') else rep_sd[k] + for k in keys[m * partition_size:(m + 1) * partition_size] if transformer_name in k + }), f'{config.save_mp_checkpoint_path}/tp_{rank:0>2d}_{m:0>2d}.pt') return replaced_module @@ -720,6 +635,7 @@ def revert_transformer_layer(orig_layer_impl, model, config, preln=False): Returns: Updated nn.module with original bert-style transformer layers """ + def replace_fn(child, _replace_policy, layer_id): #from turing.nvidia_modelingpreln import BertLayer orig_module = orig_layer_impl(config) @@ -821,9 +737,7 @@ def _replace_module(model, policies, layer_id=0): """ for name, child in model.named_children(): if child.__class__ in policies: - replaced_module = policies[child.__class__][0](child, - policies[child.__class__][-1], - layer_id) + replaced_module = policies[child.__class__][0](child, policies[child.__class__][-1], layer_id) setattr(model, name, replaced_module) if isinstance(model, PipelineModule): assert hasattr(model, 'forward_funcs'),\ diff --git a/deepspeed/module_inject/replace_policy.py b/deepspeed/module_inject/replace_policy.py index 65dadcc18..2a5c98aa2 100755 --- a/deepspeed/module_inject/replace_policy.py +++ b/deepspeed/module_inject/replace_policy.py @@ -16,16 +16,8 @@ from .containers import VAEPolicy # transformer-based policies replace_policies = [ - HFBertLayerPolicy, - HFGPTNEOLayerPolicy, - GPTNEOXLayerPolicy, - HFGPTJLayerPolicy, - MegatronLayerPolicy, - HFGPT2LayerPolicy, - BLOOMLayerPolicy, - HFOPTLayerPolicy, - HFCLIPLayerPolicy, - HFDistilBertLayerPolicy + HFBertLayerPolicy, HFGPTNEOLayerPolicy, GPTNEOXLayerPolicy, HFGPTJLayerPolicy, MegatronLayerPolicy, + HFGPT2LayerPolicy, BLOOMLayerPolicy, HFOPTLayerPolicy, HFCLIPLayerPolicy, HFDistilBertLayerPolicy ] # non-transformer-based policies diff --git a/deepspeed/moe/experts.py b/deepspeed/moe/experts.py index 0fa440c28..3cf554452 100644 --- a/deepspeed/moe/experts.py +++ b/deepspeed/moe/experts.py @@ -7,11 +7,11 @@ import copy class Experts(torch.nn.Module): + def __init__(self, expert, num_local_experts=1, expert_group_name=None): super(Experts, self).__init__() - self.deepspeed_experts = torch.nn.ModuleList( - [copy.deepcopy(expert) for i in range(num_local_experts)]) + self.deepspeed_experts = torch.nn.ModuleList([copy.deepcopy(expert) for i in range(num_local_experts)]) self.num_local_experts = num_local_experts # TODO: revisit allreduce for moe.gate... diff --git a/deepspeed/moe/layer.py b/deepspeed/moe/layer.py index 6b4a07642..3932a3672 100644 --- a/deepspeed/moe/layer.py +++ b/deepspeed/moe/layer.py @@ -31,6 +31,7 @@ class MoE(torch.nn.Module): use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed). 
enable_expert_tensor_parallelism (bool, optional): default=False, whether to use tensor parallelism for experts """ + def __init__(self, hidden_size, expert, @@ -65,15 +66,8 @@ class MoE(torch.nn.Module): 'Unsupported noisy_gate_policy: ' + noisy_gate_policy experts = Experts(expert, self.num_local_experts, self.expert_group_name) - self.deepspeed_moe = MOELayer(TopKGate(hidden_size, - num_experts, - k, - capacity_factor, - eval_capacity_factor, - min_capacity, - noisy_gate_policy, - drop_tokens, - use_rts), + self.deepspeed_moe = MOELayer(TopKGate(hidden_size, num_experts, k, capacity_factor, eval_capacity_factor, + min_capacity, noisy_gate_policy, drop_tokens, use_rts), experts, self.expert_group_name, self.ep_size, @@ -90,20 +84,16 @@ class MoE(torch.nn.Module): def _create_process_groups(self): # Create process group for a layer if needed if self.expert_group_name not in groups._get_expert_parallel_group_dict(): - print( - f"No existing process group found, creating a new group named: {self.expert_group_name}" - ) + print(f"No existing process group found, creating a new group named: {self.expert_group_name}") if (groups.mpu is None) or (not self.enable_expert_tensor_parallelism): # Condition 1 - no groups.mpu means no tensor parallelism # Condition 2 - disabling expert tensor parallelism on purpose groups._create_expert_and_data_parallel(self.ep_size) else: # expert tensor parallelism is enabled - groups._create_expert_data_and_model_parallel(self.ep_size, - mpu=groups.mpu) + groups._create_expert_data_and_model_parallel(self.ep_size, mpu=groups.mpu) # Set the group handle for the MOELayer (deepspeed_moe) object - self.deepspeed_moe._set_ep_group( - groups._get_expert_parallel_group(self.expert_group_name)) + self.deepspeed_moe._set_ep_group(groups._get_expert_parallel_group(self.expert_group_name)) def forward(self, hidden_states, used_token=None): """ MoE forward diff --git a/deepspeed/moe/mappings.py b/deepspeed/moe/mappings.py index 38f1630a6..018cc6b22 100644 --- a/deepspeed/moe/mappings.py +++ b/deepspeed/moe/mappings.py @@ -32,14 +32,9 @@ def _gather_tokens(input_, dim=0): # Size and dimension. rank = mpu.get_tensor_model_parallel_rank() - tensor_list = [ - torch.empty_like(input_) - for _ in range(mpu.get_tensor_model_parallel_world_size()) - ] + tensor_list = [torch.empty_like(input_) for _ in range(mpu.get_tensor_model_parallel_world_size())] tensor_list[rank] = input_ - deepspeed.comm.all_gather(tensor_list, - input_, - group=mpu.get_tensor_model_parallel_group()) + deepspeed.comm.all_gather(tensor_list, input_, group=mpu.get_tensor_model_parallel_group()) # Note: torch.cat already creates a contiguous tensor. 
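# Illustrative single-process sketch of the _drop_tokens / _gather_tokens pair
# reformatted above: drop keeps only this rank's chunk along `dim`, gather
# concatenates the per-rank chunks back.  `world_size` and `rank` below are
# stand-ins for the tensor-parallel group queried from mpu in the real code.
import torch

world_size, rank, dim = 4, 1, 0
tokens = torch.arange(8.0).reshape(8, 1)

chunk = tokens.shape[dim] // world_size
dropped = torch.narrow(tokens, dim, rank * chunk, chunk)  # this rank's shard
gathered = torch.cat([torch.narrow(tokens, dim, r * chunk, chunk)
                      for r in range(world_size)], dim=dim)  # all-gather result
assert torch.equal(gathered, tokens)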
output = torch.cat(tensor_list, dim=dim).contiguous() @@ -53,7 +48,8 @@ def _drop_tokens(input_, dim=0): total_chunks = mpu.get_tensor_model_parallel_world_size() this_chunk = mpu.get_tensor_model_parallel_rank() - assert input_.shape[dim] % total_chunks == 0, f"input dimension {dim} ({input_.shape[dim]}) is not divisible by tensor parallel world size ({total_chunks})" + assert input_.shape[ + dim] % total_chunks == 0, f"input dimension {dim} ({input_.shape[dim]}) is not divisible by tensor parallel world size ({total_chunks})" chunk_size = input_.shape[dim] // total_chunks return torch.narrow(input_, dim, this_chunk * chunk_size, chunk_size) @@ -61,6 +57,7 @@ def _drop_tokens(input_, dim=0): class _GatherTokens(torch.autograd.Function): """All gather tokens among the tensor parallel ranks""" + @staticmethod def symbolic(graph, input_, dim): return _gather_tokens(input_, dim) diff --git a/deepspeed/moe/sharded_moe.py b/deepspeed/moe/sharded_moe.py index 211b2127b..c4f1554e4 100644 --- a/deepspeed/moe/sharded_moe.py +++ b/deepspeed/moe/sharded_moe.py @@ -60,11 +60,9 @@ def multiplicative_jitter(x, device: torch.device, epsilon=1e-2): return x uniform = uniform_map.get(device) if uniform is None: - uniform = torch.distributions.uniform.Uniform( - low=torch.tensor(1.0 - epsilon, - device=device), - high=torch.tensor(1.0 + epsilon, - device=device)).rsample # type: ignore + uniform = torch.distributions.uniform.Uniform(low=torch.tensor(1.0 - epsilon, device=device), + high=torch.tensor(1.0 + epsilon, + device=device)).rsample # type: ignore uniform_map[device] = uniform return x * uniform(x.shape) @@ -87,6 +85,7 @@ from deepspeed import comm as dist # Based on https://github.com/pytorch/pytorch/pull/40762 class _AllToAll(torch.autograd.Function): + @staticmethod def forward( ctx: Any, @@ -181,25 +180,18 @@ def top1gating(logits: Tensor, noisy_gate_policy: Optional[str] = None, drop_tokens: bool = True, use_rts: bool = True, - use_tutel: bool = False) -> Tuple[Tensor, - Tensor, - Tensor, - Tensor]: + use_tutel: bool = False) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Implements Top1Gating on logits.""" if noisy_gate_policy == 'RSample': logits_w_noise = logits + gumbel_rsample(logits.shape, device=logits.device) # everything is in fp32 in this function gates = F.softmax(logits, dim=1) - capacity = _capacity(gates, - torch.tensor(capacity_factor), - torch.tensor(min_capacity)) + capacity = _capacity(gates, torch.tensor(capacity_factor), torch.tensor(min_capacity)) # Create a mask for 1st's expert per token # noisy gating - indices1_s = torch.argmax( - logits_w_noise if noisy_gate_policy == 'RSample' else gates, - dim=1) + indices1_s = torch.argmax(logits_w_noise if noisy_gate_policy == 'RSample' else gates, dim=1) num_experts = int(gates.shape[1]) mask1 = F.one_hot(indices1_s, num_classes=num_experts) @@ -225,18 +217,16 @@ def top1gating(logits: Tensor, if use_rts: uniform = exp_selection_uniform_map.get(logits.device) if uniform is None: - uniform = torch.distributions.uniform.Uniform( - low=torch.tensor(0.0, - device=logits.device), - high=torch.tensor(1.0, - device=logits.device)).rsample + uniform = torch.distributions.uniform.Uniform(low=torch.tensor(0.0, device=logits.device), + high=torch.tensor(1.0, device=logits.device)).rsample exp_selection_uniform_map[logits.device] = uniform mask1_rand = mask1 * uniform(mask1.shape) else: mask1_rand = mask1 - assert logits.shape[0] >= min_capacity, "No. of tokens (batch-size) should be greater than min_capacity. 
Either set min_capacity to 0 or increase your batch size." + assert logits.shape[ + 0] >= min_capacity, "No. of tokens (batch-size) should be greater than min_capacity. Either set min_capacity to 0 or increase your batch size." top_idx = _top_idx(mask1_rand, capacity) @@ -258,7 +248,13 @@ def top1gating(logits: Tensor, if use_tutel: gates1_s = (gates * mask1).sum(dim=1) locations1_s = torch.sum(locations1 * mask1, dim=1) - return l_aux, capacity, num_experts, [indices1_s,], [locations1_s,], [gates1_s,], exp_counts + return l_aux, capacity, num_experts, [ + indices1_s, + ], [ + locations1_s, + ], [ + gates1_s, + ], exp_counts # Store the capacity location for each token locations1_s = torch.sum(locations1 * mask1, dim=1) @@ -275,19 +271,12 @@ def top1gating(logits: Tensor, return l_aux, combine_weights, dispatch_mask, exp_counts -def top2gating(logits: Tensor, - capacity_factor: float, - min_capacity: int) -> Tuple[Tensor, - Tensor, - Tensor, - Tensor]: +def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Implements Top2Gating on logits.""" # everything is in fp32 in this function gates = F.softmax(logits, dim=1) - capacity = _capacity(gates, - torch.tensor(capacity_factor * 2), - torch.tensor(min_capacity)) + capacity = _capacity(gates, torch.tensor(capacity_factor * 2), torch.tensor(min_capacity)) # Create a mask for 1st's expert per token indices1_s = torch.argmax(gates, dim=1) @@ -393,13 +382,10 @@ class TopKGate(Module): self.drop_tokens = drop_tokens self.use_rts = use_rts - def forward( - self, - input: torch.Tensor, - used_token: torch.Tensor = None, - use_tutel: bool = False) -> Tuple[Tensor, - Tensor, - Tensor]: # type: ignore + def forward(self, + input: torch.Tensor, + used_token: torch.Tensor = None, + use_tutel: bool = False) -> Tuple[Tensor, Tensor, Tensor]: # type: ignore if self.wall_clock_breakdown: self.timers('TopKGate').start() @@ -413,21 +399,13 @@ class TopKGate(Module): logits = self.wg(input_fp32) if self.k == 1: - gate_output = top1gating( - logits, - self.capacity_factor if self.training else self.eval_capacity_factor, - self.min_capacity, - used_token, - self.noisy_gate_policy if self.training else None, - self.drop_tokens, - self.use_rts, - use_tutel) + gate_output = top1gating(logits, self.capacity_factor if self.training else self.eval_capacity_factor, + self.min_capacity, used_token, self.noisy_gate_policy if self.training else None, + self.drop_tokens, self.use_rts, use_tutel) else: - gate_output = top2gating( - logits, - self.capacity_factor if self.training else self.eval_capacity_factor, - self.min_capacity) + gate_output = top2gating(logits, self.capacity_factor if self.training else self.eval_capacity_factor, + self.min_capacity) if self.wall_clock_breakdown: self.timers('TopKGate').stop() @@ -453,6 +431,7 @@ class MOELayer(Base): expert (torch.nn.Module): expert network """ + def __init__(self, gate: Module, experts: Module, @@ -481,9 +460,8 @@ class MOELayer(Base): logger.warning("Tutel optimization requested but not installed. " "Proceeding without Tutel.") elif use_tutel and TUTEL_INSTALLED and gate.k != 1: - logger.warning( - "To enable Tutel optimization, use top-1 instead of top-2 gate. " - "Proceeding without Tutel.") + logger.warning("To enable Tutel optimization, use top-1 instead of top-2 gate. 
" + "Proceeding without Tutel.") def _set_ep_group(self, ep_group): self.ep_group = ep_group @@ -506,18 +484,12 @@ class MOELayer(Base): S, M = reshaped_input.size(0), reshaped_input.size(1) if not hasattr(self, '_tutel_dispatcher'): - self._tutel_dispatcher = tutel_moe.fast_dispatcher( - E, - C, - M, - dispatch_dtype=reshaped_input.dtype) + self._tutel_dispatcher = tutel_moe.fast_dispatcher(E, C, M, dispatch_dtype=reshaped_input.dtype) self._tutel_dispatcher.update(indices_, locations_, gates_, capacity=C) dispatched_input = self._tutel_dispatcher.encode(reshaped_input) else: self.l_aux, combine_weights, dispatch_mask, self.exp_counts = self.gate(reshaped_input, input[1]) - dispatched_input = einsum("sec,sm->ecm", - dispatch_mask.type_as(input[0]), - reshaped_input) + dispatched_input = einsum("sec,sm->ecm", dispatch_mask.type_as(input[0]), reshaped_input) if self.wall_clock_breakdown: self.timers('falltoall').start() @@ -538,10 +510,7 @@ class MOELayer(Base): self.time_falltoall = self.timers('falltoall').elapsed(reset=False) # Re-shape after all-to-all: ecm -> gecm - dispatched_input = dispatched_input.reshape(self.ep_size, - self.num_local_experts, - -1, - d_model) + dispatched_input = dispatched_input.reshape(self.ep_size, self.num_local_experts, -1, d_model) expert_output = self.experts(dispatched_input) @@ -555,9 +524,7 @@ class MOELayer(Base): self.time_salltoall = self.timers('salltoall').elapsed(reset=False) # Re-shape back: gecm -> ecm - expert_output = expert_output.reshape(self.ep_size * self.num_local_experts, - -1, - d_model) + expert_output = expert_output.reshape(self.ep_size * self.num_local_experts, -1, d_model) if groups._get_expert_model_parallel_world_size() == 1: # the dropped duplicate tokens need to be gathered on each @@ -568,9 +535,7 @@ class MOELayer(Base): if self.use_tutel: combined_output = self._tutel_dispatcher.decode(expert_output.view(E * C, M)) else: - combined_output = einsum("sec,ecm->sm", - combine_weights.type_as(input[0]), - expert_output) + combined_output = einsum("sec,ecm->sm", combine_weights.type_as(input[0]), expert_output) a = combined_output.reshape(input[0].shape) diff --git a/deepspeed/moe/utils.py b/deepspeed/moe/utils.py index 1bf527951..0f57c9d9d 100644 --- a/deepspeed/moe/utils.py +++ b/deepspeed/moe/utils.py @@ -24,8 +24,7 @@ def is_moe_param(param: torch.Tensor) -> bool: def split_params_into_shared_and_expert_params( - params: List[torch.nn.Parameter]) -> Tuple[torch.nn.Parameter, - torch.nn.Parameter]: + params: List[torch.nn.Parameter]) -> Tuple[torch.nn.Parameter, torch.nn.Parameter]: shared_params, expert_params = [], [] for p in params: if is_moe_param(p): @@ -36,8 +35,7 @@ def split_params_into_shared_and_expert_params( def split_params_grads_into_shared_and_expert_params( - group: List[torch.nn.Parameter]) -> Tuple[torch.nn.Parameter, - torch.nn.Parameter]: + group: List[torch.nn.Parameter]) -> Tuple[torch.nn.Parameter, torch.nn.Parameter]: """Split grad of parameters into grads of non-expert params and grads of expert params. 
This is useful while computing grad-norms for clipping and overflow detection @@ -62,8 +60,7 @@ def split_params_grads_into_shared_and_expert_params( def split_params_into_different_moe_groups_for_optimizer(param_groups: Tuple[Dict], - max_group_size=178956971 - ) -> Tuple[Dict]: + max_group_size=178956971) -> Tuple[Dict]: """Split parameters into different MoE groups for optimizer Args: @@ -101,8 +98,7 @@ def split_params_into_different_moe_groups_for_optimizer(param_groups: Tuple[Dic if ori_key == 'params': group_moe[param_group['name']][key][ori_key] = [] else: - group_moe[ - param_group['name']][key][ori_key] = param_group[ori_key] + group_moe[param_group['name']][key][ori_key] = param_group[ori_key] # Assign param for param_group in param_groups: new_params = [] diff --git a/deepspeed/monitor/config.py b/deepspeed/monitor/config.py index 09ba7ef1a..1ce80f1bf 100644 --- a/deepspeed/monitor/config.py +++ b/deepspeed/monitor/config.py @@ -9,13 +9,7 @@ from deepspeed.runtime.config_utils import DeepSpeedConfigModel def get_monitor_config(param_dict): - monitor_dict = { - key: param_dict.get(key, - {}) - for key in ("tensorboard", - "wandb", - "csv_monitor") - } + monitor_dict = {key: param_dict.get(key, {}) for key in ("tensorboard", "wandb", "csv_monitor")} return DeepSpeedMonitorConfig(**monitor_dict) @@ -78,10 +72,10 @@ class DeepSpeedMonitorConfig(DeepSpeedConfigModel): csv_monitor: CSVConfig = {} """ Local CSV output of monitoring data. """ + @root_validator def check_enabled(cls, values): values["enabled"] = False - if (values.get("tensorboard").enabled or values.get("wandb").enabled - or values.get("csv_monitor").enabled): + if (values.get("tensorboard").enabled or values.get("wandb").enabled or values.get("csv_monitor").enabled): values["enabled"] = True return values diff --git a/deepspeed/monitor/csv_monitor.py b/deepspeed/monitor/csv_monitor.py index 7de4fbced..7548a210d 100644 --- a/deepspeed/monitor/csv_monitor.py +++ b/deepspeed/monitor/csv_monitor.py @@ -7,6 +7,7 @@ import deepspeed.comm as dist class csvMonitor(Monitor): + def __init__(self, csv_config): super().__init__(csv_config) self.filenames = [] diff --git a/deepspeed/monitor/monitor.py b/deepspeed/monitor/monitor.py index 504c3da39..a74bfe969 100644 --- a/deepspeed/monitor/monitor.py +++ b/deepspeed/monitor/monitor.py @@ -8,6 +8,7 @@ import deepspeed.comm as dist class Monitor(ABC): + @abstractmethod def __init__(self, monitor_config): self.monitor_config = monitor_config @@ -23,6 +24,7 @@ from .csv_monitor import csvMonitor class MonitorMaster(Monitor): + def __init__(self, monitor_config): super().__init__(monitor_config) self.tb_monitor = None diff --git a/deepspeed/monitor/tensorboard.py b/deepspeed/monitor/tensorboard.py index db3a50d75..1753503a6 100644 --- a/deepspeed/monitor/tensorboard.py +++ b/deepspeed/monitor/tensorboard.py @@ -8,6 +8,7 @@ import deepspeed.comm as dist class TensorBoardMonitor(Monitor): + def __init__(self, tensorboard_config): super().__init__(tensorboard_config) check_tb_availability() @@ -20,9 +21,7 @@ class TensorBoardMonitor(Monitor): if self.enabled and dist.get_rank() == 0: self.get_summary_writer() - def get_summary_writer(self, - base=os.path.join(os.path.expanduser("~"), - "tensorboard")): + def get_summary_writer(self, base=os.path.join(os.path.expanduser("~"), "tensorboard")): if self.enabled and dist.get_rank() == 0: from torch.utils.tensorboard import SummaryWriter if self.output_path is not None: diff --git a/deepspeed/monitor/wandb.py b/deepspeed/monitor/wandb.py index 
49fc03320..189fdd55a 100644 --- a/deepspeed/monitor/wandb.py +++ b/deepspeed/monitor/wandb.py @@ -7,6 +7,7 @@ import deepspeed.comm as dist class WandbMonitor(Monitor): + def __init__(self, wandb_config): super().__init__(wandb_config) check_wandb_availability() diff --git a/deepspeed/nebula/config.py b/deepspeed/nebula/config.py index 93b879fd7..83014c54d 100644 --- a/deepspeed/nebula/config.py +++ b/deepspeed/nebula/config.py @@ -9,6 +9,7 @@ from deepspeed.nebula.constants import * class DeepSpeedNebulaConfig(DeepSpeedConfigObject): + def __init__(self, param_dict): super(DeepSpeedNebulaConfig, self).__init__() @@ -26,29 +27,18 @@ class DeepSpeedNebulaConfig(DeepSpeedConfigObject): self._initialize(nebula_dict) def _initialize(self, nebula_dict): - self.enabled = get_scalar_param(nebula_dict, - NEBULA_ENABLED, - NEBULA_ENABLED_DEFAULT) + self.enabled = get_scalar_param(nebula_dict, NEBULA_ENABLED, NEBULA_ENABLED_DEFAULT) - self.load_path = get_scalar_param(nebula_dict, - NEBULA_LOAD_PATH, - NEBULA_LOAD_PATH_DEFAULT) + self.load_path = get_scalar_param(nebula_dict, NEBULA_LOAD_PATH, NEBULA_LOAD_PATH_DEFAULT) - self.enable_nebula_load = get_scalar_param(nebula_dict, - NEBULA_ENABLE_NEBULA_LOAD, + self.enable_nebula_load = get_scalar_param(nebula_dict, NEBULA_ENABLE_NEBULA_LOAD, NEBULA_ENABLE_NEBULA_LOAD_DEFAULT) - self.persistent_storage_path = get_scalar_param( - nebula_dict, - NEBULA_PERSISTENT_STORAGE_PATH, - NEBULA_PERSISTENT_STORAGE_PATH_DEFAULT) + self.persistent_storage_path = get_scalar_param(nebula_dict, NEBULA_PERSISTENT_STORAGE_PATH, + NEBULA_PERSISTENT_STORAGE_PATH_DEFAULT) - self.persistent_time_interval = get_scalar_param( - nebula_dict, - NEBULA_PERSISTENT_TIME_INTERVAL, - NEBULA_PERSISTENT_TIME_INTERVAL_DEFAULT) + self.persistent_time_interval = get_scalar_param(nebula_dict, NEBULA_PERSISTENT_TIME_INTERVAL, + NEBULA_PERSISTENT_TIME_INTERVAL_DEFAULT) - self.num_of_version_in_retention = get_scalar_param( - nebula_dict, - NEBULA_NUM_OF_VERSION_IN_RETENTION, - NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT) + self.num_of_version_in_retention = get_scalar_param(nebula_dict, NEBULA_NUM_OF_VERSION_IN_RETENTION, + NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT) diff --git a/deepspeed/nebula/constants.py b/deepspeed/nebula/constants.py index 6ad876a8d..83d8a8b22 100644 --- a/deepspeed/nebula/constants.py +++ b/deepspeed/nebula/constants.py @@ -63,24 +63,11 @@ NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT = 2 # Neubla envs NEBULA_EXPORT_ENVS = [ - 'DLTS_JOB_ID', - 'DLTS_NUM_WORKER', - 'NEBULA_PERSISTENT_STORAGE_PATH', - 'NEBULA_PERSISTENT_TIME_INTERVAL', - 'AML_RUN_ID', - 'AZUREML_RUN_TOKEN', - 'AZUREML_WORKSPACE_SCOPE', - 'AZUREML_EXPERIMENT_SCOPE', - 'AZUREML_RUN_HISTORY_SERVICE_ENDPOINT', - 'AZUREML_RUN_ID', - 'NEBULA_MEMORY_BUFFER_SIZE', - 'AZUREML_PARAMETER_ITPJOB_NAME', - 'FC_TASKROLE_NAME', - 'FC_TASK_INDEX', - 'MASTER_HOST', - 'LOCAL_HOST', - 'AZUREML_BLOB_ACCOUNT_NAME', - 'AZUREML_BLOB_ACCOUNT_KEY' + 'DLTS_JOB_ID', 'DLTS_NUM_WORKER', 'NEBULA_PERSISTENT_STORAGE_PATH', 'NEBULA_PERSISTENT_TIME_INTERVAL', + 'AML_RUN_ID', 'AZUREML_RUN_TOKEN', 'AZUREML_WORKSPACE_SCOPE', 'AZUREML_EXPERIMENT_SCOPE', + 'AZUREML_RUN_HISTORY_SERVICE_ENDPOINT', 'AZUREML_RUN_ID', 'NEBULA_MEMORY_BUFFER_SIZE', + 'AZUREML_PARAMETER_ITPJOB_NAME', 'FC_TASKROLE_NAME', 'FC_TASK_INDEX', 'MASTER_HOST', 'LOCAL_HOST', + 'AZUREML_BLOB_ACCOUNT_NAME', 'AZUREML_BLOB_ACCOUNT_KEY' ] # ITP env files diff --git a/deepspeed/ops/adagrad/cpu_adagrad.py b/deepspeed/ops/adagrad/cpu_adagrad.py index 07cdaa48c..2f1fd9ff2 100755 --- 
a/deepspeed/ops/adagrad/cpu_adagrad.py +++ b/deepspeed/ops/adagrad/cpu_adagrad.py @@ -10,13 +10,7 @@ from deepspeed.utils.logging import should_log_le class DeepSpeedCPUAdagrad(torch.optim.Optimizer): optimizer_id = 0 - def __init__(self, - model_params, - lr=1e-2, - eps=1e-10, - weight_decay=0, - amsgrad=False, - fp32_optimizer_states=True): + def __init__(self, model_params, lr=1e-2, eps=1e-10, weight_decay=0, amsgrad=False, fp32_optimizer_states=True): default_args = dict(lr=lr, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad) super(DeepSpeedCPUAdagrad, self).__init__(model_params, default_args) @@ -26,11 +20,7 @@ class DeepSpeedCPUAdagrad(torch.optim.Optimizer): self.fp32_optimizer_states = fp32_optimizer_states self.ds_opt_adagrad = CPUAdagradBuilder().load() - self.ds_opt_adagrad.create_adagrad(self.opt_id, - lr, - eps, - weight_decay, - should_log_le("info")) + self.ds_opt_adagrad.create_adagrad(self.opt_id, lr, eps, weight_decay, should_log_le("info")) def __del__(self): # need to destroy the C++ object explicitly to avoid a memory leak when deepspeed.initialize @@ -90,9 +80,7 @@ class DeepSpeedCPUAdagrad(torch.optim.Optimizer): #memory_format=torch.preserve_format) # gradient variances - state['exp_avg_sq'] = torch.zeros_like(p.data, - dtype=state_dtype, - device='cpu') + state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=state_dtype, device='cpu') #memory_format=torch.preserve_format) state['step'] += 1 @@ -100,39 +88,21 @@ class DeepSpeedCPUAdagrad(torch.optim.Optimizer): if p.grad.is_sparse == True: sparse_param = p.sparse_mask(p.grad) sparse_exp_avg_sq = state['exp_avg_sq'].sparse_mask(p.grad) - self.ds_opt_adagrad.adagrad_update(self.opt_id, - state['step'], - group['lr'], - group['eps'], - group['weight_decay'], - sparse_param.values(), - p.grad.values(), + self.ds_opt_adagrad.adagrad_update(self.opt_id, state['step'], group['lr'], group['eps'], + group['weight_decay'], sparse_param.values(), p.grad.values(), sparse_exp_avg_sq.values()) p[sparse_param.indices()] = sparse_param.values() - state['exp_avg_sq'][ - sparse_exp_avg_sq.indices()] = sparse_exp_avg_sq.values() + state['exp_avg_sq'][sparse_exp_avg_sq.indices()] = sparse_exp_avg_sq.values() if fp16_param_groups is not None: - fp16_param_groups[group_id][param_id][ - sparse_param.indices()] = sparse_param.values() + fp16_param_groups[group_id][param_id][sparse_param.indices()] = sparse_param.values() else: if fp16_param_groups is not None: - self.ds_opt_adagrad.adagrad_update_copy( - self.opt_id, - state['step'], - group['lr'], - group['eps'], - group['weight_decay'], - p.data, - p.grad.data, - state['exp_avg_sq'], - fp16_param_groups[group_id][param_id].data) + self.ds_opt_adagrad.adagrad_update_copy(self.opt_id, state['step'], group['lr'], group['eps'], + group['weight_decay'], p.data, p.grad.data, + state['exp_avg_sq'], + fp16_param_groups[group_id][param_id].data) else: - self.ds_opt_adagrad.adagrad_update(self.opt_id, - state['step'], - group['lr'], - group['eps'], - group['weight_decay'], - p.data, - p.grad.data, + self.ds_opt_adagrad.adagrad_update(self.opt_id, state['step'], group['lr'], group['eps'], + group['weight_decay'], p.data, p.grad.data, state['exp_avg_sq']) return loss diff --git a/deepspeed/ops/adam/cpu_adam.py b/deepspeed/ops/adam/cpu_adam.py index 04c5ac34c..c0ea1d304 100755 --- a/deepspeed/ops/adam/cpu_adam.py +++ b/deepspeed/ops/adam/cpu_adam.py @@ -16,8 +16,7 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer): model_params, lr=1e-3, bias_correction=True, - betas=(0.9, - 0.999), + betas=(0.9, 
0.999), eps=1e-8, weight_decay=0, amsgrad=False, @@ -76,14 +75,12 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer): super(DeepSpeedCPUAdam, self).__init__(model_params, default_args) cpu_info = get_cpu_info() - self.cpu_vendor = cpu_info["vendor_id_raw"].lower( - ) if "vendor_id_raw" in cpu_info else "unknown" + self.cpu_vendor = cpu_info["vendor_id_raw"].lower() if "vendor_id_raw" in cpu_info else "unknown" if "amd" in self.cpu_vendor: for group_id, group in enumerate(self.param_groups): for param_id, p in enumerate(group['params']): if p.dtype == torch.half: - logger.warning( - "FP16 params for CPUAdam may not work on AMD CPUs") + logger.warning("FP16 params for CPUAdam may not work on AMD CPUs") break else: continue @@ -95,13 +92,7 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer): self.fp32_optimizer_states = fp32_optimizer_states self.ds_opt_adam = CPUAdamBuilder().load() - self.ds_opt_adam.create_adam(self.opt_id, - lr, - betas[0], - betas[1], - eps, - weight_decay, - adamw_mode, + self.ds_opt_adam.create_adam(self.opt_id, lr, betas[0], betas[1], eps, weight_decay, adamw_mode, should_log_le("info")) def __del__(self): @@ -168,45 +159,22 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer): state_dtype = torch.float if self.fp32_optimizer_states else p.dtype # gradient momentums - state['exp_avg'] = torch.zeros_like(p.data, - dtype=state_dtype, - device=device) + state['exp_avg'] = torch.zeros_like(p.data, dtype=state_dtype, device=device) #memory_format=torch.preserve_format) # gradient variances - state['exp_avg_sq'] = torch.zeros_like(p.data, - dtype=state_dtype, - device=device) + state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=state_dtype, device=device) #memory_format=torch.preserve_format) state['step'] += 1 beta1, beta2 = group['betas'] if fp16_param_groups is not None: - self.ds_opt_adam.adam_update_copy( - self.opt_id, - state['step'], - group['lr'], - beta1, - beta2, - group['eps'], - group['weight_decay'], - group['bias_correction'], - p.data, - p.grad.data, - state['exp_avg'], - state['exp_avg_sq'], - fp16_param_groups[group_id][param_id].data) + self.ds_opt_adam.adam_update_copy(self.opt_id, state['step'], group['lr'], beta1, beta2, + group['eps'], group['weight_decay'], group['bias_correction'], + p.data, p.grad.data, state['exp_avg'], state['exp_avg_sq'], + fp16_param_groups[group_id][param_id].data) else: - self.ds_opt_adam.adam_update(self.opt_id, - state['step'], - group['lr'], - beta1, - beta2, - group['eps'], - group['weight_decay'], - group['bias_correction'], - p.data, - p.grad.data, - state['exp_avg'], - state['exp_avg_sq']) + self.ds_opt_adam.adam_update(self.opt_id, state['step'], group['lr'], beta1, beta2, group['eps'], + group['weight_decay'], group['bias_correction'], p.data, p.grad.data, + state['exp_avg'], state['exp_avg_sq']) return loss diff --git a/deepspeed/ops/adam/fused_adam.py b/deepspeed/ops/adam/fused_adam.py index 169fde67e..f93f73809 100644 --- a/deepspeed/ops/adam/fused_adam.py +++ b/deepspeed/ops/adam/fused_adam.py @@ -47,12 +47,12 @@ class FusedAdam(torch.optim.Optimizer): .. 
_On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ """ + def __init__(self, params, lr=1e-3, bias_correction=True, - betas=(0.9, - 0.999), + betas=(0.9, 0.999), eps=1e-8, adam_w_mode=True, weight_decay=0., @@ -61,11 +61,7 @@ class FusedAdam(torch.optim.Optimizer): if amsgrad: raise RuntimeError('FusedAdam does not support the AMSGrad variant.') - defaults = dict(lr=lr, - bias_correction=bias_correction, - betas=betas, - eps=eps, - weight_decay=weight_decay) + defaults = dict(lr=lr, bias_correction=bias_correction, betas=betas, eps=eps, weight_decay=weight_decay) super(FusedAdam, self).__init__(params, defaults) self.adam_w_mode = 1 if adam_w_mode else 0 self.set_grad_none = set_grad_none @@ -83,12 +79,7 @@ class FusedAdam(torch.optim.Optimizer): else: super(FusedAdam, self).zero_grad() - def step(self, - closure=None, - grads=None, - output_params=None, - scale=None, - grad_norms=None): + def step(self, closure=None, grads=None, output_params=None, scale=None, grad_norms=None): """Performs a single optimization step. Arguments: @@ -121,8 +112,7 @@ class FusedAdam(torch.optim.Optimizer): continue if p.grad.data.is_sparse: raise RuntimeError( - 'FusedAdam does not support sparse gradients, please consider SparseAdam instead' - ) + 'FusedAdam does not support sparse gradients, please consider SparseAdam instead') state = self.state[p] # State initialization @@ -151,35 +141,13 @@ class FusedAdam(torch.optim.Optimizer): if (len(g_16) > 0): state['step'] += 1 - multi_tensor_applier(self.multi_tensor_adam, - self._dummy_overflow_buf, - [g_16, - p_16, - m_16, - v_16], - group['lr'], - beta1, - beta2, - group['eps'], - state['step'], - self.adam_w_mode, - bias_correction, - group['weight_decay']) + multi_tensor_applier(self.multi_tensor_adam, self._dummy_overflow_buf, [g_16, p_16, m_16, v_16], + group['lr'], beta1, beta2, group['eps'], state['step'], self.adam_w_mode, + bias_correction, group['weight_decay']) if (len(g_32) > 0): state['step'] += 1 - multi_tensor_applier(self.multi_tensor_adam, - self._dummy_overflow_buf, - [g_32, - p_32, - m_32, - v_32], - group['lr'], - beta1, - beta2, - group['eps'], - state['step'], - self.adam_w_mode, - bias_correction, - group['weight_decay']) + multi_tensor_applier(self.multi_tensor_adam, self._dummy_overflow_buf, [g_32, p_32, m_32, v_32], + group['lr'], beta1, beta2, group['eps'], state['step'], self.adam_w_mode, + bias_correction, group['weight_decay']) return loss diff --git a/deepspeed/ops/adam/multi_tensor_apply.py b/deepspeed/ops/adam/multi_tensor_apply.py index e837309be..52b51e60d 100644 --- a/deepspeed/ops/adam/multi_tensor_apply.py +++ b/deepspeed/ops/adam/multi_tensor_apply.py @@ -7,6 +7,7 @@ This file is adapted from NVIDIA/apex, commit a109f85 class MultiTensorApply(object): + def __init__(self, chunk_size): self.chunk_size = chunk_size diff --git a/deepspeed/ops/lamb/fused_lamb.py b/deepspeed/ops/lamb/fused_lamb.py index 33a1461e9..2c59b8b7c 100644 --- a/deepspeed/ops/lamb/fused_lamb.py +++ b/deepspeed/ops/lamb/fused_lamb.py @@ -35,12 +35,12 @@ class FusedLamb(torch.optim.Optimizer): min_coeff(float, optional): minimum value of the lamb coefficient (default: 0.01) amsgrad (boolean, optional): NOT SUPPORTED in FusedLamb! 
""" + def __init__(self, params, lr=1e-3, bias_correction=True, - betas=(0.9, - 0.999), + betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt=False, weight_decay=0., @@ -64,12 +64,7 @@ class FusedLamb(torch.optim.Optimizer): self.eps_mode = 0 if eps_inside_sqrt else 1 self.lamb_coeffs = [] - def step(self, - closure=None, - grads=None, - output_params=None, - scale=1., - grad_norms=None): + def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norms=None): """Performs a single optimization step. Arguments: @@ -114,7 +109,8 @@ class FusedLamb(torch.optim.Optimizer): #remove the previous coeffs del self.lamb_coeffs[:] - for group, grads_this_group, output_params_this_group, grad_norm_group in zip(self.param_groups, grads_group, output_params_group, grad_norms): + for group, grads_this_group, output_params_this_group, grad_norm_group in zip( + self.param_groups, grads_group, output_params_group, grad_norms): if grads_this_group is None: grads_this_group = [None] * len(group['params']) if output_params_this_group is None: @@ -127,7 +123,8 @@ class FusedLamb(torch.optim.Optimizer): bias_correction = 1 if group['bias_correction'] else 0 - for p, grad, output_param, grad_norm in zip(group['params'], grads_this_group, output_params_this_group, grad_norm_group): + for p, grad, output_param, grad_norm in zip(group['params'], grads_this_group, output_params_this_group, + grad_norm_group): # compute combined scale factor for this group combined_scale = scale @@ -162,24 +159,10 @@ class FusedLamb(torch.optim.Optimizer): state['step'] += 1 - out_p = torch.tensor( - [], - dtype=torch.float) if output_param is None else output_param - lamb_coeff = self.fused_lamb_cuda.lamb(p.data, - out_p, - exp_avg, - exp_avg_sq, - grad, - group['lr'], - beta1, - beta2, - max_coeff, - min_coeff, - group['eps'], - combined_scale, - state['step'], - self.eps_mode, - bias_correction, + out_p = torch.tensor([], dtype=torch.float) if output_param is None else output_param + lamb_coeff = self.fused_lamb_cuda.lamb(p.data, out_p, exp_avg, exp_avg_sq, grad, group['lr'], beta1, + beta2, max_coeff, min_coeff, group['eps'], combined_scale, + state['step'], self.eps_mode, bias_correction, group['weight_decay']) self.lamb_coeffs.append(lamb_coeff) return loss diff --git a/deepspeed/ops/random_ltd/dropping_utils.py b/deepspeed/ops/random_ltd/dropping_utils.py index 102ffe13d..496fcf635 100644 --- a/deepspeed/ops/random_ltd/dropping_utils.py +++ b/deepspeed/ops/random_ltd/dropping_utils.py @@ -23,9 +23,7 @@ def gpt_sample_tokens(reserved_length: int, prob_dist = torch.ones((layers * batch_size, seq_length), device=device) sampled_indices = torch.multinomial(prob_dist, reserved_length) - sampled_indices = sampled_indices.reshape(layers, - batch_size, - reserved_length).to(torch.int32) + sampled_indices = sampled_indices.reshape(layers, batch_size, reserved_length).to(torch.int32) global random_ltd_module if random_ltd_module is None: random_ltd_module = RandomLTDBuilder().load() @@ -59,9 +57,7 @@ def bert_sample_tokens(reserved_length: int, prob_dist = torch.ones((layers * batch_size, seq_length), device=device) sampled_indices = torch.multinomial(prob_dist, reserved_length) - sampled_indices = sampled_indices.reshape(layers, - batch_size, - reserved_length).to(torch.int32) + sampled_indices = sampled_indices.reshape(layers, batch_size, reserved_length).to(torch.int32) global random_ltd_module if random_ltd_module is None: random_ltd_module = RandomLTDBuilder().load() @@ -82,11 +78,9 @@ def 
bert_sample_tokens(reserved_length: int, class GatherTokens(torch.autograd.Function): + @staticmethod - def forward(ctx, - activations: torch.Tensor, - sorted_indices: torch.Tensor, - batch_first: bool): + def forward(ctx, activations: torch.Tensor, sorted_indices: torch.Tensor, batch_first: bool): global random_ltd_module if random_ltd_module is None: random_ltd_module = RandomLTDBuilder().load() @@ -104,25 +98,18 @@ class GatherTokens(torch.autograd.Function): activations, sorted_indices = ctx.saved_tensors batch_first = ctx.batch_first - return random_ltd_module.token_scatter_(a_gradients, - g_gradients, - sorted_indices, - batch_first), None, None + return random_ltd_module.token_scatter_(a_gradients, g_gradients, sorted_indices, batch_first), None, None class ScatterTokens(torch.autograd.Function): + @staticmethod - def forward(ctx, - all_activations: torch.Tensor, - layer_activations: torch.Tensor, - sorted_indices: torch.Tensor, + def forward(ctx, all_activations: torch.Tensor, layer_activations: torch.Tensor, sorted_indices: torch.Tensor, batch_first: bool): global random_ltd_module if random_ltd_module is None: random_ltd_module = RandomLTDBuilder().load() - scatter_results = random_ltd_module.token_scatter_(all_activations.clone(), - layer_activations, - sorted_indices, + scatter_results = random_ltd_module.token_scatter_(all_activations.clone(), layer_activations, sorted_indices, batch_first) ctx.save_for_backward(sorted_indices) @@ -139,7 +126,5 @@ class ScatterTokens(torch.autograd.Function): sorted_indices, = ctx.saved_tensors batch_first = ctx.batch_first - ret_val = random_ltd_module.token_gather(out_gradients, - sorted_indices, - batch_first) + ret_val = random_ltd_module.token_gather(out_gradients, sorted_indices, batch_first) return out_gradients, ret_val, None, None diff --git a/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py b/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py index 6c134d71f..a3a615c73 100755 --- a/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py +++ b/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py @@ -13,6 +13,7 @@ class BertSparseSelfAttention(nn.Module): For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial. 
""" + def __init__( self, config, @@ -29,10 +30,8 @@ class BertSparseSelfAttention(nn.Module): super(BertSparseSelfAttention, self).__init__() if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, - config.num_attention_heads)) + raise ValueError("The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size @@ -44,8 +43,7 @@ class BertSparseSelfAttention(nn.Module): self.sparse_self_attention = SparseSelfAttention(sparsity_config) def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) diff --git a/deepspeed/ops/sparse_attention/matmul.py b/deepspeed/ops/sparse_attention/matmul.py index 17b0898fd..24536c90d 100755 --- a/deepspeed/ops/sparse_attention/matmul.py +++ b/deepspeed/ops/sparse_attention/matmul.py @@ -12,29 +12,8 @@ from deepspeed.accelerator import get_accelerator @triton.jit -def _kernel(A, - B, - C, - stride_za, - stride_ha, - stride_ma, - stride_ka, - stride_zb, - stride_hb, - stride_kb, - stride_nb, - stride_zc, - stride_hc, - stride_mc, - stride_nc, - DS0, - DS1, - SDD_K, - SDD_off_width, - lut, - locks, - nlocks, - **meta): +def _kernel(A, B, C, stride_za, stride_ha, stride_ma, stride_ka, stride_zb, stride_hb, stride_kb, stride_nb, stride_zc, + stride_hc, stride_mc, stride_nc, DS0, DS1, SDD_K, SDD_off_width, lut, locks, nlocks, **meta): TM = meta['TM'] TN = meta['TN'] TK = meta['TK'] @@ -194,8 +173,7 @@ def _kernel(A, tl.store(pc, c, mask=checkc) # accumulate partial results using spin-locks else: - plock = locks + tl.program_id(2) * nlocks * tl.num_programs(1) + tl.program_id( - 1) * nlocks + lockid - 1 + plock = locks + tl.program_id(2) * nlocks * tl.num_programs(1) + tl.program_id(1) * nlocks + lockid - 1 pcount = plock + tl.num_programs(2) * tl.num_programs(1) * nlocks while tl.atomic_cas(plock, 0, 1) == 1: pass @@ -292,10 +270,7 @@ class _sparse_matmul(torch.autograd.Function): #segmented = _sparse_matmul.sdd_segment(layout.type(torch.int32), start_width) start_width = (128 if block > 16 else 32) // block layout = layout.type(torch.int32) - segmented = libtriton.superblock(layout.data_ptr(), - layout.shape[0], - layout.shape[1], - layout.shape[2], + segmented = libtriton.superblock(layout.data_ptr(), layout.shape[0], layout.shape[1], layout.shape[2], start_width) luts, widths, packs = [], [], [] for size, nnz in segmented: @@ -317,19 +292,7 @@ class _sparse_matmul(torch.autograd.Function): return luts, None, widths, packs @staticmethod - def _sdd_matmul(a, - b, - trans_a, - trans_b, - trans_c, - spdims, - block, - luts, - num_locks, - widths, - packs, - bench, - time): + def _sdd_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, luts, num_locks, widths, packs, bench, time): if trans_c: a, b = b, a trans_a, trans_b = not trans_b, not trans_a @@ -339,9 +302,8 @@ class _sparse_matmul(torch.autograd.Function): b_dim = -1 if trans_b else -2 a_inner, b_inner = a.shape[a_dim], b.shape[b_dim] if a_inner != b_inner: - raise ValueError( - f"Size of tensor A along the 
{a_dim} dim ({a_inner}) must match size " - f"of tensor B along the {b_dim} dim ({b_inner})") + raise ValueError(f"Size of tensor A along the {a_dim} dim ({a_inner}) must match size " + f"of tensor B along the {b_dim} dim ({b_inner})") if a_inner % 16 != 0: raise ValueError('Reduction size for SDD must be a multiple of 16') @@ -356,12 +318,7 @@ class _sparse_matmul(torch.autograd.Function): device = a.device # create kernel total_width = sum([width * pack * pack for width, pack in zip(widths, packs)]) - c = torch.empty((batch_size, - total_width, - block, - block), - dtype=dtype, - device=a.device) + c = torch.empty((batch_size, total_width, block, block), dtype=dtype, device=a.device) for lut, width, pack in zip(luts, widths, packs): F32TK = [8, 16] F16TK = [16] @@ -387,12 +344,7 @@ class _sparse_matmul(torch.autograd.Function): max_width = 49152 total = 0 if bench else None for off_width in range(0, width, max_width): - grid = lambda meta: [ - meta['TZ'], - min(max_width, - width - off_width), - batch_size - ] + grid = lambda meta: [meta['TZ'], min(max_width, width - off_width), batch_size] _kernel[grid](a, b, c, @@ -504,13 +456,7 @@ class _sparse_matmul(torch.autograd.Function): # create header width = column.size(0) offsets += 6 * width - header = torch.stack((offsets, - segments, - column, - depth, - lockid, - maxid), - dim=1).view(-1).contiguous() + header = torch.stack((offsets, segments, column, depth, lockid, maxid), dim=1).view(-1).contiguous() incs = torch.stack((xincs, wincs), dim=1).view(-1).contiguous() incs = torch.cat((incs, torch.zeros(2, device=incs.device, dtype=incs.dtype))) # create lut @@ -521,19 +467,7 @@ class _sparse_matmul(torch.autograd.Function): return lut, num_locks, width, None @staticmethod - def _dds_matmul(a, - b, - trans_a, - trans_b, - trans_c, - spdims, - block, - lut, - num_locks, - width, - packs, - bench, - time): + def _dds_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, num_locks, width, packs, bench, time): global triton if triton is None: triton = importlib.import_module('triton') @@ -548,16 +482,7 @@ class _sparse_matmul(torch.autograd.Function): BS2 = block * spdims[1 if trans_b else 2] dtype = a.dtype # kernel - meta = { - 'TN': block, - 'TM': 128, - 'TK': 16, - 'BLOCK': block, - 'TZ': 1, - 'SDD': False, - 'DSD': False, - 'DDS': True - } + meta = {'TN': block, 'TM': 128, 'TK': 16, 'BLOCK': block, 'TZ': 1, 'SDD': False, 'DSD': False, 'DDS': True} # output CS0 = AS0 CS1 = AS1 @@ -593,19 +518,7 @@ class _sparse_matmul(torch.autograd.Function): return c @staticmethod - def _dsd_matmul(a, - b, - trans_a, - trans_b, - trans_c, - spdims, - block, - lut, - num_locks, - width, - packs, - bench, - time): + def _dsd_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, num_locks, width, packs, bench, time): global triton if triton is None: triton = importlib.import_module('triton') @@ -621,16 +534,7 @@ class _sparse_matmul(torch.autograd.Function): dtype = a.dtype # kernel - meta = { - 'TM': block, - 'TN': 128, - 'TK': 16, - 'BLOCK': block, - 'TZ': 1, - 'SDD': False, - 'DSD': True, - 'DDS': False - } + meta = {'TM': block, 'TN': 128, 'TK': 16, 'BLOCK': block, 'TZ': 1, 'SDD': False, 'DSD': True, 'DDS': False} # output CS0 = BS0 CS1 = BS1 @@ -665,53 +569,14 @@ class _sparse_matmul(torch.autograd.Function): **meta) return c - fn = { - 'sdd': _sdd_matmul.__get__(object), - 'dsd': _dsd_matmul.__get__(object), - 'dds': _dds_matmul.__get__(object) - } + fn = {'sdd': _sdd_matmul.__get__(object), 'dsd': _dsd_matmul.__get__(object), 'dds': 
_dds_matmul.__get__(object)} @staticmethod - def forward(ctx, - a, - b, - trans_a, - trans_b, - trans_c, - mode, - spdims, - block, - c_lut, - c_num_locks, - c_width, - c_packs, - c_bench, - c_time, - da_lut, - da_num_locks, - da_width, - da_packs, - da_bench, - da_time, - db_lut, - db_num_locks, - db_width, - db_packs, - db_bench, - db_time): - c = _sparse_matmul.fn[mode](a, - b, - trans_a, - trans_b, - trans_c, - spdims, - block, - c_lut, - c_num_locks, - c_width, - c_packs, - c_bench, - c_time) + def forward(ctx, a, b, trans_a, trans_b, trans_c, mode, spdims, block, c_lut, c_num_locks, c_width, c_packs, + c_bench, c_time, da_lut, da_num_locks, da_width, da_packs, da_bench, da_time, db_lut, db_num_locks, + db_width, db_packs, db_bench, db_time): + c = _sparse_matmul.fn[mode](a, b, trans_a, trans_b, trans_c, spdims, block, c_lut, c_num_locks, c_width, + c_packs, c_bench, c_time) # save for backward ctx.save_for_backward(a, b) ctx.da_num_locks = da_num_locks @@ -741,34 +606,14 @@ class _sparse_matmul(torch.autograd.Function): # gradients w.r.t. a if ctx.needs_input_grad[0]: mode_da = mode[1] + mode[0] + mode[2] - da = _sparse_matmul.fn[mode_da](dc, - b, - False, - not ctx.trans_b, - ctx.trans_a, - ctx.spdims, - ctx.block, - ctx.da_lut, - ctx.da_num_locks, - ctx.da_width, - ctx.da_packs, - ctx.da_bench, + da = _sparse_matmul.fn[mode_da](dc, b, False, not ctx.trans_b, ctx.trans_a, ctx.spdims, ctx.block, + ctx.da_lut, ctx.da_num_locks, ctx.da_width, ctx.da_packs, ctx.da_bench, ctx.da_time) # gradients w.r.t. b if ctx.needs_input_grad[1]: mode_db = mode[2] + mode[1] + mode[0] - db = _sparse_matmul.fn[mode_db](a, - dc, - not ctx.trans_a, - False, - ctx.trans_b, - ctx.spdims, - ctx.block, - ctx.db_lut, - ctx.db_num_locks, - ctx.db_width, - ctx.db_packs, - ctx.db_bench, + db = _sparse_matmul.fn[mode_db](a, dc, not ctx.trans_a, False, ctx.trans_b, ctx.spdims, ctx.block, + ctx.db_lut, ctx.db_num_locks, ctx.db_width, ctx.db_packs, ctx.db_bench, ctx.db_time) return da, db, None, None, None,\ None, None, None, None,\ @@ -785,6 +630,7 @@ class MatMul: For more details about sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509 """ + def make_lut(self, dtype, device): """Generates the sparsity layout/s used in block-sparse matmul """ @@ -797,21 +643,25 @@ class MatMul: if self.mode == 'sdd': c_lut, c_num_locks, c_width, c_packs = _sparse_matmul.make_sdd_lut(layout, block, dtype, device) elif self.mode == 'dsd': - c_lut, c_num_locks, c_width, c_packs = _sparse_matmul.make_dxx_lut(layout, block, step, not self.trans_a, device) + c_lut, c_num_locks, c_width, c_packs = _sparse_matmul.make_dxx_lut(layout, block, step, not self.trans_a, + device) elif self.mode == 'dds': - c_lut, c_num_locks, c_width, c_packs = _sparse_matmul.make_dxx_lut(layout, block, step, self.trans_b, device) + c_lut, c_num_locks, c_width, c_packs = _sparse_matmul.make_dxx_lut(layout, block, step, self.trans_b, + device) # DA look-up table if self.mode == 'sdd': da_lut, da_num_locks, da_width, da_packs = _sparse_matmul.make_dxx_lut(layout, block, step, True, device) elif self.mode == 'dsd': da_lut, da_num_locks, da_width, da_packs = _sparse_matmul.make_sdd_lut(layout, block, dtype, device) elif self.mode == 'dds': - da_lut, da_num_locks, da_width, da_packs = _sparse_matmul.make_dxx_lut(layout, block, step, not self.trans_b, device) + da_lut, da_num_locks, da_width, da_packs = _sparse_matmul.make_dxx_lut(layout, block, step, + not self.trans_b, device) # DB look-up table if self.mode 
== 'sdd': db_lut, db_num_locks, db_width, db_packs = _sparse_matmul.make_dxx_lut(layout, block, step, False, device) elif self.mode == 'dsd': - db_lut, db_num_locks, db_width, db_packs = _sparse_matmul.make_dxx_lut(layout, block, step, self.trans_a, device) + db_lut, db_num_locks, db_width, db_packs = _sparse_matmul.make_dxx_lut(layout, block, step, self.trans_a, + device) elif self.mode == 'dds': db_lut, db_num_locks, db_width, db_packs = _sparse_matmul.make_sdd_lut(layout, block, dtype, device) self.lut_cache[key] = (c_lut, c_num_locks, c_width, c_packs,\ @@ -845,11 +695,10 @@ class MatMul: assert layout_dim in (2, 3), "Layout should be a 2 or 3 dimensional tensor of 0s and 1s" if not mode == 'sdd': # Dims to be reduced on the 'inside' of the matmul, either -1 or -2 - trans_dense, trans_sparse, sparse_inner = (trans_b, trans_a, -1) if mode == 'dsd' else (trans_a, trans_b, -2) - self.dense_inner_dim = -( - (sparse_inner % 2) + 1) if not trans_dense else sparse_inner - sparse_inner = sparse_inner if not trans_sparse else -( - (sparse_inner % 2) + 1) + trans_dense, trans_sparse, sparse_inner = (trans_b, trans_a, -1) if mode == 'dsd' else (trans_a, trans_b, + -2) + self.dense_inner_dim = -((sparse_inner % 2) + 1) if not trans_dense else sparse_inner + sparse_inner = sparse_inner if not trans_sparse else -((sparse_inner % 2) + 1) # Inner dim of the dense input should be equal to the inner dim of the sparse input self.dense_inner_size = layout.shape[sparse_inner] * block @@ -860,8 +709,7 @@ class MatMul: if layout_dim == 2: layout = layout.unsqueeze(0) - layout = layout.long( - ) # Above code assumes the layout tensor is an integral type + layout = layout.long() # Above code assumes the layout tensor is an integral type self.spdims = layout.shape # timings @@ -909,31 +757,9 @@ class MatMul: b = MatMul._pad_shape(b, self.mode == 'dds') # execute - c = _sparse_matmul.apply(a, - b, - self.trans_a, - self.trans_b, - False, - self.mode, - self.spdims, - self.block, - c_lut, - c_num_locks, - c_width, - c_packs, - self.bench, - time_c, - da_lut, - da_num_locks, - da_width, - da_packs, - self.bench, - time_da, - db_lut, - db_num_locks, - db_width, - db_packs, - self.bench, + c = _sparse_matmul.apply(a, b, self.trans_a, self.trans_b, False, self.mode, self.spdims, self.block, c_lut, + c_num_locks, c_width, c_packs, self.bench, time_c, da_lut, da_num_locks, da_width, + da_packs, self.bench, time_da, db_lut, db_num_locks, db_width, db_packs, self.bench, time_db) # This removes any leading singleton dimensions we may have added to the tensor that weren't in the input @@ -948,9 +774,8 @@ class MatMul: def _validate_inputs(self, a, b): if a.device != b.device: - raise ValueError( - f"Inputs must be on the same device; got {a.device} for tensor A " - f"and {b.device} for tensor B") + raise ValueError(f"Inputs must be on the same device; got {a.device} for tensor A " + f"and {b.device} for tensor B") if not get_accelerator().on_accelerator(a): raise ValueError("Only GPU devices are supported for now") @@ -958,9 +783,7 @@ class MatMul: if torch.is_autocast_enabled(): a, b = a.half(), b.half() elif a.dtype != b.dtype: - raise ValueError( - f"Inputs must be the same dtype; got {a.dtype} for A and {b.dtype} for B" - ) + raise ValueError(f"Inputs must be the same dtype; got {a.dtype} for A and {b.dtype} for B") mode, trans_a, trans_b = self.mode, self.trans_a, self.trans_b if mode != 'sdd': @@ -968,14 +791,12 @@ class MatMul: dense, dense_name, sparse, sparse_name = (a, 'A', b, 'B') if mode == 'dds' else (b, 'B', 
a, 'A') dense_inner = dense.shape[self.dense_inner_dim] if dense_inner != self.dense_inner_size: - raise ValueError( - f"Expected tensor {dense_name} to have size {self.dense_inner_size} at dim " - f"{self.dense_inner_dim % dense.ndim}, got {dense_inner}.") + raise ValueError(f"Expected tensor {dense_name} to have size {self.dense_inner_size} at dim " + f"{self.dense_inner_dim % dense.ndim}, got {dense_inner}.") if sparse.shape[-len(self.sparse_shape):] != self.sparse_shape: - raise ValueError( - f"Expected tensor with trailing dimensions of shape {self.sparse_shape} for argument " - f"{sparse_name}, got {sparse.shape}") + raise ValueError(f"Expected tensor with trailing dimensions of shape {self.sparse_shape} for argument " + f"{sparse_name}, got {sparse.shape}") def add_extra_dims(x): # Add extra leading singleton dimensions if needed @@ -984,8 +805,7 @@ class MatMul: singletons = [1] * dims_needed x = x.view(*singletons, *x.shape) elif dims_needed < 0: - raise ValueError( - "Tensors with more than 4 dimensions are not currently supported") + raise ValueError("Tensors with more than 4 dimensions are not currently supported") return x diff --git a/deepspeed/ops/sparse_attention/softmax.py b/deepspeed/ops/sparse_attention/softmax.py index 09560e103..4b6376d84 100755 --- a/deepspeed/ops/sparse_attention/softmax.py +++ b/deepspeed/ops/sparse_attention/softmax.py @@ -28,29 +28,11 @@ def num_warps(n): return 16 -@triton.heuristics({ - 'num_warps': lambda *args, - **meta: num_warps(args[6] * meta['BLOCK']) -}) -@triton.heuristics({ - 'TN': lambda *args, - **meta: next_power_of_2(args[6] * meta['BLOCK']) -}) +@triton.heuristics({'num_warps': lambda *args, **meta: num_warps(args[6] * meta['BLOCK'])}) +@triton.heuristics({'TN': lambda *args, **meta: next_power_of_2(args[6] * meta['BLOCK'])}) @triton.jit -def _forward(X, - scale, - LUT, - RPE, - KP_M, - ATTN_M, - sizemax, - stride_zx, - stride_zrpe, - stride_hrpe, - stride_srpe, - stride_zkpm, - stride_zattnm, - **meta): +def _forward(X, scale, LUT, RPE, KP_M, ATTN_M, sizemax, stride_zx, stride_zrpe, stride_hrpe, stride_srpe, stride_zkpm, + stride_zattnm, **meta): TN = meta['TN'] BLOCK = meta['BLOCK'] pidhm = tl.program_id(0) @@ -102,14 +84,8 @@ def _forward(X, tl.store(px, x, mask=check) -@triton.heuristics({ - 'num_warps': lambda *args, - **meta: num_warps(args[4] * meta['BLOCK']) -}) -@triton.heuristics({ - 'TN': lambda *args, - **meta: next_power_of_2(args[4]) * meta['BLOCK'] -}) +@triton.heuristics({'num_warps': lambda *args, **meta: num_warps(args[4] * meta['BLOCK'])}) +@triton.heuristics({'TN': lambda *args, **meta: next_power_of_2(args[4]) * meta['BLOCK']}) @triton.jit def _backward(X, scale, DX, LUT, sizemax, stride_zx, stride_zdx, **meta): pidhm = tl.program_id(0) @@ -168,21 +144,8 @@ class _sparse_softmax(torch.autograd.Function): return lut, int(sizes.max()) @staticmethod - def forward(ctx, - x, - scale, - rpe, - key_padding_mask, - attn_mask, - kp_mask_mode, - attn_mask_mode, - spdims, - block, - lut, - num_blocks, - maxlut, - bench, - time): + def forward(ctx, x, scale, rpe, key_padding_mask, attn_mask, kp_mask_mode, attn_mask_mode, spdims, block, lut, + num_blocks, maxlut, bench, time): apply_scale = False if scale == 1.0 else True @@ -251,14 +214,7 @@ class _sparse_softmax(torch.autograd.Function): # run kernel M = x.shape[0] grid = lambda opt: [ctx.spdims[0] * ctx.spdims[1] * ctx.block, M] - _backward[grid](x, - ctx.scale, - dx, - lut, - ctx.maxlut, - x.stride(0), - dx.stride(0), - BLOCK=ctx.block) + _backward[grid](x, ctx.scale, 
dx, lut, ctx.maxlut, x.stride(0), dx.stride(0), BLOCK=ctx.block) return dx, None, None, None, None, None, None, None, None, None, None, None, None, None, None @@ -270,6 +226,7 @@ class Softmax: For more details about sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509 """ + def sparse_softmax(*args, **kwargs): return _sparse_softmax.apply(*args, **kwargs) @@ -278,9 +235,7 @@ class Softmax: """ key = (device, ) if key not in self.lut_cache: - self.lut_cache[key] = _sparse_softmax.make_lut(self.layout, - self.block, - device) + self.lut_cache[key] = _sparse_softmax.make_lut(self.layout, self.block, device) return self.lut_cache[key] def __init__(self, layout, block, bench=False): @@ -332,19 +287,7 @@ class Softmax: if key_padding_mask is not None and key_padding_mask.dtype != x.dtype: raise ValueError('Key padding mask must be %s' % x.dtype) lut, maxlut = self.make_lut(x.device) - x = Softmax.sparse_softmax(x, - scale, - rpe, - key_padding_mask, - attn_mask, - key_padding_mask_mode, - attn_mask_mode, - self.spdims, - self.block, - lut, - self.num_blocks, - maxlut, - self.bench, - time_y) + x = Softmax.sparse_softmax(x, scale, rpe, key_padding_mask, attn_mask, key_padding_mask_mode, attn_mask_mode, + self.spdims, self.block, lut, self.num_blocks, maxlut, self.bench, time_y) self.time_y = time_y[0] return x diff --git a/deepspeed/ops/sparse_attention/sparse_attention_utils.py b/deepspeed/ops/sparse_attention/sparse_attention_utils.py index 90edb10fd..73a1439ce 100644 --- a/deepspeed/ops/sparse_attention/sparse_attention_utils.py +++ b/deepspeed/ops/sparse_attention/sparse_attention_utils.py @@ -15,6 +15,7 @@ class SparseAttentionUtils: Such utilities include extending position embeddings, replacing current self-attention layer with sparse attention, padding sequences to multiple of block size, etc. """ + @staticmethod def extend_position_embedding(model, max_position): """This function extends the position embedding weights of a model loaded from a checkpoint. 
@@ -28,13 +29,11 @@ class SparseAttentionUtils: """ if hasattr(model, 'bert'): - original_max_position = model.bert.embeddings.position_embeddings.weight.size( - 0) + original_max_position = model.bert.embeddings.position_embeddings.weight.size(0) assert max_position > original_max_position extend_multiples = max(1, max_position // original_max_position) model.bert.embeddings.position_embeddings.weight.data = model.bert.embeddings.position_embeddings.weight.repeat( - extend_multiples, - 1) + extend_multiples, 1) elif hasattr(model, 'roberta'): # RoBERTa has positions 0 & 1 reserved, so embedding size is max position + 2 original_max_position, embed_size = model.roberta.embeddings.position_embeddings.weight.shape @@ -43,13 +42,11 @@ class SparseAttentionUtils: assert max_position > original_max_position max_position += 2 extended_position_embedding = model.roberta.embeddings.position_embeddings.weight.new_empty( - max_position, - embed_size) + max_position, embed_size) k = 2 for i in range(extend_multiples): extended_position_embedding[k:( - k + original_max_position - )] = model.roberta.embeddings.position_embeddings.weight[2:] + k + original_max_position)] = model.roberta.embeddings.position_embeddings.weight[2:] k += original_max_position model.roberta.embeddings.position_embeddings.weight.data = extended_position_embedding else: @@ -58,9 +55,7 @@ class SparseAttentionUtils: ) model.config.max_position_embeddings = max_position - print( - f'Extended position embeddings to {original_max_position * extend_multiples}' - ) + print(f'Extended position embeddings to {original_max_position * extend_multiples}') return model @@ -102,21 +97,17 @@ class SparseAttentionUtils: if hasattr(model, 'bert'): model.config.max_position_embeddings = max_position - model.replace_self_attention_layer_with_sparse_self_attention_layer( - model.config, - model.bert.encoder.layer, - sparsity_config) + model.replace_self_attention_layer_with_sparse_self_attention_layer(model.config, model.bert.encoder.layer, + sparsity_config) elif hasattr(model, 'roberta'): model.config.max_position_embeddings = max_position + 2 - model.replace_self_attention_layer_with_sparse_self_attention_layer( - model.config, - model.roberta.encoder.layer, - sparsity_config) + model.replace_self_attention_layer_with_sparse_self_attention_layer(model.config, + model.roberta.encoder.layer, + sparsity_config) else: raise ValueError( 'Please extend \"update_model_self_attention_to_sparse_self_attention\" function to support \ - your model type. It currently only supports \"bert\" & \"roberta\"!' - ) + your model type. It currently only supports \"bert\" & \"roberta\"!') return model @staticmethod @@ -148,14 +139,8 @@ class SparseAttentionUtils: return layers @staticmethod - def pad_to_block_size(block_size, - input_ids, - attention_mask, - token_type_ids, - position_ids, - inputs_embeds, - pad_token_id, - model_embeddings): + def pad_to_block_size(block_size, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds, + pad_token_id, model_embeddings): """This function pads input tokens and attention mask on sequence length dimension to be multiple of block size. This is a requirement for Sparse Transformer in which the self attention layer works on sequences of length multiple of block size. It needs to be called in your model, such as BertModel, right before you calculate the embedding outputs. 
@@ -187,10 +172,7 @@ class SparseAttentionUtils: pad_len = (block_size - seq_len % block_size) % block_size if pad_len > 0: if inputs_embeds is not None: - pad_input_ids = inputs_embeds.new_full((batch_size, - pad_len), - pad_token_id, - dtype=torch.long) + pad_input_ids = inputs_embeds.new_full((batch_size, pad_len), pad_token_id, dtype=torch.long) pad_inputs_embeds = model_embeddings(pad_input_ids) inputs_embeds = torch.cat([inputs_embeds, pad_inputs_embeds], dim=-2) # may not be needed as input_ids are not used if inputs_embeds are given diff --git a/deepspeed/ops/sparse_attention/sparse_self_attention.py b/deepspeed/ops/sparse_attention/sparse_self_attention.py index 46dedd077..306459379 100644 --- a/deepspeed/ops/sparse_attention/sparse_self_attention.py +++ b/deepspeed/ops/sparse_attention/sparse_self_attention.py @@ -15,6 +15,7 @@ class SparseSelfAttention(nn.Module): For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial. """ + def __init__( self, # SparsityConfig parameters needs to be set accordingly @@ -53,8 +54,7 @@ class SparseSelfAttention(nn.Module): if (L % self.sparsity_config.block != 0): raise ValueError( - f'Sequence Length, {L}, needs to be dividable by Block size {self.sparsity_config.block}!' - ) + f'Sequence Length, {L}, needs to be dividable by Block size {self.sparsity_config.block}!') num_blocks = L // self.sparsity_config.block return self.master_layout[..., :num_blocks, :num_blocks].cpu() # layout needs to be a CPU tensor @@ -65,11 +65,7 @@ class SparseSelfAttention(nn.Module): from deepspeed.ops.sparse_attention.softmax import Softmax if L not in SparseSelfAttention.ops: sparsity_layout = self.get_layout(L) - sparse_dot_sdd_nt = MatMul(sparsity_layout, - self.sparsity_config.block, - 'sdd', - trans_a=False, - trans_b=True) + sparse_dot_sdd_nt = MatMul(sparsity_layout, self.sparsity_config.block, 'sdd', trans_a=False, trans_b=True) sparse_dot_dsd_nn = MatMul(sparsity_layout, self.sparsity_config.block, @@ -79,9 +75,7 @@ class SparseSelfAttention(nn.Module): sparse_softmax = Softmax(sparsity_layout, self.sparsity_config.block) - SparseSelfAttention.ops[L] = (sparse_dot_sdd_nt, - sparse_dot_dsd_nn, - sparse_softmax) + SparseSelfAttention.ops[L] = (sparse_dot_sdd_nt, sparse_dot_dsd_nn, sparse_softmax) return SparseSelfAttention.ops[L] def transpose_key_for_scores(self, x, L): @@ -100,13 +94,7 @@ class SparseSelfAttention(nn.Module): return x.squeeze() # forward pass - def forward(self, - query, - key, - value, - rpe=None, - key_padding_mask=None, - attn_mask=None): + def forward(self, query, key, value, rpe=None, key_padding_mask=None, attn_mask=None): """Applies forward phase of sparse self attention Arguments: @@ -134,9 +122,7 @@ class SparseSelfAttention(nn.Module): # squeeze key_padding_mask if it is given if key_padding_mask is not None: - key_padding_mask = self.transpose_mask_for_sparse(query.dtype, - key_padding_mask, - is_key_padding_mask=True) + key_padding_mask = self.transpose_mask_for_sparse(query.dtype, key_padding_mask, is_key_padding_mask=True) # squeeze attn_mask if it is given if attn_mask is not None: @@ -149,14 +135,13 @@ class SparseSelfAttention(nn.Module): # attention scores attn_output_weights = sparse_dot_sdd_nt(query, key) - attn_output_weights = sparse_softmax( - attn_output_weights, - scale=scaling, - rpe=rpe, - key_padding_mask=key_padding_mask, - attn_mask=attn_mask, - key_padding_mask_mode=self.key_padding_mask_mode, - attn_mask_mode=self.attn_mask_mode) + attn_output_weights = sparse_softmax(attn_output_weights, + 
scale=scaling, + rpe=rpe, + key_padding_mask=key_padding_mask, + attn_mask=attn_mask, + key_padding_mask_mode=self.key_padding_mask_mode, + attn_mask_mode=self.attn_mask_mode) # outputs attn_output = sparse_dot_dsd_nn(attn_output_weights, value) diff --git a/deepspeed/ops/sparse_attention/sparsity_config.py b/deepspeed/ops/sparse_attention/sparsity_config.py index efdad2e4d..bb82a2ead 100644 --- a/deepspeed/ops/sparse_attention/sparsity_config.py +++ b/deepspeed/ops/sparse_attention/sparsity_config.py @@ -10,6 +10,7 @@ class SparsityConfig: """Abstract Configuration class to store `sparsity configuration of a self attention layer`. It contains shared property of different block-sparse sparsity patterns. However, each class needs to extend it based on required property and functionality. """ + def __init__(self, num_heads, block=16, different_layout_per_head=False): """Initialize the Sparsity Pattern Config. @@ -37,9 +38,7 @@ class SparsityConfig: """ if (seq_len % self.block != 0): - raise ValueError( - f'Sequence Length, {seq_len}, needs to be dividable by Block size {self.block}!' - ) + raise ValueError(f'Sequence Length, {seq_len}, needs to be dividable by Block size {self.block}!') num_blocks = seq_len // self.block # TODO Currently we allocate layout per head; needs to be updated if heads share a single layout. layout = torch.zeros((self.num_heads, num_blocks, num_blocks), dtype=torch.int64) @@ -64,6 +63,7 @@ class DenseSparsityConfig(SparsityConfig): """Configuration class to store `Dense` configuration. In reality, this is not sparse and all blocks are used. We keep it for the sake of comparison and comprehension. """ + def __init__(self, num_heads, block=16, different_layout_per_head=False): """Initialize the Dense Sparsity Pattern Config. In reality, this is not sparse and all blocks are used. We keep it for the sake of comparison and comprehension. @@ -96,6 +96,7 @@ class FixedSparsityConfig(SparsityConfig): For more details about this sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509; this has been customized. This class extends parent class of `SparsityConfig` and customizes it for `Fixed` sparsity. """ + def __init__(self, num_heads, block=16, @@ -131,14 +132,11 @@ class FixedSparsityConfig(SparsityConfig): self.num_global_blocks = num_global_blocks if (attention != 'unidirectional' and attention != 'bidirectional'): - raise NotImplementedError( - 'only \"uni/bi-directional\" attentions are supported for now!') + raise NotImplementedError('only \"uni/bi-directional\" attentions are supported for now!') self.attention = attention if (attention != 'bidirectional' and horizontal_global_attention): - raise ValueError( - 'only \"bi-directional\" attentions can support horizontal global attention!' 
- ) + raise ValueError('only \"bi-directional\" attentions can support horizontal global attention!') self.horizontal_global_attention = horizontal_global_attention if (num_different_global_patterns > 1 and not different_layout_per_head): @@ -166,9 +164,7 @@ class FixedSparsityConfig(SparsityConfig): for i in range(0, num_blocks, self.num_local_blocks): end = min(i + self.num_local_blocks, num_blocks) for row in range(i, end): - for col in range( - i, - (row + 1 if self.attention == 'unidirectional' else end)): + for col in range(i, (row + 1 if self.attention == 'unidirectional' else end)): layout[h, row, col] = 1 return layout @@ -206,8 +202,7 @@ class FixedSparsityConfig(SparsityConfig): # set last global blocks; handle possible short last local window if (end < num_blocks): - start = min(end + first_global_block_idx, - num_blocks - self.num_global_blocks) + start = min(end + first_global_block_idx, num_blocks - self.num_global_blocks) end = start + self.num_global_blocks # vertical global attention @@ -250,6 +245,7 @@ class VariableSparsityConfig(SparsityConfig): For more details about `Fixed` sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509; this has been customized. This class extends parent class of `SparsityConfig` and customizes it for `Fixed` sparsity. """ + def __init__(self, num_heads, block=16, @@ -296,14 +292,11 @@ class VariableSparsityConfig(SparsityConfig): self.global_block_end_indices = global_block_end_indices if (attention != 'unidirectional' and attention != 'bidirectional'): - raise NotImplementedError( - 'only \"uni/bi-directional\" attentions are supported for now!') + raise NotImplementedError('only \"uni/bi-directional\" attentions are supported for now!') self.attention = attention if (attention != 'bidirectional' and horizontal_global_attention): - raise ValueError( - 'only \"bi-directional\" attentions can support horizontal global attention!' - ) + raise ValueError('only \"bi-directional\" attentions can support horizontal global attention!') self.horizontal_global_attention = horizontal_global_attention def set_random_layout(self, h, layout): @@ -345,9 +338,7 @@ class VariableSparsityConfig(SparsityConfig): end_block_idx += block_size end_block_idx = min(end_block_idx, num_blocks) for row in range(start_block_idx, end_block_idx): - for col in range( - start_block_idx, - (row + 1 if self.attention == 'unidirectional' else end_block_idx)): + for col in range(start_block_idx, (row + 1 if self.attention == 'unidirectional' else end_block_idx)): layout[h, row, col] = 1 start_block_idx += block_size @@ -355,9 +346,7 @@ class VariableSparsityConfig(SparsityConfig): for i in range(start_block_idx, num_blocks, block_size): end_block_idx = min(i + block_size, num_blocks) for row in range(i, end_block_idx): - for col in range( - i, - (row + 1 if self.attention == 'unidirectional' else end_block_idx)): + for col in range(i, (row + 1 if self.attention == 'unidirectional' else end_block_idx)): layout[h, row, col] = 1 return layout @@ -423,6 +412,7 @@ class BigBirdSparsityConfig(SparsityConfig): For more details about this sparsity config, please see `Big Bird: Transformers for Longer Sequences`: https://arxiv.org/pdf/2007.14062.pdf This class extends parent class of `SparsityConfig` and customizes it for `BigBird` sparsity. 
""" + def __init__(self, num_heads, block=16, @@ -452,8 +442,7 @@ class BigBirdSparsityConfig(SparsityConfig): self.num_global_blocks = num_global_blocks if (attention != 'unidirectional' and attention != 'bidirectional'): - raise NotImplementedError( - 'only \"uni/bi-directional\" attentions are supported for now!') + raise NotImplementedError('only \"uni/bi-directional\" attentions are supported for now!') self.attention = attention def set_random_layout(self, h, layout): @@ -475,10 +464,7 @@ class BigBirdSparsityConfig(SparsityConfig): ) for row in range(0, num_blocks): - sample_range = range( - 0, - num_blocks) if self.attention == 'bidirectional' else range(0, - row + 1) + sample_range = range(0, num_blocks) if self.attention == 'bidirectional' else range(0, row + 1) rnd_cols = random.sample(sample_range, self.num_random_blocks) layout[h, row, rnd_cols] = 1 return layout @@ -564,6 +550,7 @@ class BSLongformerSparsityConfig(SparsityConfig): For more details about this sparsity config, please see `Longformer: The Long-Document Transformer`: https://arxiv.org/pdf/2004.05150.pdf This class extends parent class of `SparsityConfig` and customizes it for `Longformer` sparsity. """ + def __init__(self, num_heads, block=16, @@ -687,11 +674,8 @@ class LocalSlidingWindowSparsityConfig(SparsityConfig): """Configuration class to store `Local Sliding Window` sparsity configuration - a purely-local sliding window attention. This class extends parent class of `SparsityConfig` and customizes it for `Local` sparsity. """ - def __init__(self, - num_heads, - block=16, - num_sliding_window_blocks=3, - attention='unidirectional'): + + def __init__(self, num_heads, block=16, num_sliding_window_blocks=3, attention='unidirectional'): """Initialize the Local Sliding Window Sparsity Pattern Config. 
For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial Arguments: @@ -723,8 +707,7 @@ class LocalSlidingWindowSparsityConfig(SparsityConfig): w = self.num_sliding_window_blocks // 2 for row in range(0, num_blocks): start = max(0, row - w) - end = min(row + w + 1, - num_blocks) if self.attention == "bidirectional" else row + 1 + end = min(row + w + 1, num_blocks) if self.attention == "bidirectional" else row + 1 layout[h, row, start:end] = 1 return layout diff --git a/deepspeed/ops/transformer/inference/bias_add.py b/deepspeed/ops/transformer/inference/bias_add.py index b7d2944fb..9a3daccce 100644 --- a/deepspeed/ops/transformer/inference/bias_add.py +++ b/deepspeed/ops/transformer/inference/bias_add.py @@ -22,7 +22,4 @@ def nhwc_bias_add(activation: torch.Tensor, elif other_bias is None: return spatial_cuda_module.nhwc_bias_add_add(activation, bias, other) else: - return spatial_cuda_module.nhwc_bias_add_bias_add(activation, - bias, - other, - other_bias) + return spatial_cuda_module.nhwc_bias_add_bias_add(activation, bias, other, other_bias) diff --git a/deepspeed/ops/transformer/inference/config.py b/deepspeed/ops/transformer/inference/config.py index f0c87ba56..e9cd78fc0 100644 --- a/deepspeed/ops/transformer/inference/config.py +++ b/deepspeed/ops/transformer/inference/config.py @@ -6,6 +6,7 @@ from deepspeed.utils.types import ActivationFuncType class TransformerConfig(): + def __init__(self, hidden_size, intermediate_size, heads, num_hidden_layers): self.layer_id = -1 self.hidden_size = hidden_size @@ -40,6 +41,7 @@ class DeepSpeedInferenceConfig(TransformerConfig): return_tuple: if True, returns the transformer output as a tuple, otherwise returns as a tensor bigscience_bloom: This flag is added temporarily for supporting the BLOOM-176B model architecture. 
""" + def __init__(self, hidden_size=-1, intermediate_size=-1, @@ -70,11 +72,8 @@ class DeepSpeedInferenceConfig(TransformerConfig): scale_attn_by_inverse_layer_idx=False, return_single_tuple=False): super(DeepSpeedInferenceConfig, - self).__init__( - hidden_size, - (intermediate_size if intermediate_size > 0 else 4 * hidden_size), - heads, - num_hidden_layers) + self).__init__(hidden_size, (intermediate_size if intermediate_size > 0 else 4 * hidden_size), heads, + num_hidden_layers) self.fp16 = fp16 self.pre_layer_norm = pre_layer_norm self.local_rank = local_rank diff --git a/deepspeed/ops/transformer/inference/diffusers_2d_transformer.py b/deepspeed/ops/transformer/inference/diffusers_2d_transformer.py index 246eab867..985e58d09 100644 --- a/deepspeed/ops/transformer/inference/diffusers_2d_transformer.py +++ b/deepspeed/ops/transformer/inference/diffusers_2d_transformer.py @@ -4,5 +4,6 @@ Copyright 2022 The Microsoft DeepSpeed Team class Diffusers2DTransformerConfig(): + def __init__(self, int8_quantization=False): self.int8_quantization = int8_quantization diff --git a/deepspeed/ops/transformer/inference/diffusers_attention.py b/deepspeed/ops/transformer/inference/diffusers_attention.py index 9d829ce3e..63a444ba1 100644 --- a/deepspeed/ops/transformer/inference/diffusers_attention.py +++ b/deepspeed/ops/transformer/inference/diffusers_attention.py @@ -30,26 +30,12 @@ def load_triton_flash_attn(): class DeepSpeedDiffusersAttentionFunction(Function): + @staticmethod - def forward(ctx, - input, - context, - input_mask, - config, - attn_qkvw, - attn_qw, - attn_kw, - attn_vw, - attn_qkvb, - num_attention_heads_per_partition, - norm_factor, - hidden_size_per_partition, - attn_ow, - attn_ob, - do_out_bias, - score_context_func, - linear_func, - triton_flash_attn_kernel): + def forward(ctx, input, context, input_mask, config, attn_qkvw, attn_qw, attn_kw, attn_vw, attn_qkvb, + num_attention_heads_per_partition, norm_factor, hidden_size_per_partition, attn_ow, attn_ob, + do_out_bias, score_context_func, linear_func, triton_flash_attn_kernel): + def _transpose_for_context(x): x = x.permute(0, 2, 1, 3) new_x_layer_shape = x.size()[:-2] + \ @@ -58,8 +44,7 @@ class DeepSpeedDiffusersAttentionFunction(Function): def _transpose_for_scores(x): attention_head_size = x.shape[-1] // num_attention_heads_per_partition - new_x_shape = x.size()[:-1] + (num_attention_heads_per_partition, - attention_head_size) + new_x_shape = x.size()[:-1] + (num_attention_heads_per_partition, attention_head_size) x = x.reshape(*new_x_shape) x = x.permute(0, 2, 1, 3) return x.contiguous() @@ -71,19 +56,12 @@ class DeepSpeedDiffusersAttentionFunction(Function): do_flash_attn = (head_size <= 128) scale = (1 / norm_factor) * (1 / norm_factor) if do_flash_attn and context == None: - qkv_out = linear_func(input, - attn_qkvw, - attn_qkvb if attn_qkvb is not None else attn_qkvw, - attn_qkvb is not None, - do_flash_attn, - config.heads) + qkv_out = linear_func(input, attn_qkvw, attn_qkvb if attn_qkvb is not None else attn_qkvw, attn_qkvb + is not None, do_flash_attn, config.heads) - context_layer = triton_flash_attn_kernel(qkv_out[0], - qkv_out[1], - qkv_out[2], - scale, + context_layer = triton_flash_attn_kernel(qkv_out[0], qkv_out[1], qkv_out[2], scale, input.shape[-2] % 128 == 0) - context_layer = _transpose_for_context(context_layer[:,:,:,:head_size]) + context_layer = _transpose_for_context(context_layer[:, :, :, :head_size]) else: do_flash_attn = False @@ -97,21 +75,12 @@ class DeepSpeedDiffusersAttentionFunction(Function): 
query = query.contiguous() key = key.contiguous() value = value.contiguous() - query, key, value = inference_cuda_module.pad_transform_fp16(query, key, value, config.heads, do_flash_attn) - attention_scores = (torch.matmul(query, - key.transpose(-1, - -2)) * - scale).softmax(dim=-1) - context_layer = _transpose_for_context( - torch.matmul(attention_scores, - value)) + query, key, value = inference_cuda_module.pad_transform_fp16(query, key, value, config.heads, + do_flash_attn) + attention_scores = (torch.matmul(query, key.transpose(-1, -2)) * scale).softmax(dim=-1) + context_layer = _transpose_for_context(torch.matmul(attention_scores, value)) - output = linear_func(context_layer, - attn_ow, - attn_ob, - do_out_bias, - False, - config.heads) + output = linear_func(context_layer, attn_ow, attn_ob, do_out_bias, False, config.heads) return output output = selfAttention_fp(input, context, input_mask) @@ -142,8 +111,7 @@ class DeepSpeedDiffusersAttention(nn.Module): self.config = config self.config.layer_id = DeepSpeedDiffusersAttention.layer_id DeepSpeedDiffusersAttention.layer_id += 1 - device = get_accelerator().current_device_name( - ) if config.bigscience_bloom else 'cpu' + device = get_accelerator().current_device_name() if config.bigscience_bloom else 'cpu' qkv_size_per_partition = (self.config.hidden_size // self.config.mp_size) * 3 data_type = torch.int8 if config.q_int8 else torch.half if config.fp16 else torch.float @@ -176,9 +144,7 @@ class DeepSpeedDiffusersAttention(nn.Module): dtype=data_type, device=device), requires_grad=False) - self.attn_qkvb = nn.Parameter(torch.empty(qkv_size_per_partition, - dtype=data_type_fp, - device=device), + self.attn_qkvb = nn.Parameter(torch.empty(qkv_size_per_partition, dtype=data_type_fp, device=device), requires_grad=False) out_size_per_partition = self.config.hidden_size // self.config.mp_size self.attn_ow = nn.Parameter(torch.empty(out_size_per_partition, @@ -187,9 +153,7 @@ class DeepSpeedDiffusersAttention(nn.Module): device=device), requires_grad=False) - self.attn_ob = nn.Parameter(torch.empty(self.config.hidden_size, - dtype=data_type_fp, - device=device), + self.attn_ob = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type_fp, device=device), requires_grad=False) self.do_out_bias = True @@ -200,8 +164,7 @@ class DeepSpeedDiffusersAttention(nn.Module): self.hidden_size_per_partition = self.config.hidden_size // self.config.mp_size self.hidden_size_per_attention_head = self.config.hidden_size // self.config.heads - self.norm_factor = math.sqrt( - math.sqrt(self.config.hidden_size // self.config.heads)) + self.norm_factor = math.sqrt(math.sqrt(self.config.hidden_size // self.config.heads)) if self.config.scale_attn_by_inverse_layer_idx is True: self.norm_factor *= math.sqrt(self.config.layer_id + 1) @@ -216,33 +179,15 @@ class DeepSpeedDiffusersAttention(nn.Module): def forward(self, input, context=None, input_mask=None): if self.config.layer_id == 0: - self.allocate_workspace(self.config.hidden_size, - self.config.heads, + self.allocate_workspace(self.config.hidden_size, self.config.heads, input.size()[1], - input.size()[0], - DeepSpeedDiffusersAttention.layer_id, - self.config.mp_size, - False, - 0, - self.config.max_out_tokens) - output = DeepSpeedDiffusersAttentionFunction.apply( - input, - context, - input_mask, - self.config, - self.attn_qkvw, - self.attn_qw, - self.attn_kw, - self.attn_vw, - self.attn_qkvb, - self.num_attention_heads_per_partition, - self.norm_factor, - self.hidden_size_per_partition, - self.attn_ow, - 
self.attn_ob, - self.do_out_bias, - self.score_context_func, - self.linear_func, - self.triton_flash_attn_kernel) + input.size()[0], DeepSpeedDiffusersAttention.layer_id, self.config.mp_size, False, + 0, self.config.max_out_tokens) + output = DeepSpeedDiffusersAttentionFunction.apply(input, context, input_mask, self.config, self.attn_qkvw, + self.attn_qw, self.attn_kw, self.attn_vw, self.attn_qkvb, + self.num_attention_heads_per_partition, self.norm_factor, + self.hidden_size_per_partition, self.attn_ow, self.attn_ob, + self.do_out_bias, self.score_context_func, self.linear_func, + self.triton_flash_attn_kernel) return output diff --git a/deepspeed/ops/transformer/inference/diffusers_transformer_block.py b/deepspeed/ops/transformer/inference/diffusers_transformer_block.py index e453c343e..b3de95f13 100644 --- a/deepspeed/ops/transformer/inference/diffusers_transformer_block.py +++ b/deepspeed/ops/transformer/inference/diffusers_transformer_block.py @@ -31,41 +31,30 @@ def load_spatial_module(): class DeepSpeedDiffusersTransformerBlock(nn.Module): - def __init__(self, - equivalent_module: nn.Module, - config: Diffusers2DTransformerConfig): + + def __init__(self, equivalent_module: nn.Module, config: Diffusers2DTransformerConfig): super(DeepSpeedDiffusersTransformerBlock, self).__init__() self.quantizer = module_inject.GroupQuantizer(q_int8=config.int8_quantization) # Ensure ops are built by the time we start running self.config = config self.ff1_w = self.quantizer.quantize( - nn.Parameter(equivalent_module.ff.net[0].proj.weight.data, - requires_grad=False)) - self.ff1_b = nn.Parameter(equivalent_module.ff.net[0].proj.bias.data, - requires_grad=False) - self.ff2_w = self.quantizer.quantize( - nn.Parameter(equivalent_module.ff.net[2].weight.data, - requires_grad=False)) - self.ff2_b = nn.Parameter(equivalent_module.ff.net[2].bias.data, - requires_grad=False) + nn.Parameter(equivalent_module.ff.net[0].proj.weight.data, requires_grad=False)) + self.ff1_b = nn.Parameter(equivalent_module.ff.net[0].proj.bias.data, requires_grad=False) + self.ff2_w = self.quantizer.quantize(nn.Parameter(equivalent_module.ff.net[2].weight.data, + requires_grad=False)) + self.ff2_b = nn.Parameter(equivalent_module.ff.net[2].bias.data, requires_grad=False) - self.norm1_g = nn.Parameter(equivalent_module.norm1.weight.data, - requires_grad=False) - self.norm1_b = nn.Parameter(equivalent_module.norm1.bias.data, - requires_grad=False) + self.norm1_g = nn.Parameter(equivalent_module.norm1.weight.data, requires_grad=False) + self.norm1_b = nn.Parameter(equivalent_module.norm1.bias.data, requires_grad=False) self.norm1_eps = equivalent_module.norm1.eps - self.norm2_g = nn.Parameter(equivalent_module.norm2.weight.data, - requires_grad=False) - self.norm2_b = nn.Parameter(equivalent_module.norm2.bias.data, - requires_grad=False) + self.norm2_g = nn.Parameter(equivalent_module.norm2.weight.data, requires_grad=False) + self.norm2_b = nn.Parameter(equivalent_module.norm2.bias.data, requires_grad=False) self.norm2_eps = equivalent_module.norm2.eps - self.norm3_g = nn.Parameter(equivalent_module.norm3.weight.data, - requires_grad=False) - self.norm3_b = nn.Parameter(equivalent_module.norm3.bias.data, - requires_grad=False) + self.norm3_g = nn.Parameter(equivalent_module.norm3.weight.data, requires_grad=False) + self.norm3_b = nn.Parameter(equivalent_module.norm3.bias.data, requires_grad=False) self.norm3_eps = equivalent_module.norm3.eps self.attn_1 = equivalent_module.attn1 @@ -76,16 +65,14 @@ class 
DeepSpeedDiffusersTransformerBlock(nn.Module): self.attn_1.do_out_bias = False self.attn_1_bias = self.attn_1.attn_ob else: - self.attn_1_bias = nn.Parameter(torch.zeros_like(self.norm2_g), - requires_grad=False) + self.attn_1_bias = nn.Parameter(torch.zeros_like(self.norm2_g), requires_grad=False) # Pull the bias in if we can if isinstance(self.attn_2, DeepSpeedDiffusersAttention): self.attn_2.do_out_bias = False self.attn_2_bias = self.attn_2.attn_ob else: - self.attn_2_bias = nn.Paramaeter(torch.zeros_like(self.norm3_g), - requires_grad=False) + self.attn_2_bias = nn.Paramaeter(torch.zeros_like(self.norm3_g), requires_grad=False) self.transformer_cuda_module = load_transformer_module() load_spatial_module() @@ -99,25 +86,14 @@ class DeepSpeedDiffusersTransformerBlock(nn.Module): if "encoder_hidden_states" in kwargs and kwargs["encoder_hidden_states"] != None: context = kwargs["encoder_hidden_states"] - out_norm_1 = self.transformer_cuda_module.layer_norm(hidden_states, - self.norm1_g, - self.norm1_b, - self.norm1_eps) + out_norm_1 = self.transformer_cuda_module.layer_norm(hidden_states, self.norm1_g, self.norm1_b, self.norm1_eps) out_attn_1 = self.attn_1(out_norm_1) - out_norm_2, out_attn_1 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res(out_attn_1, - self.attn_1_bias, - hidden_states, - self.norm2_g, - self.norm2_b, - self.norm2_eps) + out_norm_2, out_attn_1 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res( + out_attn_1, self.attn_1_bias, hidden_states, self.norm2_g, self.norm2_b, self.norm2_eps) out_attn_2 = self.attn_2(out_norm_2, context=context) - out_norm_3, out_attn_2 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res(out_attn_2, - self.attn_2_bias, - out_attn_1, - self.norm3_g, - self.norm3_b, - self.norm3_eps) + out_norm_3, out_attn_2 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res( + out_attn_2, self.attn_2_bias, out_attn_1, self.norm3_g, self.norm3_b, self.norm3_eps) out_ff1 = nn.functional.linear(out_norm_3, self.ff1_w) out_geglu = self.transformer_cuda_module.bias_geglu(out_ff1, self.ff1_b) diff --git a/deepspeed/ops/transformer/inference/ds_attention.py b/deepspeed/ops/transformer/inference/ds_attention.py index f4ec14bf4..851ffdc93 100644 --- a/deepspeed/ops/transformer/inference/ds_attention.py +++ b/deepspeed/ops/transformer/inference/ds_attention.py @@ -22,17 +22,14 @@ class DeepSpeedSelfAttention(nn.Module): data_type_fp = torch.half if config.fp16 else torch.float self.config.layer_id = DeepSpeedSelfAttention.num_layers DeepSpeedSelfAttention.num_layers = DeepSpeedSelfAttention.num_layers + 1 - device = get_accelerator().current_device_name( - ) #if config.bigscience_bloom else 'cpu' + device = get_accelerator().current_device_name() #if config.bigscience_bloom else 'cpu' qkv_size_per_partition = (self.config.hidden_size // self.config.mp_size) * 3 self.attn_qkvw = nn.Parameter(torch.empty(self.config.hidden_size, qkv_size_per_partition, dtype=data_type, device=device), requires_grad=False) - self.attn_qkvb = nn.Parameter(torch.empty(qkv_size_per_partition, - dtype=data_type_fp, - device=device), + self.attn_qkvb = nn.Parameter(torch.empty(qkv_size_per_partition, dtype=data_type_fp, device=device), requires_grad=False) out_size_per_partition = self.config.hidden_size // self.config.mp_size self.attn_ow = nn.Parameter(torch.empty(out_size_per_partition, @@ -41,9 +38,7 @@ class DeepSpeedSelfAttention(nn.Module): device=device), requires_grad=False) - self.attn_ob = 
nn.Parameter(torch.empty(self.config.hidden_size, - dtype=data_type_fp, - device=device), + self.attn_ob = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type_fp, device=device), requires_grad=False) self.num_attention_heads_per_partition = self.config.heads // self.config.mp_size @@ -115,34 +110,32 @@ class DeepSpeedSelfAttention(nn.Module): num_heads=self.num_attention_heads_per_partition, num_layers=DeepSpeedSelfAttention.num_layers) else: - qkv_out = self.qkv_func( - input=input, - weight=self.attn_qkvw, - bias=(self.attn_qkvb if self.attn_qkvb is not None else norm_b), - gamma=norm_w, - beta=norm_b, - add_bias=(self.attn_qkvb is not None), - num_layers=DeepSpeedSelfAttention.num_layers, - num_heads=self.num_attention_heads_per_partition) + qkv_out = self.qkv_func(input=input, + weight=self.attn_qkvw, + bias=(self.attn_qkvb if self.attn_qkvb is not None else norm_b), + gamma=norm_w, + beta=norm_b, + add_bias=(self.attn_qkvb is not None), + num_layers=DeepSpeedSelfAttention.num_layers, + num_heads=self.num_attention_heads_per_partition) - context_layer, key_layer, value_layer = self.compute_attention( - qkv_out=qkv_out, - input_mask=input_mask, - layer_past=layer_past, - alibi=alibi) + context_layer, key_layer, value_layer = self.compute_attention(qkv_out=qkv_out, + input_mask=input_mask, + layer_past=layer_past, + alibi=alibi) output = self.vector_matmul_func(input=context_layer, weight=self.attn_ow) inp_norm = qkv_out[-1] - if self.config.mlp_after_attn and self.mp_group is not None and dist.get_world_size( - group=self.mp_group) > 1: + if self.config.mlp_after_attn and self.mp_group is not None and dist.get_world_size(group=self.mp_group) > 1: dist.all_reduce(output, group=self.mp_group) return (output, key_layer, value_layer, context_layer, inp_norm) class BloomSelfAttention(DeepSpeedSelfAttention): + def __init__(self, *args, **kwargs): super(BloomSelfAttention, self).__init__(*args, **kwargs) self.softmax_func = SoftmaxOp(self.config) @@ -156,10 +149,7 @@ class BloomSelfAttention(DeepSpeedSelfAttention): (self.hidden_size_per_partition,) return x.view(*new_x_layer_shape).contiguous() - def _split_tensor_along_last_dim(self, - tensor, - num_partitions, - contiguous_split_chunks=True): + def _split_tensor_along_last_dim(self, tensor, num_partitions, contiguous_split_chunks=True): """Split a tensor along its last dimension. 
Args: @@ -196,64 +186,43 @@ class BloomSelfAttention(DeepSpeedSelfAttention): mixed_x_layer = qkv_out alibi = alibi.to(get_accelerator().current_device_name()) head_dim = self.hidden_size_per_partition // self.num_attention_heads_per_partition - new_tensor_shape = mixed_x_layer.size()[:-1] + ( - self.num_attention_heads_per_partition, - 3 * head_dim) + new_tensor_shape = mixed_x_layer.size()[:-1] + (self.num_attention_heads_per_partition, 3 * head_dim) mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) query_layer, key_layer, value_layer = self._split_tensor_along_last_dim(mixed_x_layer, 3) # [batch_size, head_dim, q_length, k_length] - output_size = (query_layer.size(0), - query_layer.size(2), - query_layer.size(1), - key_layer.size(1)) + output_size = (query_layer.size(0), query_layer.size(2), query_layer.size(1), key_layer.size(1)) # [batch_size, q_length, num_heads, head_dim] -> [q_length, batch_size * num_heads, head_dim] - query_layer = query_layer.transpose(1, - 2).reshape(output_size[0] * output_size[1], - output_size[2], - -1) + query_layer = query_layer.transpose(1, 2).reshape(output_size[0] * output_size[1], output_size[2], -1) # [batch_size, k_length, num_heads, head_dim] -> [k_length, batch_size * num_heads, head_dim] - key_layer = key_layer.transpose(1, - 2).reshape(output_size[0] * output_size[1], - output_size[3], - -1).transpose(-1, - -2) - value_layer = value_layer.transpose(1, - 2).reshape(output_size[0] * output_size[1], - output_size[3], - -1) + key_layer = key_layer.transpose(1, 2).reshape(output_size[0] * output_size[1], output_size[3], + -1).transpose(-1, -2) + value_layer = value_layer.transpose(1, 2).reshape(output_size[0] * output_size[1], output_size[3], -1) if layer_past is not None: past_key, past_value = layer_past # concatenate along seq_length dimension -> [batch_size, qk_length, num_heads, head_dim] key_layer = torch.cat((past_key.type_as(key_layer), key_layer), dim=-1) - value_layer = torch.cat((past_value.type_as(value_layer), - value_layer), - dim=-2) + value_layer = torch.cat((past_value.type_as(value_layer), value_layer), dim=-2) presents = (key_layer, value_layer) # Raw attention scores. 
[batch_size * num_heads, q_length, k_length] matmul_result = torch.matmul(query_layer, key_layer) # change view to [batch_size, num_heads, q_length, k_length] - attention_scores = matmul_result.view(output_size[0], - output_size[1], - output_size[2], - -1) + attention_scores = matmul_result.view(output_size[0], output_size[1], output_size[2], -1) - offset = dist.get_rank( - ) * self.num_attention_heads_per_partition if dist.is_initialized() else 0 - attention_probs = self.softmax_func( - attn_scores=attention_scores, - attn_mask=((1 - input_mask).half() * minus_inf), - alibi=alibi, - triangular=(self.config.triangular_masking - and (attention_scores.shape[-2] > 1)), - recompute=False, - local_attention=False, - window_size=1, - async_op=False, - layer_scale=1 / (self.norm_factor * self.norm_factor), - head_offset=offset) + offset = dist.get_rank() * self.num_attention_heads_per_partition if dist.is_initialized() else 0 + attention_probs = self.softmax_func(attn_scores=attention_scores, + attn_mask=((1 - input_mask).half() * minus_inf), + alibi=alibi, + triangular=(self.config.triangular_masking + and (attention_scores.shape[-2] > 1)), + recompute=False, + local_attention=False, + window_size=1, + async_op=False, + layer_scale=1 / (self.norm_factor * self.norm_factor), + head_offset=offset) # change view [batch_size x num_heads, q_length, k_length] attention_probs_reshaped = attention_probs.view(*matmul_result.shape) @@ -263,10 +232,8 @@ class BloomSelfAttention(DeepSpeedSelfAttention): # change view [batch_size, num_heads, q_length, head_dim] context_layer = context_layer.view( - context_layer.size(0) // self.num_attention_heads_per_partition, - self.num_attention_heads_per_partition, - context_layer.size(1), - context_layer.shape[-1]) + context_layer.size(0) // self.num_attention_heads_per_partition, self.num_attention_heads_per_partition, + context_layer.size(1), context_layer.shape[-1]) context_layer = self._transpose_for_context(context_layer) key_layer = presents[0] diff --git a/deepspeed/ops/transformer/inference/ds_mlp.py b/deepspeed/ops/transformer/inference/ds_mlp.py index 02d492d8e..1f398a164 100644 --- a/deepspeed/ops/transformer/inference/ds_mlp.py +++ b/deepspeed/ops/transformer/inference/ds_mlp.py @@ -11,26 +11,17 @@ from .op_binding import MLPGemmOp, VectorMatMulOp, GELUGemmOp, ResidualAddOp class DeepSpeedMLP(nn.Module): - def __init__(self, - config, - mp_group=None, - q_scales=None, - q_groups=1, - merge_count=1, - mlp_extra_grouping=False): + + def __init__(self, config, mp_group=None, q_scales=None, q_groups=1, merge_count=1, mlp_extra_grouping=False): super(DeepSpeedMLP, self).__init__() self.config = config data_type = torch.int8 if config.q_int8 else torch.half if config.fp16 else torch.float data_type_fp = torch.half if config.fp16 else torch.float device = get_accelerator().current_device_name() - self.attn_nw = nn.Parameter(torch.empty(self.config.hidden_size, - dtype=data_type_fp, - device=device), + self.attn_nw = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type_fp, device=device), requires_grad=False) - self.attn_nb = nn.Parameter(torch.empty(self.config.hidden_size, - dtype=data_type_fp, - device=device), + self.attn_nb = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type_fp, device=device), requires_grad=False) intm_size_per_partition = self.config.intermediate_size // self.config.mp_size self.inter_w = nn.Parameter(torch.empty(self.config.hidden_size, @@ -38,18 +29,14 @@ class DeepSpeedMLP(nn.Module): dtype=data_type, 
device=device), requires_grad=False) - self.inter_b = nn.Parameter(torch.empty(intm_size_per_partition, - dtype=data_type_fp, - device=device), + self.inter_b = nn.Parameter(torch.empty(intm_size_per_partition, dtype=data_type_fp, device=device), requires_grad=False) self.output_w = nn.Parameter(torch.empty(intm_size_per_partition, self.config.hidden_size, dtype=data_type, device=device), requires_grad=False) - self.output_b = nn.Parameter(torch.empty(self.config.hidden_size, - dtype=data_type_fp, - device=device), + self.output_b = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type_fp, device=device), requires_grad=False) # used for quantization @@ -79,14 +66,13 @@ class DeepSpeedMLP(nn.Module): bias=self.inter_b, gamma=self.attn_nw, beta=self.attn_nb) - residual = self.residual_add_func( - hidden_state=output, - residual=residual, - attention_output=input, - attention_bias=bias if bias is not None else self.output_b, - final_bias=self.output_b, - add_bias=bias is not None, - residual_add=residual_add) + residual = self.residual_add_func(hidden_state=output, + residual=residual, + attention_output=input, + attention_bias=bias if bias is not None else self.output_b, + final_bias=self.output_b, + add_bias=bias is not None, + residual_add=residual_add) if self.mp_group is not None and dist.get_world_size(group=self.mp_group) > 1: dist.all_reduce(residual, group=self.mp_group) diff --git a/deepspeed/ops/transformer/inference/moe_inference.py b/deepspeed/ops/transformer/inference/moe_inference.py index d5e45c7eb..0fcb00846 100644 --- a/deepspeed/ops/transformer/inference/moe_inference.py +++ b/deepspeed/ops/transformer/inference/moe_inference.py @@ -43,6 +43,7 @@ class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig): scale_attention: If true, both q and k are scaled by 1/sqrt(attention_heads) before attention computation. 
return_tuple: if True, returns the transformer output as a tuple, otherwise returns as a tensor """ + def __init__(self, hidden_size=-1, intermediate_size=-1, @@ -72,23 +73,10 @@ class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig): mlp_type='standard', scale_attn_by_inverse_layer_idx=False): super(DeepSpeedMoEInferenceConfig, - self).__init__( - hidden_size, - (intermediate_size if intermediate_size > 0 else 4 * hidden_size), - heads, - num_hidden_layers, - layer_norm_eps, - local_rank, - mp_size, - fp16, - q_int8, - pre_layer_norm, - stochastic_mode, - scale_attention, - triangular_masking, - local_attention, - window_size, - return_tuple) + self).__init__(hidden_size, (intermediate_size if intermediate_size > 0 else 4 * hidden_size), heads, + num_hidden_layers, layer_norm_eps, local_rank, mp_size, fp16, q_int8, pre_layer_norm, + stochastic_mode, scale_attention, triangular_masking, local_attention, window_size, + return_tuple) self.moe_experts = moe_experts self.k = k self.capacity_factor = capacity_factor @@ -116,44 +104,21 @@ class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig): class DeepSpeedMLPFunction(Function): + @staticmethod - def forward(ctx, - input, - inter_w, - inter_b, - config, - output_b, - output_w, - q_scales, - q_groups, - merge_count, - mp_group, + def forward(ctx, input, inter_w, inter_b, config, output_b, output_w, q_scales, q_groups, merge_count, mp_group, async_op): if config.q_int8: - intermediate = inference_cuda_module.fused_gemm_gelu_int8( - input, - inter_w, - inter_b, - config.epsilon, - q_scales[2], - (q_groups * (2**merge_count)), - config.pre_layer_norm) - output = inference_cuda_module.vector_matmul_int8(intermediate, - output_w, - q_scales[3], - q_groups, + intermediate = inference_cuda_module.fused_gemm_gelu_int8(input, inter_w, inter_b, config.epsilon, + q_scales[2], (q_groups * (2**merge_count)), + config.pre_layer_norm) + output = inference_cuda_module.vector_matmul_int8(intermediate, output_w, q_scales[3], q_groups, (merge_count)) else: mlp_gemm_func = inference_cuda_module.fused_gemm_gelu_fp16 if config.fp16 else \ inference_cuda_module.fused_gemm_gelu_fp32 - output = mlp_gemm_func(input, - inter_w, - inter_b, - output_w, - config.epsilon, - config.pre_layer_norm, - async_op) + output = mlp_gemm_func(input, inter_w, inter_b, output_w, config.epsilon, config.pre_layer_norm, async_op) if mp_group is not None and dist.get_world_size(group=mp_group) > 1: dist.all_reduce(output, group=mp_group, async_op=async_op) @@ -166,24 +131,17 @@ class DeepSpeedMLPFunction(Function): class DeepSpeedMoEMLP(nn.Module): - def __init__(self, - config, - q_scales=None, - q_groups=1, - merge_count=1, - mlp_extra_grouping=False, - mp_group=None): + + def __init__(self, config, q_scales=None, q_groups=1, merge_count=1, mlp_extra_grouping=False, mp_group=None): super(DeepSpeedMoEMLP, self).__init__() self.config = config self.attn_nw = nn.Parameter(torch.Tensor(self.config.hidden_size)) self.attn_nb = nn.Parameter(torch.Tensor(self.config.hidden_size)) - interm_size = self.config.intermediate_size // ( - 1 if mp_group is None else dist.get_world_size(group=mp_group)) + interm_size = self.config.intermediate_size // (1 if mp_group is None else dist.get_world_size(group=mp_group)) self.inter_w = nn.Parameter(torch.Tensor(self.config.hidden_size, interm_size)) self.inter_b = nn.Parameter(torch.Tensor(interm_size)) - self.output_w = nn.Parameter(torch.Tensor((interm_size), - self.config.hidden_size)) + self.output_w = nn.Parameter(torch.Tensor((interm_size), 
self.config.hidden_size)) self.output_b = nn.Parameter(torch.Tensor(self.config.hidden_size)) # used for quantization @@ -193,17 +151,8 @@ class DeepSpeedMoEMLP(nn.Module): self.mp_group = mp_group def forward(self, input, async_op=False): - return DeepSpeedMLPFunction.apply(input, - self.inter_w, - self.inter_b, - self.config, - self.output_b, - self.output_w, - self.q_scales, - self.q_groups, - self.merge_count, - self.mp_group, - async_op) + return DeepSpeedMLPFunction.apply(input, self.inter_w, self.inter_b, self.config, self.output_b, self.output_w, + self.q_scales, self.q_groups, self.merge_count, self.mp_group, async_op) class DeepSpeedMoEInference(nn.Module): @@ -251,11 +200,7 @@ class DeepSpeedMoEInference(nn.Module): self.config.specialized_mode = specialized_mode DeepSpeedMoEInference.layer_id += 1 - self.attention = DeepSpeedSelfAttention(self.config, - mp_group, - quantize_scales, - quantize_groups, - merge_count) + self.attention = DeepSpeedSelfAttention(self.config, mp_group, quantize_scales, quantize_groups, merge_count) self.attn_nw = nn.Parameter(torch.Tensor(self.config.hidden_size)) self.attn_nb = nn.Parameter(torch.Tensor(self.config.hidden_size)) @@ -263,11 +208,7 @@ class DeepSpeedMoEInference(nn.Module): self.norm_b = nn.Parameter(torch.Tensor(self.config.hidden_size)) if config.mlp_type == 'residual': - self.res_mlp = DeepSpeedMoEMLP(config, - quantize_scales, - quantize_groups, - merge_count, - mlp_extra_grouping, + self.res_mlp = DeepSpeedMoEMLP(config, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping, mp_group) self.res_coef = nn.Parameter(torch.Tensor(self.config.hidden_size, 2)) self.coef_func = inference_cuda_module.softmax_fp16 if self.config.fp16 or self.config.q_int8 else \ @@ -277,21 +218,12 @@ class DeepSpeedMoEInference(nn.Module): config.mp_size = 1 self.mlp = nn.ModuleList( - DeepSpeedMoEMLP(config, - quantize_scales, - quantize_groups, - merge_count, - mlp_extra_grouping, - expert_mp_group) for i in range(self.config.moe_experts)) + DeepSpeedMoEMLP(config, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping, expert_mp_group) + for i in range(self.config.moe_experts)) - self.moe_gate = TopKGate(self.config.hidden_size, - self.config.global_experts, - self.config.k, - self.config.capacity_factor, - self.config.eval_capacity_factor, - self.config.min_capacity, - self.config.noisy_gate_policy, - self.config.drop_tokens, + self.moe_gate = TopKGate(self.config.hidden_size, self.config.global_experts, self.config.k, + self.config.capacity_factor, self.config.eval_capacity_factor, + self.config.min_capacity, self.config.noisy_gate_policy, self.config.drop_tokens, self.config.use_rts) self.ep_group = ep_group @@ -315,19 +247,14 @@ class DeepSpeedMoEInference(nn.Module): _, combined_weights, dispatch_mask, _ = self.moe_gate( attention_output.view(-1, self.config.hidden_size), None, - ) - dispatched_attention = self.einsum_sec_sm_ecm( - dispatch_mask.type_as(attention_output), - attention_output.view(-1, - self.config.hidden_size)) + ) + dispatched_attention = self.einsum_sec_sm_ecm(dispatch_mask.type_as(attention_output), + attention_output.view(-1, self.config.hidden_size)) return dispatched_attention, combined_weights def expert_exec(self, dispatched_input): - dispatched_input = dispatched_input.reshape( - self.config.global_experts // self.config.moe_experts, - self.config.moe_experts, - -1, - self.config.hidden_size) + dispatched_input = dispatched_input.reshape(self.config.global_experts // self.config.moe_experts, + 
self.config.moe_experts, -1, self.config.hidden_size) chunks = dispatched_input.chunk(self.config.moe_experts, dim=1) expert_outputs = torch.empty(( @@ -337,29 +264,22 @@ class DeepSpeedMoEInference(nn.Module): dtype=dispatched_input.dtype, device=dispatched_input.device) for chunk, expert in zip(chunks, range(len(self.mlp))): - expert_outputs[expert] = self.mlp[expert](chunk.view( - -1, - dispatched_input.shape[-2], - dispatched_input.shape[-1])) + expert_outputs[expert] = self.mlp[expert](chunk.view(-1, dispatched_input.shape[-2], + dispatched_input.shape[-1])) return expert_outputs def _alltoall(self, dispatched_attention): if dist.get_world_size(group=self.ep_group) > 1: dispatched_input = torch.empty_like(dispatched_attention) - dist.all_to_all_single(dispatched_input, - dispatched_attention, - group=self.ep_group) + dist.all_to_all_single(dispatched_input, dispatched_attention, group=self.ep_group) return dispatched_input else: return dispatched_attention def scale_expert_output(self, attention_output, expert_output, combined_weights): combined_output = torch.matmul( - combined_weights.type_as(attention_output).reshape( - combined_weights.shape[0], - -1), - expert_output.reshape(-1, - expert_output.shape[-1])) + combined_weights.type_as(attention_output).reshape(combined_weights.shape[0], -1), + expert_output.reshape(-1, expert_output.shape[-1])) return combined_output.reshape(attention_output.shape) def forward(self, @@ -385,16 +305,9 @@ class DeepSpeedMoEInference(nn.Module): input = input.half() with torch.no_grad(): - attention_output = self.attention(input, - input_mask, - head_mask, - layer_past, - get_present, - encoder_hidden_states, - encoder_attention_mask, - output_attentions, - self.norm_w, - self.norm_b) + attention_output = self.attention(input, input_mask, head_mask, layer_past, get_present, + encoder_hidden_states, encoder_attention_mask, output_attentions, + self.norm_w, self.norm_b) if get_present: attention_output, p_key, p_value = attention_output[0:3] @@ -405,10 +318,7 @@ class DeepSpeedMoEInference(nn.Module): attention_output = attention_output[0] residual_add = attention_output + self.attention.attn_ob - attention_output = self.ds_layernorm(residual_add, - self.attn_nw, - self.attn_nb, - self.config.epsilon) + attention_output = self.ds_layernorm(residual_add, self.attn_nw, self.attn_nb, self.config.epsilon) if self.config.mlp_type == 'residual': res_mlp_out = self.res_mlp(attention_output, async_op=True) @@ -416,13 +326,10 @@ class DeepSpeedMoEInference(nn.Module): if self.expert_mp_group is not None: tensor_list = [ - torch.empty_like(attention_output) - for _ in range(dist.get_world_size(group=self.expert_mp_group)) + torch.empty_like(attention_output) for _ in range(dist.get_world_size(group=self.expert_mp_group)) ] tensor_list[dist.get_rank(group=self.expert_mp_group)] = attention_output - dist.all_gather(tensor_list, - attention_output, - group=self.expert_mp_group) + dist.all_gather(tensor_list, attention_output, group=self.expert_mp_group) attention_output = torch.cat(tensor_list).contiguous() ############## MoE Gating + Experts ############### @@ -430,14 +337,11 @@ class DeepSpeedMoEInference(nn.Module): dispatched_input = self._alltoall(dispatched_attention) expert_outputs = self.expert_exec(dispatched_input) expert_output = self._alltoall(expert_outputs) - output = self.scale_expert_output(attention_output, - expert_output, - combined_weights) + output = self.scale_expert_output(attention_output, expert_output, combined_weights) 
################################################ if self.expert_mp_group is not None: - output = output.split(output.shape[0] // - dist.get_world_size(group=self.expert_mp_group), + output = output.split(output.shape[0] // dist.get_world_size(group=self.expert_mp_group), dim=0)[dist.get_rank(group=self.expert_mp_group)] if self.config.mlp_type == 'residual': @@ -446,10 +350,7 @@ class DeepSpeedMoEInference(nn.Module): output = self.bias_residual_func(output, residual_add, torch.empty(1)) if not self.config.pre_layer_norm: - output = self.ds_layernorm(output, - self.norm_w, - self.norm_b, - self.config.epsilon) + output = self.ds_layernorm(output, self.norm_w, self.norm_b, self.config.epsilon) if input_type != output.dtype: output = output.to(input_type) diff --git a/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py b/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py index 9ab4ef926..d5ecc54d4 100644 --- a/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py +++ b/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py @@ -6,6 +6,7 @@ from .base import BaseOp class GELUGemmOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): super(GELUGemmOp, self).__init__(config) if self.config.fp16: @@ -19,14 +20,6 @@ class GELUGemmOp(BaseOp): bias: torch.Tensor, weight_out: torch.Tensor, async_op: bool = False): - output = self.fused_gemm_gelu(input, - weight, - weight.scale, - bias, - weight_out, - weight_out.scale, - self.config.epsilon, - self.config.pre_layer_norm, - self.config.q_int8, - async_op) + output = self.fused_gemm_gelu(input, weight, weight.scale, bias, weight_out, weight_out.scale, + self.config.epsilon, self.config.pre_layer_norm, self.config.q_int8, async_op) return output diff --git a/deepspeed/ops/transformer/inference/op_binding/linear.py b/deepspeed/ops/transformer/inference/op_binding/linear.py index 6d83ffce2..f08268ef0 100644 --- a/deepspeed/ops/transformer/inference/op_binding/linear.py +++ b/deepspeed/ops/transformer/inference/op_binding/linear.py @@ -6,6 +6,7 @@ from .base import BaseOp class LinearOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): super(LinearOp, self).__init__(config) if self.config.fp16: @@ -22,10 +23,5 @@ class LinearOp(BaseOp): num_heads: int, external_cache: bool = None, num_layers: int = None): - qkv_out = self.linear_func(input, - weight, - bias, - add_bias, - do_flash_attn, - num_heads) + qkv_out = self.linear_func(input, weight, bias, add_bias, do_flash_attn, num_heads) return qkv_out diff --git a/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py b/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py index 4df8ef52c..c89c02202 100644 --- a/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py +++ b/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py @@ -6,6 +6,7 @@ from .base import BaseOp class MLPGemmOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): super(MLPGemmOp, self).__init__(config) if self.config.fp16: @@ -13,29 +14,11 @@ class MLPGemmOp(BaseOp): else: self.mlp_gemm_func = self.inference_cuda_module.mlp_gemm_fp32 - def forward(self, - input: torch.Tensor, - residual: torch.Tensor, - input_bias: torch.Tensor, - weight_interm: torch.Tensor, - weight_out: torch.Tensor, - bias: torch.Tensor, - gamma: torch.Tensor, + def forward(self, input: torch.Tensor, residual: torch.Tensor, input_bias: torch.Tensor, + weight_interm: torch.Tensor, weight_out: torch.Tensor, bias: torch.Tensor, gamma: torch.Tensor, beta: torch.Tensor): - output, 
residual_add = self.mlp_gemm_func( - input, - residual, - input_bias, - weight_interm, - weight_out, - bias, - gamma, - beta, - self.config.epsilon, - self.config.pre_layer_norm, - self.config.mlp_after_attn, - weight_interm.scale, - weight_out.scale, - self.config.q_int8, - self.config.mlp_act_func_type) + output, residual_add = self.mlp_gemm_func(input, residual, input_bias, weight_interm, weight_out, bias, gamma, + beta, self.config.epsilon, self.config.pre_layer_norm, + self.config.mlp_after_attn, weight_interm.scale, weight_out.scale, + self.config.q_int8, self.config.mlp_act_func_type) return output, residual_add diff --git a/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py b/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py index d50dbfd3e..cf1b979dc 100644 --- a/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py +++ b/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py @@ -7,6 +7,7 @@ from deepspeed import comm as dist class QKVGemmOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): super(QKVGemmOp, self).__init__(config) if self.config.fp16: @@ -28,17 +29,6 @@ class QKVGemmOp(BaseOp): external_cache = self.config.bigscience_bloom rank = dist.get_rank() if dist.is_initialized() else 0 q_int8 = self.config.q_int8 - output = self.qkv_gemm_func(input, - weight, - q_scale, - bias, - gamma, - beta, - self.config.epsilon, - add_bias, - num_layers, - external_cache, - self.config.mp_size, - rank, - q_int8) + output = self.qkv_gemm_func(input, weight, q_scale, bias, gamma, beta, self.config.epsilon, add_bias, + num_layers, external_cache, self.config.mp_size, rank, q_int8) return output diff --git a/deepspeed/ops/transformer/inference/op_binding/residual_add.py b/deepspeed/ops/transformer/inference/op_binding/residual_add.py index 0fb1741a0..808ead4c0 100644 --- a/deepspeed/ops/transformer/inference/op_binding/residual_add.py +++ b/deepspeed/ops/transformer/inference/op_binding/residual_add.py @@ -6,6 +6,7 @@ from .base import BaseOp class ResidualAddOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): super(ResidualAddOp, self).__init__(config) if self.config.fp16 or self.config.q_int8: @@ -13,26 +14,13 @@ class ResidualAddOp(BaseOp): else: self.residual_add_func = self.inference_cuda_module.residual_add_bias_fp32 - def forward(self, - hidden_state: torch.Tensor, - residual: torch.Tensor, - attention_output: torch.Tensor, - attention_bias: torch.Tensor, - final_bias: torch.Tensor, - add_bias: bool, - residual_add: torch.Tensor): + def forward(self, hidden_state: torch.Tensor, residual: torch.Tensor, attention_output: torch.Tensor, + attention_bias: torch.Tensor, final_bias: torch.Tensor, add_bias: bool, residual_add: torch.Tensor): if not self.config.pre_layer_norm and residual_add is not None: # only use residual add if its set and we are not pre layer norm residual = residual_add - self.residual_add_func(hidden_state, - residual, - attention_output, - attention_bias, - final_bias, - self.config.mp_size, - self.config.mlp_after_attn, - add_bias, - self.config.pre_layer_norm) + self.residual_add_func(hidden_state, residual, attention_output, attention_bias, final_bias, + self.config.mp_size, self.config.mlp_after_attn, add_bias, self.config.pre_layer_norm) return residual diff --git a/deepspeed/ops/transformer/inference/op_binding/softmax.py b/deepspeed/ops/transformer/inference/op_binding/softmax.py index 4d58ba4a4..1c11eab0d 100644 --- a/deepspeed/ops/transformer/inference/op_binding/softmax.py +++ 
b/deepspeed/ops/transformer/inference/op_binding/softmax.py @@ -6,6 +6,7 @@ from .base import BaseOp class SoftmaxOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): super(SoftmaxOp, self).__init__(config) if self.config.fp16: @@ -16,26 +17,9 @@ class SoftmaxOp(BaseOp): def _not_implemented(self, *args, **kwargs): raise NotImplementedError - def forward(self, - attn_scores: torch.Tensor, - attn_mask: torch.Tensor, - alibi: torch.Tensor, - triangular: bool, - recompute: bool, - local_attention: bool, - window_size: int, - async_op: bool, - layer_scale: float, + def forward(self, attn_scores: torch.Tensor, attn_mask: torch.Tensor, alibi: torch.Tensor, triangular: bool, + recompute: bool, local_attention: bool, window_size: int, async_op: bool, layer_scale: float, head_offset: int): - output = self.softmax_func(attn_scores, - attn_mask, - alibi, - triangular, - recompute, - local_attention, - window_size, - async_op, - layer_scale, - head_offset, - self.config.mp_size) + output = self.softmax_func(attn_scores, attn_mask, alibi, triangular, recompute, local_attention, window_size, + async_op, layer_scale, head_offset, self.config.mp_size) return output diff --git a/deepspeed/ops/transformer/inference/op_binding/softmax_context.py b/deepspeed/ops/transformer/inference/op_binding/softmax_context.py index 818af5f34..b2ebbe137 100644 --- a/deepspeed/ops/transformer/inference/op_binding/softmax_context.py +++ b/deepspeed/ops/transformer/inference/op_binding/softmax_context.py @@ -7,6 +7,7 @@ from .base import BaseOp class SoftmaxContextOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): super(SoftmaxContextOp, self).__init__(config) if self.config.fp16: @@ -14,15 +15,8 @@ class SoftmaxContextOp(BaseOp): else: self.softmax_context_func = self.inference_cuda_module.softmax_context_fp32 - def forward(self, - query_key_value: torch.Tensor, - attn_mask: torch.Tensor, - heads: int, - norm_factor: float, - no_masking: bool, - layer_id: int, - num_layers: int, - alibi: torch.Tensor): + def forward(self, query_key_value: torch.Tensor, attn_mask: torch.Tensor, heads: int, norm_factor: float, + no_masking: bool, layer_id: int, num_layers: int, alibi: torch.Tensor): if alibi is not None: batch_heads = query_key_value.shape[0] * heads @@ -31,18 +25,8 @@ class SoftmaxContextOp(BaseOp): else: alibi = torch.empty(1) - output = self.softmax_context_func(query_key_value, - attn_mask, - self.config.rotary_dim, - self.config.rotate_half, - self.config.rotate_every_two, - heads, - norm_factor, - self.config.triangular_masking, - self.config.local_attention, - self.config.window_size, - no_masking, - layer_id, - num_layers, - alibi) + output = self.softmax_context_func(query_key_value, attn_mask, self.config.rotary_dim, self.config.rotate_half, + self.config.rotate_every_two, heads, norm_factor, + self.config.triangular_masking, self.config.local_attention, + self.config.window_size, no_masking, layer_id, num_layers, alibi) return output diff --git a/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py b/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py index c22b61cbd..f29398554 100644 --- a/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py +++ b/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py @@ -6,6 +6,7 @@ from .base import BaseOp class VectorMatMulOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): super(VectorMatMulOp, self).__init__(config) if self.config.fp16: diff --git 
a/deepspeed/ops/transformer/inference/triton_ops.py b/deepspeed/ops/transformer/inference/triton_ops.py index 45c6db66d..8922e495c 100644 --- a/deepspeed/ops/transformer/inference/triton_ops.py +++ b/deepspeed/ops/transformer/inference/triton_ops.py @@ -102,6 +102,7 @@ def _fwd_kernel( class triton_flash_attn(torch.nn.Module): + def __init__(self, ): super(triton_flash_attn, self).__init__() @@ -111,10 +112,7 @@ class triton_flash_attn(torch.nn.Module): Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] o = torch.empty_like(q) grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1]) - tmp = torch.empty((q.shape[0] * q.shape[1], - q.shape[2]), - device=q.device, - dtype=torch.float32) + tmp = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) num_warps = 4 if Lk <= 64 else 8 _fwd_kernel[grid]( diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index ae627304f..34b8e566f 100755 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -15,15 +15,9 @@ stochastic_transformer_cuda_module = None class TransformerConfig(): - def __init__(self, - batch_size, - hidden_size, - intermediate_size, - heads, - attn_dropout_ratio, - hidden_dropout_ratio, - num_hidden_layers, - initializer_range): + + def __init__(self, batch_size, hidden_size, intermediate_size, heads, attn_dropout_ratio, hidden_dropout_ratio, + num_hidden_layers, initializer_range): self.layer_id = -1 self.batch_size = batch_size self.hidden_size = hidden_size @@ -89,6 +83,7 @@ class DeepSpeedTransformerConfig(TransformerConfig): training: Enable for training rather than inference. """ + def __init__(self, batch_size=-1, hidden_size=-1, @@ -111,15 +106,9 @@ class DeepSpeedTransformerConfig(TransformerConfig): return_tuple=False, training=True): super(DeepSpeedTransformerConfig, - self).__init__( - batch_size, - hidden_size, - (intermediate_size if intermediate_size > 0 else 4 * hidden_size), - heads, - attn_dropout_ratio, - hidden_dropout_ratio, - num_hidden_layers, - initializer_range) + self).__init__(batch_size, hidden_size, + (intermediate_size if intermediate_size > 0 else 4 * hidden_size), heads, + attn_dropout_ratio, hidden_dropout_ratio, num_hidden_layers, initializer_range) self.fp16 = fp16 self.pre_layer_norm = pre_layer_norm self.local_rank = local_rank @@ -150,97 +139,42 @@ class DeepSpeedTransformerConfig(TransformerConfig): class DeepSpeedTransformerFunction(Function): + @staticmethod - def forward(ctx, - input, - input_mask, - self, - grads, - layer_id, - attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob, - attn_nw, - attn_nb, - inter_w, - inter_b, - output_w, - output_b, - norm_w, - norm_b, - config): + def forward(ctx, input, input_mask, self, grads, layer_id, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, + attn_nb, inter_w, inter_b, output_w, output_b, norm_w, norm_b, config): cuda_module = stochastic_transformer_cuda_module if config.stochastic_mode else transformer_cuda_module forward_func = cuda_module.forward_fp16 if config.fp16 else cuda_module.forward_fp32 inp_size = input.size() if inp_size[1] % 16 != 0: - input = torch.cat((input, - torch.randn((inp_size[0], - (16 - (inp_size[1] % 16)), - inp_size[2]), - device=input.device, - dtype=input.dtype)), - 1) + input = torch.cat( + (input, + torch.randn( + (inp_size[0], (16 - (inp_size[1] % 16)), inp_size[2]), device=input.device, dtype=input.dtype)), + 1) input_mask = torch.cat((input_mask, torch.ones((inp_size[0], input_mask.shape[1], 
input_mask.shape[2], \ (16 - (inp_size[1] % 16))), device=input_mask.device, dtype=input_mask.dtype) * -10000), 3) - (output, - inp_norm, - qkv_tf, - soft_inp, - ctx_bufB, - attn_o_inp, - add_res, - ff1_inp, - gelu_inp, - ff2_inp, - attn_prob_dropout_mask, - attn_output_dropout_mask, - layer_output_dropout_mask, - attn_layer_norm_var, - attn_layer_norm_mean, - layer_norm_var, - layer_norm_mean) = forward_func(config.layer_id, - input, - input_mask, - attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob, - attn_nw, - attn_nb, - inter_w, - inter_b, - output_w, - output_b, - norm_w, - norm_b, - config.training and config.is_grad_enabled, - config.pre_layer_norm, - config.attn_dropout_checkpoint, - config.normalize_invertible, - config.gelu_checkpoint) + (output, inp_norm, qkv_tf, soft_inp, ctx_bufB, attn_o_inp, add_res, ff1_inp, gelu_inp, ff2_inp, + attn_prob_dropout_mask, attn_output_dropout_mask, layer_output_dropout_mask, attn_layer_norm_var, + attn_layer_norm_mean, layer_norm_var, layer_norm_mean) = forward_func( + config.layer_id, input, input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, + inter_b, output_w, output_b, norm_w, norm_b, config.training and config.is_grad_enabled, + config.pre_layer_norm, config.attn_dropout_checkpoint, config.normalize_invertible, + config.gelu_checkpoint) # For testing only. if grads is not None: for i in [2]: - attn_qkvw.register_hook( - lambda x, - i=i, - self=self: grads.append([ - x[i * attn_ow.size(0):(i + 1) * attn_ow.size(0)], - ("Q_W" if i == 0 else "K_W" if i == 1 else "V_W") - ])) + attn_qkvw.register_hook(lambda x, i=i, self=self: grads.append([ + x[i * attn_ow.size(0):(i + 1) * attn_ow.size(0)], ("Q_W" if i == 0 else "K_W" if i == 1 else "V_W") + ])) for i in [2]: - attn_qkvb.register_hook( - lambda x, - i=i, - self=self: grads.append([ - x[i * attn_ow.size(0):(i + 1) * attn_ow.size(0)], - ("Q_B" if i == 0 else "K_B" if i == 1 else "V_B") - ])) + attn_qkvb.register_hook(lambda x, i=i, self=self: grads.append([ + x[i * attn_ow.size(0):(i + 1) * attn_ow.size(0)], ("Q_B" if i == 0 else "K_B" if i == 1 else "V_B") + ])) attn_ow.register_hook(lambda x, self=self: grads.append([x, "O_W"])) attn_ob.register_hook(lambda x, self=self: grads.append([x, "O_B"])) @@ -255,35 +189,11 @@ class DeepSpeedTransformerFunction(Function): if config.is_grad_enabled and config.training: if (config.pre_layer_norm and config.normalize_invertible): - ctx.save_for_backward(input_mask, - attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob, - attn_nw, - attn_nb, - inter_w, - inter_b, - output_w, - output_b, - norm_w, - norm_b) + ctx.save_for_backward(input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, + inter_b, output_w, output_b, norm_w, norm_b) else: - ctx.save_for_backward(output, - input, - input_mask, - attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob, - attn_nw, - attn_nb, - inter_w, - inter_b, - output_w, - output_b, - norm_w, - norm_b) + ctx.save_for_backward(output, input, input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, + attn_nb, inter_w, inter_b, output_w, output_b, norm_w, norm_b) ctx.config = config if (config.pre_layer_norm or not config.normalize_invertible): @@ -331,88 +241,28 @@ class DeepSpeedTransformerFunction(Function): assert ctx.config.training if (ctx.config.pre_layer_norm and ctx.config.normalize_invertible): - (input_mask, - attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob, - attn_nw, - attn_nb, - inter_w, - inter_b, - output_w, - output_b, - norm_w, - norm_b) = ctx.saved_tensors + (input_mask, 
attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, inter_b, output_w, + output_b, norm_w, norm_b) = ctx.saved_tensors else: - (output, - input, - input_mask, - attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob, - attn_nw, - attn_nb, - inter_w, - inter_b, - output_w, - output_b, - norm_w, - norm_b) = ctx.saved_tensors + (output, input, input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, inter_b, + output_w, output_b, norm_w, norm_b) = ctx.saved_tensors cuda_module = stochastic_transformer_cuda_module if ctx.config.stochastic_mode else transformer_cuda_module backward_func = cuda_module.backward_fp16 if ctx.config.fp16 else cuda_module.backward_fp32 - (grad_input, - grad_attn_qkvw, - grad_attn_qkvb, - grad_attn_ow, - grad_attn_ob, - grad_attn_nw, - grad_attn_nb, - grad_inter_w, - grad_inter_b, - grad_output_w, - grad_output_b, - grad_norm_w, - grad_norm_b) = backward_func( - ctx.config.layer_id, - grad_output, - (ctx.inp_norm if (ctx.config.pre_layer_norm - and ctx.config.normalize_invertible) else output), - (ctx.inp_norm if (ctx.config.pre_layer_norm - or not ctx.config.normalize_invertible) else input), - ctx.qkv_tf, - ctx.soft_inp, - (ctx.soft_inp if ctx.config.attn_dropout_checkpoint else ctx.ctx_bufB), - ctx.attn_o_inp, - (ctx.ff1_inp if ctx.config.normalize_invertible else ctx.add_res), - ctx.ff1_inp, - (ctx.ff2_inp if ctx.config.gelu_checkpoint else ctx.gelu_inp), - ctx.ff2_inp, - ctx.attn_prob_dropout_mask, - ctx.attn_output_dropout_mask, - ctx.layer_output_dropout_mask, - ctx.attn_layer_norm_var, - ctx.attn_layer_norm_mean, - ctx.layer_norm_var, - ctx.layer_norm_mean, - (ctx.inp_norm if (ctx.config.pre_layer_norm - and ctx.config.normalize_invertible) else input), - input_mask, - attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob, - attn_nw, - attn_nb, - inter_w, - inter_b, - output_w, - output_b, - norm_w, - norm_b) + (grad_input, grad_attn_qkvw, grad_attn_qkvb, grad_attn_ow, grad_attn_ob, grad_attn_nw, grad_attn_nb, + grad_inter_w, grad_inter_b, grad_output_w, grad_output_b, grad_norm_w, grad_norm_b) = backward_func( + ctx.config.layer_id, grad_output, + (ctx.inp_norm if (ctx.config.pre_layer_norm and ctx.config.normalize_invertible) else output), + (ctx.inp_norm if (ctx.config.pre_layer_norm or not ctx.config.normalize_invertible) else input), + ctx.qkv_tf, ctx.soft_inp, (ctx.soft_inp if ctx.config.attn_dropout_checkpoint else ctx.ctx_bufB), + ctx.attn_o_inp, (ctx.ff1_inp if ctx.config.normalize_invertible else ctx.add_res), ctx.ff1_inp, + (ctx.ff2_inp if ctx.config.gelu_checkpoint else ctx.gelu_inp), ctx.ff2_inp, ctx.attn_prob_dropout_mask, + ctx.attn_output_dropout_mask, ctx.layer_output_dropout_mask, ctx.attn_layer_norm_var, + ctx.attn_layer_norm_mean, ctx.layer_norm_var, ctx.layer_norm_mean, + (ctx.inp_norm if + (ctx.config.pre_layer_norm and ctx.config.normalize_invertible) else input), input_mask, attn_qkvw, + attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, inter_b, output_w, output_b, norm_w, norm_b) # This appears to be an effective way to release context memory ctx.qkv_tf = None @@ -436,24 +286,9 @@ class DeepSpeedTransformerFunction(Function): if grad_output_shape[1] % 16 != 0: grad_input = torch.narrow(grad_input, 1, 0, grad_output_shape[1]) - return (grad_input, - None, - None, - None, - None, - grad_attn_qkvw, - grad_attn_qkvb, - grad_attn_ow, - grad_attn_ob, - grad_attn_nw, - grad_attn_nb, - grad_inter_w, - grad_inter_b, - grad_output_w, - grad_output_b, - grad_norm_w, - grad_norm_b, - None) + return (grad_input, None, None, 
None, None, grad_attn_qkvw, grad_attn_qkvb, grad_attn_ow, grad_attn_ob, + grad_attn_nw, grad_attn_nb, grad_inter_w, grad_inter_b, grad_output_w, grad_output_b, grad_norm_w, + grad_norm_b, None) class DeepSpeedTransformerLayer(nn.Module): @@ -484,23 +319,15 @@ class DeepSpeedTransformerLayer(nn.Module): get_accelerator().set_device(self.config.local_rank) if initial_weights is None and initial_biases is None: - self.attn_qkvw = nn.Parameter( - torch.Tensor(self.config.hidden_size * 3, - self.config.hidden_size)) + self.attn_qkvw = nn.Parameter(torch.Tensor(self.config.hidden_size * 3, self.config.hidden_size)) self.attn_qkvb = nn.Parameter(torch.Tensor(self.config.hidden_size * 3)) - self.attn_ow = nn.Parameter( - torch.Tensor(self.config.hidden_size, - self.config.hidden_size)) + self.attn_ow = nn.Parameter(torch.Tensor(self.config.hidden_size, self.config.hidden_size)) self.attn_ob = nn.Parameter(torch.Tensor(self.config.hidden_size)) self.attn_nw = nn.Parameter(torch.Tensor(self.config.hidden_size)) self.attn_nb = nn.Parameter(torch.Tensor(self.config.hidden_size)) - self.inter_w = nn.Parameter( - torch.Tensor(self.config.intermediate_size, - self.config.hidden_size)) + self.inter_w = nn.Parameter(torch.Tensor(self.config.intermediate_size, self.config.hidden_size)) self.inter_b = nn.Parameter(torch.Tensor(self.config.intermediate_size)) - self.output_w = nn.Parameter( - torch.Tensor(self.config.hidden_size, - self.config.intermediate_size)) + self.output_w = nn.Parameter(torch.Tensor(self.config.hidden_size, self.config.intermediate_size)) self.output_b = nn.Parameter(torch.Tensor(self.config.hidden_size)) self.norm_w = nn.Parameter(torch.Tensor(self.config.hidden_size)) self.norm_b = nn.Parameter(torch.Tensor(self.config.hidden_size)) @@ -539,21 +366,11 @@ class DeepSpeedTransformerLayer(nn.Module): cuda_module = stochastic_transformer_cuda_module if self.config.stochastic_mode else transformer_cuda_module create_layer_func = cuda_module.create_transformer_layer_fp16 if self.config.fp16 else cuda_module.create_transformer_layer_fp32 - create_layer_func(self.config.layer_id, - self.config.batch_size, - self.config.hidden_size, - self.config.heads, - self.config.intermediate_size, - self.config.attn_dropout_ratio, - self.config.hidden_dropout_ratio, - self.config.layer_norm_eps, - self.config.seed, - self.config.pre_layer_norm, - self.config.test_gemm, - self.config.attn_dropout_checkpoint, - self.config.normalize_invertible, - self.config.gelu_checkpoint, - self.config.stochastic_mode) + create_layer_func(self.config.layer_id, self.config.batch_size, self.config.hidden_size, self.config.heads, + self.config.intermediate_size, self.config.attn_dropout_ratio, + self.config.hidden_dropout_ratio, self.config.layer_norm_eps, self.config.seed, + self.config.pre_layer_norm, self.config.test_gemm, self.config.attn_dropout_checkpoint, + self.config.normalize_invertible, self.config.gelu_checkpoint, self.config.stochastic_mode) def init_transformer_weights(self, adjust_init_range=False): num_layers = self.config.num_hidden_layers @@ -587,21 +404,7 @@ class DeepSpeedTransformerLayer(nn.Module): grads=None): self.config.is_grad_enabled = torch.is_grad_enabled() self.config.training = self.training - return DeepSpeedTransformerFunction.apply(hidden_states, - attention_mask, - self, - grads, - self.config.layer_id, - self.attn_qkvw, - self.attn_qkvb, - self.attn_ow, - self.attn_ob, - self.attn_nw, - self.attn_nb, - self.inter_w, - self.inter_b, - self.output_w, - self.output_b, - self.norm_w, - 
self.norm_b, - self.config) + return DeepSpeedTransformerFunction.apply(hidden_states, attention_mask, self, grads, self.config.layer_id, + self.attn_qkvw, self.attn_qkvb, self.attn_ow, self.attn_ob, + self.attn_nw, self.attn_nb, self.inter_w, self.inter_b, + self.output_w, self.output_b, self.norm_w, self.norm_b, self.config) diff --git a/deepspeed/profiling/config.py b/deepspeed/profiling/config.py index c22cd453f..5483941d1 100644 --- a/deepspeed/profiling/config.py +++ b/deepspeed/profiling/config.py @@ -9,6 +9,7 @@ from deepspeed.profiling.constants import * class DeepSpeedFlopsProfilerConfig(DeepSpeedConfigObject): + def __init__(self, param_dict): super(DeepSpeedFlopsProfilerConfig, self).__init__() @@ -25,26 +26,18 @@ class DeepSpeedFlopsProfilerConfig(DeepSpeedConfigObject): self._initialize(flops_profiler_dict) def _initialize(self, flops_profiler_dict): - self.enabled = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_ENABLED, - FLOPS_PROFILER_ENABLED_DEFAULT) + self.enabled = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_ENABLED, FLOPS_PROFILER_ENABLED_DEFAULT) - self.profile_step = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_PROFILE_STEP, + self.profile_step = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_PROFILE_STEP, FLOPS_PROFILER_PROFILE_STEP_DEFAULT) - self.module_depth = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_MODULE_DEPTH, + self.module_depth = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_MODULE_DEPTH, FLOPS_PROFILER_MODULE_DEPTH_DEFAULT) - self.top_modules = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_TOP_MODULES, + self.top_modules = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_TOP_MODULES, FLOPS_PROFILER_TOP_MODULES_DEFAULT) - self.detailed = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_DETAILED, - FLOPS_PROFILER_DETAILED_DEFAULT) + self.detailed = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_DETAILED, FLOPS_PROFILER_DETAILED_DEFAULT) - self.output_file = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_OUTPUT_FILE, + self.output_file = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_OUTPUT_FILE, FLOPS_PROFILER_OUTPUT_FILE_DEFAULT) diff --git a/deepspeed/profiling/flops_profiler/profiler.py b/deepspeed/profiling/flops_profiler/profiler.py index b6684f697..0285679b2 100644 --- a/deepspeed/profiling/flops_profiler/profiler.py +++ b/deepspeed/profiling/flops_profiler/profiler.py @@ -53,6 +53,7 @@ class FlopsProfiler(object): Args: object (torch.nn.Module): The PyTorch model to profile. 
""" + def __init__(self, model, ds_engine=None): self.model = model self.ds_engine = ds_engine @@ -78,8 +79,7 @@ class FlopsProfiler(object): # if computing the flops of a module directly if type(module) in MODULE_HOOK_MAPPING: if not hasattr(module, "__flops_handle__"): - module.__flops_handle__ = module.register_forward_hook( - MODULE_HOOK_MAPPING[type(module)]) + module.__flops_handle__ = module.register_forward_hook(MODULE_HOOK_MAPPING[type(module)]) return # if computing the flops of the functionals in a module @@ -105,16 +105,14 @@ class FlopsProfiler(object): module.__start_time__ = time.time() if not hasattr(module, "__start_time_hook_handle"): - module.__start_time_hook_handle__ = module.register_forward_pre_hook( - start_time_hook) + module.__start_time_hook_handle__ = module.register_forward_pre_hook(start_time_hook) def end_time_hook(module, input, output): get_accelerator().synchronize() module.__duration__ += time.time() - module.__start_time__ if not hasattr(module, "__end_time_hook_handle__"): - module.__end_time_hook_handle__ = module.register_forward_hook( - end_time_hook) + module.__end_time_hook_handle__ = module.register_forward_hook(end_time_hook) self.model.apply(partial(register_module_hooks, ignore_list=ignore_list)) self.started = True @@ -154,6 +152,7 @@ class FlopsProfiler(object): Adds or resets the extra attributes. """ + def add_or_reset_attrs(module): module.__flops__ = 0 module.__macs__ = 0 @@ -232,15 +231,9 @@ class FlopsProfiler(object): Returns: The number of parameters in the model. """ - return params_to_string( - self.model.__params__) if as_string else self.model.__params__ + return params_to_string(self.model.__params__) if as_string else self.model.__params__ - def print_model_profile(self, - profile_step=1, - module_depth=-1, - top_modules=1, - detailed=True, - output_file=None): + def print_model_profile(self, profile_step=1, module_depth=-1, top_modules=1, detailed=True, output_file=None): """Prints the model graph with the measured profile attached to each module. 
Args: @@ -273,28 +266,21 @@ class FlopsProfiler(object): self.macs = total_macs self.params = total_params - print( - "\n-------------------------- DeepSpeed Flops Profiler --------------------------" - ) + print("\n-------------------------- DeepSpeed Flops Profiler --------------------------") print(f'Profile Summary at step {profile_step}:') print( "Notations:\ndata parallel size (dp_size), model parallel size(mp_size),\nnumber of parameters (params), number of multiply-accumulate operations(MACs),\nnumber of floating-point operations (flops), floating-point operations per second (FLOPS),\nfwd latency (forward propagation latency), bwd latency (backward propagation latency),\nstep (weights update latency), iter latency (sum of fwd, bwd and step latency)\n" ) if self.ds_engine: print('{:<60} {:<8}'.format('world size: ', self.ds_engine.world_size)) - print('{:<60} {:<8}'.format('data parallel size: ', - self.ds_engine.dp_world_size)) - print('{:<60} {:<8}'.format('model parallel size: ', - self.ds_engine.mp_world_size)) - print('{:<60} {:<8}'.format( - 'batch size per GPU: ', - self.ds_engine.train_micro_batch_size_per_gpu())) + print('{:<60} {:<8}'.format('data parallel size: ', self.ds_engine.dp_world_size)) + print('{:<60} {:<8}'.format('model parallel size: ', self.ds_engine.mp_world_size)) + print('{:<60} {:<8}'.format('batch size per GPU: ', self.ds_engine.train_micro_batch_size_per_gpu())) print('{:<60} {:<8}'.format('params per gpu: ', params_to_string(total_params))) print('{:<60} {:<8}'.format( 'params of model = params per GPU * mp_size: ', - params_to_string(total_params * - ((self.ds_engine.mp_world_size) if self.ds_engine else 1)))) + params_to_string(total_params * ((self.ds_engine.mp_world_size) if self.ds_engine else 1)))) print('{:<60} {:<8}'.format('fwd MACs per GPU: ', macs_to_string(total_macs))) @@ -302,43 +288,33 @@ class FlopsProfiler(object): print('{:<60} {:<8}'.format( 'fwd flops of model = fwd flops per GPU * mp_size: ', - num_to_string(total_flops * - ((self.ds_engine.mp_world_size) if self.ds_engine else 1)))) + num_to_string(total_flops * ((self.ds_engine.mp_world_size) if self.ds_engine else 1)))) fwd_latency = self.get_total_duration() if self.ds_engine and self.ds_engine.wall_clock_breakdown(): fwd_latency = self.ds_engine.timers('forward').elapsed(False) / 1000.0 print('{:<60} {:<8}'.format('fwd latency: ', duration_to_string(fwd_latency))) - print('{:<60} {:<8}'.format( - 'fwd FLOPS per GPU = fwd flops per GPU / fwd latency: ', - flops_to_string(total_flops / fwd_latency))) + print('{:<60} {:<8}'.format('fwd FLOPS per GPU = fwd flops per GPU / fwd latency: ', + flops_to_string(total_flops / fwd_latency))) if self.ds_engine and self.ds_engine.wall_clock_breakdown(): bwd_latency = self.ds_engine.timers('backward').elapsed(False) / 1000.0 step_latency = self.ds_engine.timers('step').elapsed(False) / 1000.0 - print('{:<60} {:<8}'.format('bwd latency: ', - duration_to_string(bwd_latency))) - print('{:<60} {:<8}'.format( - 'bwd FLOPS per GPU = 2 * fwd flops per GPU / bwd latency: ', - flops_to_string(2 * total_flops / bwd_latency))) - print('{:<60} {:<8}'.format( - 'fwd+bwd FLOPS per GPU = 3 * fwd flops per GPU / (fwd+bwd latency): ', - flops_to_string(3 * total_flops / (fwd_latency + bwd_latency)))) + print('{:<60} {:<8}'.format('bwd latency: ', duration_to_string(bwd_latency))) + print('{:<60} {:<8}'.format('bwd FLOPS per GPU = 2 * fwd flops per GPU / bwd latency: ', + flops_to_string(2 * total_flops / bwd_latency))) + print('{:<60} {:<8}'.format('fwd+bwd 
FLOPS per GPU = 3 * fwd flops per GPU / (fwd+bwd latency): ', + flops_to_string(3 * total_flops / (fwd_latency + bwd_latency)))) - print('{:<60} {:<8}'.format('step latency: ', - duration_to_string(step_latency))) + print('{:<60} {:<8}'.format('step latency: ', duration_to_string(step_latency))) iter_latency = fwd_latency + bwd_latency + step_latency - print('{:<60} {:<8}'.format('iter latency: ', - duration_to_string(iter_latency))) - print('{:<60} {:<8}'.format( - 'FLOPS per GPU = 3 * fwd flops per GPU / iter latency: ', - flops_to_string(3 * total_flops / iter_latency))) + print('{:<60} {:<8}'.format('iter latency: ', duration_to_string(iter_latency))) + print('{:<60} {:<8}'.format('FLOPS per GPU = 3 * fwd flops per GPU / iter latency: ', + flops_to_string(3 * total_flops / iter_latency))) - samples_per_iter = self.ds_engine.train_micro_batch_size_per_gpu( - ) * self.ds_engine.world_size - print('{:<60} {:<8.2f}'.format('samples/second: ', - samples_per_iter / iter_latency)) + samples_per_iter = self.ds_engine.train_micro_batch_size_per_gpu() * self.ds_engine.world_size + print('{:<60} {:<8.2f}'.format('samples/second: ', samples_per_iter / iter_latency)) def flops_repr(module): params = module.__params__ @@ -353,9 +329,7 @@ class FlopsProfiler(object): duration = get_module_duration(module) items.append(duration_to_string(duration)) - items.append( - "{:.2%} latency".format(0.0 if total_duration == 0 else duration / - total_duration)) + items.append("{:.2%} latency".format(0.0 if total_duration == 0 else duration / total_duration)) items.append(flops_to_string(0.0 if duration == 0 else flops / duration)) items.append(module.original_extra_repr()) return ", ".join(items) @@ -374,16 +348,11 @@ class FlopsProfiler(object): self.model.apply(add_extra_repr) - print( - "\n----------------------------- Aggregated Profile per GPU -----------------------------" - ) - self.print_model_aggregated_profile(module_depth=module_depth, - top_modules=top_modules) + print("\n----------------------------- Aggregated Profile per GPU -----------------------------") + self.print_model_aggregated_profile(module_depth=module_depth, top_modules=top_modules) if detailed: - print( - "\n------------------------------ Detailed Profile per GPU ------------------------------" - ) + print("\n------------------------------ Detailed Profile per GPU ------------------------------") print( "Each module profile is listed after its name in the following order: \nparams, percentage of total params, MACs, percentage of total MACs, fwd latency, percentage of total fwd latency, fwd FLOPS" ) @@ -394,9 +363,7 @@ class FlopsProfiler(object): self.model.apply(del_extra_repr) - print( - "------------------------------------------------------------------------------" - ) + print("------------------------------------------------------------------------------") if output_file: sys.stdout = original_stdout @@ -411,9 +378,7 @@ class FlopsProfiler(object): """ info = {} if not hasattr(self.model, "__flops__"): - print( - "no __flops__ attribute in the model, call this function after start_profile and before end_profile" - ) + print("no __flops__ attribute in the model, call this function after start_profile and before end_profile") return def walk_module(module, curr_depth, info): @@ -439,33 +404,22 @@ class FlopsProfiler(object): if module_depth == -1: depth = len(info) - 1 - print( - f'Top {top_modules} modules in terms of params, MACs or fwd latency at different model depths:' - ) + print(f'Top {top_modules} modules in terms of 
params, MACs or fwd latency at different model depths:') for d in range(depth): num_items = min(top_modules, len(info[d])) sort_macs = { k: macs_to_string(v[0]) - for k, - v in sorted(info[d].items(), - key=lambda item: item[1][0], - reverse=True)[:num_items] + for k, v in sorted(info[d].items(), key=lambda item: item[1][0], reverse=True)[:num_items] } sort_params = { k: params_to_string(v[1]) - for k, - v in sorted(info[d].items(), - key=lambda item: item[1][1], - reverse=True)[:num_items] + for k, v in sorted(info[d].items(), key=lambda item: item[1][1], reverse=True)[:num_items] } sort_time = { k: duration_to_string(v[2]) - for k, - v in sorted(info[d].items(), - key=lambda item: item[1][2], - reverse=True)[:num_items] + for k, v in sorted(info[d].items(), key=lambda item: item[1][2], reverse=True)[:num_items] } print(f"depth {d}:") @@ -499,9 +453,7 @@ def _elu_flops_compute(input: Tensor, alpha: float = 1.0, inplace: bool = False) return input.numel(), 0 -def _leaky_relu_flops_compute(input: Tensor, - negative_slope: float = 0.01, - inplace: bool = False): +def _leaky_relu_flops_compute(input: Tensor, negative_slope: float = 0.01, inplace: bool = False): return input.numel(), 0 @@ -529,13 +481,7 @@ def _pool_flops_compute(input, return input.numel(), 0 -def _conv_flops_compute(input, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - groups=1): +def _conv_flops_compute(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1): assert weight.shape[1] * groups == input.shape[1] batch_size = input.shape[0] @@ -552,8 +498,8 @@ def _conv_flops_compute(input, output_dims = [] for idx, input_dim in enumerate(input_dims): - output_dim = (input_dim + 2 * paddings[idx] - - (dilations[idx] * (kernel_dims[idx] - 1) + 1)) // strides[idx] + 1 + output_dim = (input_dim + 2 * paddings[idx] - (dilations[idx] * + (kernel_dims[idx] - 1) + 1)) // strides[idx] + 1 output_dims.append(output_dim) filters_per_channel = out_channels // groups @@ -594,8 +540,8 @@ def _conv_trans_flops_compute( output_dims = [] for idx, input_dim in enumerate(input_dims): - output_dim = (input_dim + 2 * paddings[idx] - - (dilations[idx] * (kernel_dims[idx] - 1) + 1)) // strides[idx] + 1 + output_dim = (input_dim + 2 * paddings[idx] - (dilations[idx] * + (kernel_dims[idx] - 1) + 1)) // strides[idx] + 1 output_dims.append(output_dim) paddings = padding if type(padding) is tuple else (padding, padding) @@ -1210,8 +1156,7 @@ def get_model_profile( assert len(input_shape) >= 1, "input_shape must have at least one element" try: input = torch.ones(()).new_empty( - (*input_shape, - ), + (*input_shape, ), dtype=next(model.parameters()).dtype, device=next(model.parameters()).device, ) diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py index eccf9073a..f5e1c3674 100644 --- a/deepspeed/runtime/activation_checkpointing/checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -82,9 +82,7 @@ def detach_variable(inputs, device=None): out.append(x) return tuple(out) else: - raise RuntimeError( - "Only tuple of tensors is supported. Got Unsupported input type: ", - type(inputs).__name__) + raise RuntimeError("Only tuple of tensors is supported. Got Unsupported input type: ", type(inputs).__name__) def _set_cuda_rng_state(new_state, device=-1): @@ -128,6 +126,7 @@ class CudaRNGStatesTracker: rng state, we can perform operations and return to our starting cuda state. 
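Aside: the tracker described here is the usual fork-and-restore pattern for RNG state, which lets checkpointing replay a forward pass with the same random draws. A minimal sketch of that pattern using only public torch calls; run_with_forked_rng is a hypothetical helper, not the tracker's API.

import torch

def run_with_forked_rng(fn, *args, **kwargs):
    # Snapshot the CPU RNG state, run fn, then restore the snapshot so the
    # caller's random stream is unaffected by whatever fn sampled.
    saved_state = torch.get_rng_state()
    try:
        return fn(*args, **kwargs)
    finally:
        torch.set_rng_state(saved_state)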
""" + def __init__(self): # Map from a string name to the cuda rng state. self.states_ = {} @@ -227,13 +226,9 @@ def model_parallel_cuda_manual_seed(seed): logger.info( '> initializing model parallel cuda seeds on global rank {}, ' 'model parallel rank {}, and data parallel rank {} with ' - 'model parallel seed: {} and data parallel seed: {}'.format( - dist.get_rank(), - tp_rank, - mpu.get_data_parallel_rank(), - model_parallel_seed, - data_parallel_seed), - ) + 'model parallel seed: {} and data parallel seed: {}'.format(dist.get_rank(), tp_rank, + mpu.get_data_parallel_rank(), + model_parallel_seed, data_parallel_seed), ) _CUDA_RNG_STATE_TRACKER.reset() # Set the default state. get_accelerator().manual_seed(data_parallel_seed) @@ -282,9 +277,7 @@ def gather_partitioned_activations(tensors, device=None): if device is not None: flat_tensor = torch.zeros([tensor_size], dtype=item.dtype, device=device) else: - flat_tensor = torch.zeros([tensor_size], - dtype=item.dtype, - device=item.device) + flat_tensor = torch.zeros([tensor_size], dtype=item.dtype, device=item.device) partitions = [] for i in range(mp_size): part_i = flat_tensor.narrow(0, partition_size * i, partition_size) @@ -384,28 +377,21 @@ def partition_activations(args, cpu_checkpoint, contiguous_checkpoint): i = arg_index - num_non_fp_tensors partition_size = get_partition_size(item) - partition = item.detach().contiguous().view(-1).narrow( - 0, - get_partition_start(item), - partition_size).clone() + partition = item.detach().contiguous().view(-1).narrow(0, get_partition_start(item), partition_size).clone() buffer_device = torch.device('cpu') if cpu_checkpoint else partition.device if contiguous_checkpoint: if i >= len(contiguous_data_buffers): tensor_list = [ - torch.tensor(()).new_empty([partition_size], - dtype=partition.dtype, - device=buffer_device) + torch.tensor(()).new_empty([partition_size], dtype=partition.dtype, device=buffer_device) for _ in range(num_layers) ] contiguous_data_buffers.append(tensor_list) data_offsets.append(0) elif contiguous_data_buffers[i] is None: tensor_list = [ - torch.tensor(()).new_empty([partition_size], - dtype=partition.dtype, - device=buffer_device) + torch.tensor(()).new_empty([partition_size], dtype=partition.dtype, device=buffer_device) for _ in range(num_layers) ] contiguous_data_buffers[i] = tensor_list @@ -419,14 +405,10 @@ def partition_activations(args, cpu_checkpoint, contiguous_checkpoint): # previously launched GPU kernels, there is a small # window of time here for CPUs to populate pages asynchronously. 
contiguous_data_buffers[i][data_offsets[i]].data[range( - 0, - contiguous_data_buffers[i][data_offsets[i]].data.shape[0], - int(mmap.PAGESIZE / - contiguous_data_buffers[i][data_offsets[i]].data.element_size()) - )] = 0 + 0, contiguous_data_buffers[i][data_offsets[i]].data.shape[0], + int(mmap.PAGESIZE / contiguous_data_buffers[i][data_offsets[i]].data.element_size()))] = 0 - contiguous_partition = contiguous_data_buffers[i][ - data_offsets[i]].data.copy_(partition.data) + contiguous_partition = contiguous_data_buffers[i][data_offsets[i]].data.copy_(partition.data) data_offsets[i] = data_offsets[i] + 1 inputs.append(contiguous_partition) else: @@ -459,21 +441,14 @@ def get_partitioned_activations_for_backward(args, inputs, contiguous_checkpoint if i >= len(contiguous_size_buffers): tmp = torch.tensor(()) contiguous_size_buffers.append( - tmp.new_empty([numel * num_layers], - dtype=size.dtype, - device=size.device)) + tmp.new_empty([numel * num_layers], dtype=size.dtype, device=size.device)) size_offsets.append(0) elif contiguous_size_buffers[i] is None: tmp = torch.tensor(()) - contiguous_size_buffers[i] = tmp.new_empty([numel * num_layers], - dtype=size.dtype, - device=size.device) + contiguous_size_buffers[i] = tmp.new_empty([numel * num_layers], dtype=size.dtype, device=size.device) size_offsets[i] = 0 - contiguous_size = contiguous_size_buffers[i].narrow( - 0, - size_offsets[i], - numel).data.copy_(size.data) + contiguous_size = contiguous_size_buffers[i].narrow(0, size_offsets[i], numel).data.copy_(size.data) contiguous_size = contiguous_size.view_as(size) size_offsets[i] = size_offsets[i] + numel new_args.append(contiguous_size) @@ -506,6 +481,7 @@ class CheckpointFunction(torch.autograd.Function): 4) CPU Checkpointing 5) Profile forward and backward functions """ + @staticmethod def forward(ctx, run_function, all_outputs, *args): global mpu, timers, SYNCHRONIZE, PROFILE_TIME @@ -551,12 +527,9 @@ class CheckpointFunction(torch.autograd.Function): see_memory_usage("First Forward Beginning", force=False) if dist.get_rank() == 0: logger.info(f"Activation Checkpointing Information") + logger.info(f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {CPU_CHECKPOINT}") logger.info( - f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {CPU_CHECKPOINT}" - ) - logger.info( - f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers" - ) + f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers") logger.info(f"----Synchronization {SYNCHRONIZE}") logger.info(f"----Profiling time in checkpointing {PROFILE_TIME}") @@ -564,18 +537,12 @@ class CheckpointFunction(torch.autograd.Function): transport_stream = get_accelerator().Stream(device=cuda_device) if PARTITION_ACTIVATIONS: - inputs = partition_activations(args, - CPU_CHECKPOINT, - CONTIGUOUS_CHECKPOINTING) + inputs = partition_activations(args, CPU_CHECKPOINT, CONTIGUOUS_CHECKPOINTING) elif CPU_CHECKPOINT: - inputs = copy_to_device(args, - device=torch.device('cpu'), - criterion_func=is_activation_to_checkpoint) + inputs = copy_to_device(args, device=torch.device('cpu'), criterion_func=is_activation_to_checkpoint) # just in case something funky is happening such as reuse of inputs - inputs_cuda = copy_to_device(args, - device=cuda_device, - criterion_func=is_activation_to_checkpoint) + inputs_cuda = copy_to_device(args, device=cuda_device, criterion_func=is_activation_to_checkpoint) # Copy the rng states. 
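Aside on the page-touching write near the top of this hunk: assigning zero to one element per OS page makes the kernel materialize the backing pages of the contiguous CPU buffer during the window mentioned in the comment, while earlier GPU kernels are still running. A standalone sketch of the same trick, with buf as a stand-in for the contiguous checkpoint buffer:

import mmap
import torch

buf = torch.empty(1 << 20)                    # stand-in CPU buffer
stride = mmap.PAGESIZE // buf.element_size()  # elements per OS page
buf[::stride] = 0                             # touch one element per page to fault it in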
ctx.fwd_cpu_rng_state = torch.get_rng_state() @@ -591,10 +558,7 @@ class CheckpointFunction(torch.autograd.Function): del inputs_cuda if PARTITION_ACTIVATIONS: - new_args = get_partitioned_activations_for_backward( - args, - inputs, - CONTIGUOUS_CHECKPOINTING) + new_args = get_partitioned_activations_for_backward(args, inputs, CONTIGUOUS_CHECKPOINTING) assert len(new_args) % 2 == 0, f'save_for_backward called with odd number of args, {len(new_args)}' save_args_for_backward(*new_args) elif CPU_CHECKPOINT: @@ -613,9 +577,7 @@ class CheckpointFunction(torch.autograd.Function): if torch.is_tensor(outputs): non_grad_outputs = [outputs] if not outputs.is_floating_point() else [] else: - non_grad_outputs = [ - o for o in outputs if torch.is_tensor(o) and not o.is_floating_point() - ] + non_grad_outputs = [o for o in outputs if torch.is_tensor(o) and not o.is_floating_point()] ctx.mark_non_differentiable(*non_grad_outputs) if torch.is_tensor(outputs): @@ -661,14 +623,11 @@ class CheckpointFunction(torch.autograd.Function): if PARTITION_ACTIVATIONS: # with get_accelerator().stream(transport_stream): - inputs = gather_partitioned_activations( - ctx.deepspeed_saved_tensors, - device=cuda_device if CPU_CHECKPOINT else None) + inputs = gather_partitioned_activations(ctx.deepspeed_saved_tensors, + device=cuda_device if CPU_CHECKPOINT else None) detached_inputs = detach_variable(inputs) elif CPU_CHECKPOINT: - inputs = move_to_device(ctx.deepspeed_saved_tensors, - cuda_device, - is_activation_to_checkpoint) + inputs = move_to_device(ctx.deepspeed_saved_tensors, cuda_device, is_activation_to_checkpoint) detached_inputs = detach_variable(inputs) else: inputs = ctx.deepspeed_saved_tensors @@ -762,8 +721,7 @@ def partition_activations_in_checkpoint(partition_activation): global PARTITION_ACTIVATIONS PARTITION_ACTIVATIONS = partition_activation if dist.get_rank() == 0: - logger.info( - f"**************Partition Activations {PARTITION_ACTIVATIONS}************") + logger.info(f"**************Partition Activations {PARTITION_ACTIVATIONS}************") def set_num_layers(nlayers): diff --git a/deepspeed/runtime/activation_checkpointing/config.py b/deepspeed/runtime/activation_checkpointing/config.py index 0e7957943..7f5b8c613 100755 --- a/deepspeed/runtime/activation_checkpointing/config.py +++ b/deepspeed/runtime/activation_checkpointing/config.py @@ -48,16 +48,15 @@ ACT_CHKPT = 'activation_checkpointing' ACT_CHKPT_DEFAULT = { ACT_CHKPT_PARTITION_ACTIVATIONS: ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT, ACT_CHKPT_NUMBER_CHECKPOINTS: ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT, - ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION: - ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT, - ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY: - ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT, + ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION: ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT, + ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY: ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT, ACT_CHKPT_PROFILE: ACT_CHKPT_PROFILE_DEFAULT, ACT_CHKPT_CPU_CHECKPOINTING: ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT } class DeepSpeedActivationCheckpointingConfig(DeepSpeedConfigObject): + def __init__(self, param_dict): super(DeepSpeedActivationCheckpointingConfig, self).__init__() @@ -76,29 +75,21 @@ class DeepSpeedActivationCheckpointingConfig(DeepSpeedConfigObject): self._initialize(act_chkpt_config_dict) def _initialize(self, act_chkpt_config_dict): - self.partition_activations = get_scalar_param( - act_chkpt_config_dict, - ACT_CHKPT_PARTITION_ACTIVATIONS, - 
ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT) + self.partition_activations = get_scalar_param(act_chkpt_config_dict, ACT_CHKPT_PARTITION_ACTIVATIONS, + ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT) - self.contiguous_memory_optimization = get_scalar_param( - act_chkpt_config_dict, - ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION, - ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT) + self.contiguous_memory_optimization = get_scalar_param(act_chkpt_config_dict, + ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION, + ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT) - self.cpu_checkpointing = get_scalar_param(act_chkpt_config_dict, - ACT_CHKPT_CPU_CHECKPOINTING, + self.cpu_checkpointing = get_scalar_param(act_chkpt_config_dict, ACT_CHKPT_CPU_CHECKPOINTING, ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT) - self.number_checkpoints = get_scalar_param(act_chkpt_config_dict, - ACT_CHKPT_NUMBER_CHECKPOINTS, + self.number_checkpoints = get_scalar_param(act_chkpt_config_dict, ACT_CHKPT_NUMBER_CHECKPOINTS, ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT) - self.profile = get_scalar_param(act_chkpt_config_dict, - ACT_CHKPT_PROFILE, - ACT_CHKPT_PROFILE_DEFAULT) + self.profile = get_scalar_param(act_chkpt_config_dict, ACT_CHKPT_PROFILE, ACT_CHKPT_PROFILE_DEFAULT) - self.synchronize_checkpoint_boundary = get_scalar_param( - act_chkpt_config_dict, - ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY, - ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT) + self.synchronize_checkpoint_boundary = get_scalar_param(act_chkpt_config_dict, + ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY, + ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT) diff --git a/deepspeed/runtime/bf16_optimizer.py b/deepspeed/runtime/bf16_optimizer.py index f667d279c..3884d8ce1 100644 --- a/deepspeed/runtime/bf16_optimizer.py +++ b/deepspeed/runtime/bf16_optimizer.py @@ -13,29 +13,21 @@ from deepspeed.runtime import ZeROOptimizer from packaging import version as pkg_version from deepspeed.git_version_info import version -from deepspeed.runtime.utils import (get_global_norm_of_tensors, - clip_tensors_by_global_norm, - DummyOptim, - align_dense_tensors, - all_gather_dp_groups, - bwc_tensor_model_parallel_rank, - is_model_parallel_parameter, - see_memory_usage) +from deepspeed.runtime.utils import (get_global_norm_of_tensors, clip_tensors_by_global_norm, DummyOptim, + align_dense_tensors, all_gather_dp_groups, bwc_tensor_model_parallel_rank, + is_model_parallel_parameter, see_memory_usage) from deepspeed.utils import link_hp_params, fragment_address from deepspeed.checkpoint import enable_universal_checkpoint -from deepspeed.checkpoint.constants import (DS_VERSION, - PARTITION_COUNT, - BASE_OPTIMIZER_STATE, - SINGLE_PARTITION_OF_FP32_GROUPS, - CLIP_GRAD, - GROUP_PADDINGS, +from deepspeed.checkpoint.constants import (DS_VERSION, PARTITION_COUNT, BASE_OPTIMIZER_STATE, + SINGLE_PARTITION_OF_FP32_GROUPS, CLIP_GRAD, GROUP_PADDINGS, PARAM_SLICE_MAPPINGS) setattr(sys.modules[__name__], 'fragment_address', fragment_address) class BF16_Optimizer(ZeROOptimizer): + def __init__(self, init_optimizer, param_names, @@ -58,9 +50,7 @@ class BF16_Optimizer(ZeROOptimizer): self.allgather_bucket_size = int(allgather_bucket_size) self.dp_process_group = dp_process_group self.dp_rank = dist.get_rank(group=self.dp_process_group) - self.real_dp_process_group = [ - dp_process_group for i in range(len(self.optimizer.param_groups)) - ] + self.real_dp_process_group = [dp_process_group for i in range(len(self.optimizer.param_groups))] # Load pre-built or JIT compile (un)flatten ops util_ops = UtilsBuilder().load() @@ -95,9 +85,7 @@ 
class BF16_Optimizer(ZeROOptimizer): def _setup_for_real_optimizer(self): dp_world_size = dist.get_world_size(group=self.dp_process_group) - self.partition_count = [ - dp_world_size for i in range(len(self.optimizer.param_groups)) - ] + self.partition_count = [dp_world_size for i in range(len(self.optimizer.param_groups))] for i, param_group in enumerate(self.optimizer.param_groups): see_memory_usage(f'before initializing group {i}', force=True) @@ -109,65 +97,50 @@ class BF16_Optimizer(ZeROOptimizer): # create flat bf16 params self.bf16_groups_flat.append( - self._flatten_dense_tensors_aligned( - self.bf16_groups[i], - self.nccl_start_alignment_factor * dp_world_size)) + self._flatten_dense_tensors_aligned(self.bf16_groups[i], + self.nccl_start_alignment_factor * dp_world_size)) # Make bf16 params point to flat tensor storage - self._update_storage_to_flattened_tensor( - tensor_list=self.bf16_groups[i], - flat_tensor=self.bf16_groups_flat[i]) + self._update_storage_to_flattened_tensor(tensor_list=self.bf16_groups[i], + flat_tensor=self.bf16_groups_flat[i]) # divide flat weights into equal sized partitions partition_size = self.bf16_groups_flat[i].numel() // dp_world_size bf16_dp_partitions = [ - self.bf16_groups_flat[i].narrow(0, - dp_index * partition_size, - partition_size) + self.bf16_groups_flat[i].narrow(0, dp_index * partition_size, partition_size) for dp_index in range(dp_world_size) ] self.bf16_partitioned_groups.append(bf16_dp_partitions) # create fp32 params partition - self.fp32_groups_flat_partition.append( - bf16_dp_partitions[partition_id].clone().float().detach()) + self.fp32_groups_flat_partition.append(bf16_dp_partitions[partition_id].clone().float().detach()) self.fp32_groups_flat_partition[i].requires_grad = True num_elem_list = [t.numel() for t in self.bf16_groups[i]] # create fp32 gradients - self.fp32_groups_gradients_flat.append( - torch.zeros_like(self.bf16_groups_flat[i], - dtype=torch.float32)) + self.fp32_groups_gradients_flat.append(torch.zeros_like(self.bf16_groups_flat[i], dtype=torch.float32)) # track individual fp32 gradients for entire model - fp32_gradients = self._split_flat_tensor( - flat_tensor=self.fp32_groups_gradients_flat[i], - num_elem_list=num_elem_list) + fp32_gradients = self._split_flat_tensor(flat_tensor=self.fp32_groups_gradients_flat[i], + num_elem_list=num_elem_list) self.fp32_groups_gradients.append(fp32_gradients) self.fp32_groups_gradient_dict[i] = fp32_gradients # flat tensor corresponding to actual fp32 gradients (i.e., minus alignment padding) length_without_padding = sum(num_elem_list) self.fp32_groups_actual_gradients_flat.append( - torch.narrow(self.fp32_groups_gradients_flat[i], - 0, - 0, - length_without_padding)) + torch.narrow(self.fp32_groups_gradients_flat[i], 0, 0, length_without_padding)) # flat tensor corresponding to gradient partition self.fp32_groups_gradient_flat_partition.append( - torch.narrow(self.fp32_groups_gradients_flat[i], - 0, - partition_id * partition_size, - partition_size)) + torch.narrow(self.fp32_groups_gradients_flat[i], 0, partition_id * partition_size, partition_size)) # track fp32 gradient updates self.fp32_groups_has_gradients.append([False] * len(self.bf16_groups[i])) # Record padding required for alignment - if partition_id == dist.get_world_size( - group=self.real_dp_process_group[i]) - 1: + if partition_id == dist.get_world_size(group=self.real_dp_process_group[i]) - 1: padding = self.bf16_groups_flat[i].numel() - length_without_padding else: padding = 0 @@ -199,8 +172,7 @@ class 
BF16_Optimizer(ZeROOptimizer): for lp in self.bf16_groups[i]: if lp._hp_mapping is not None: lp_name = self.param_names[lp] - param_mapping_per_group[ - lp_name] = lp._hp_mapping.get_hp_fragment_address() + param_mapping_per_group[lp_name] = lp._hp_mapping.get_hp_fragment_address() param_mapping.append(param_mapping_per_group) return param_mapping @@ -212,17 +184,16 @@ class BF16_Optimizer(ZeROOptimizer): partition_id = dist.get_rank(group=self.real_dp_process_group[i]) partition_size = self.bf16_groups_flat[i].numel() // dp_world_size flat_hp_partition = self.fp32_groups_flat_partition[i] - link_hp_params( - lp_param_list=self.bf16_groups[i], - flat_hp_partition=flat_hp_partition, - gradient_dict=self.fp32_groups_gradient_dict, - offload_gradient_dict=None, - use_offload=False, - param_group_index=i, - partition_start=partition_id * partition_size, - partition_size=partition_size, - partition_optimizer_state=self.optimizer.state[flat_hp_partition], - dp_group=self.real_dp_process_group[i]) + link_hp_params(lp_param_list=self.bf16_groups[i], + flat_hp_partition=flat_hp_partition, + gradient_dict=self.fp32_groups_gradient_dict, + offload_gradient_dict=None, + use_offload=False, + param_group_index=i, + partition_start=partition_id * partition_size, + partition_size=partition_size, + partition_optimizer_state=self.optimizer.state[flat_hp_partition], + dp_group=self.real_dp_process_group[i]) def initialize_optimizer_states(self): """Take an optimizer step with zero-valued gradients to allocate internal @@ -231,7 +202,8 @@ class BF16_Optimizer(ZeROOptimizer): This helps prevent memory fragmentation by allocating optimizer state at the beginning of training instead of after activations have been allocated. """ - for param_partition, grad_partition in zip(self.fp32_groups_flat_partition, self.fp32_groups_gradient_flat_partition): + for param_partition, grad_partition in zip(self.fp32_groups_flat_partition, + self.fp32_groups_gradient_flat_partition): param_partition.grad = grad_partition self.optimizer.step() @@ -262,19 +234,17 @@ class BF16_Optimizer(ZeROOptimizer): if closure is not None: raise NotImplementedError(f'{self.__class__} does not support closure.') - all_groups_norm = get_global_norm_of_tensors( - input_tensors=self.get_grads_for_norm(), - mpu=self.mpu, - norm_type=self.norm_type) + all_groups_norm = get_global_norm_of_tensors(input_tensors=self.get_grads_for_norm(), + mpu=self.mpu, + norm_type=self.norm_type) self._global_grad_norm = all_groups_norm assert all_groups_norm > 0. 
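Aside: the step path in this hunk computes a single global gradient norm across all groups and, when clip_grad is set, rescales every gradient by it via the clip_tensors_by_global_norm call that follows. A rough sketch of clip-by-global-norm, not DeepSpeed's implementation:

import torch

def clip_by_global_norm(grads, max_norm, eps=1e-6):
    # Global L2 norm over all gradient tensors, then one shared scale factor.
    global_norm = torch.norm(torch.stack([g.norm(2) for g in grads]), 2)
    clip_coef = max_norm / (global_norm + eps)
    if clip_coef < 1.0:
        for g in grads:
            g.mul_(clip_coef)
    return global_norm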
if self.clip_grad > 0.: - clip_tensors_by_global_norm( - input_tensors=self.get_grads_for_norm(for_clipping=True), - max_norm=self.clip_grad, - global_norm=all_groups_norm, - mpu=self.mpu) + clip_tensors_by_global_norm(input_tensors=self.get_grads_for_norm(for_clipping=True), + max_norm=self.clip_grad, + global_norm=all_groups_norm, + mpu=self.mpu) self.optimizer.step() @@ -343,7 +313,8 @@ class BF16_Optimizer(ZeROOptimizer): @torch.no_grad() def update_lp_params(self): - for i, (bf16_partitions, fp32_partition) in enumerate(zip(self.bf16_partitioned_groups, self.fp32_groups_flat_partition)): + for i, (bf16_partitions, + fp32_partition) in enumerate(zip(self.bf16_partitioned_groups, self.fp32_groups_flat_partition)): partition_id = dist.get_rank(group=self.real_dp_process_group[i]) bf16_partitions[partition_id].data.copy_(fp32_partition.data) # print_rank_0(f'update_lp_params {i=} {partition_id=}', force=True) @@ -395,18 +366,11 @@ class BF16_Optimizer(ZeROOptimizer): load_optimizer_states=True, load_from_fp32_weights=False): if checkpoint_folder: - self._load_universal_checkpoint(checkpoint_folder, - load_optimizer_states, - load_from_fp32_weights) + self._load_universal_checkpoint(checkpoint_folder, load_optimizer_states, load_from_fp32_weights) else: - self._load_legacy_checkpoint(state_dict_list, - load_optimizer_states, - load_from_fp32_weights) + self._load_legacy_checkpoint(state_dict_list, load_optimizer_states, load_from_fp32_weights) - def _load_legacy_checkpoint(self, - state_dict_list, - load_optimizer_states=True, - load_from_fp32_weights=False): + def _load_legacy_checkpoint(self, state_dict_list, load_optimizer_states=True, load_from_fp32_weights=False): dp_rank = dist.get_rank(group=self.dp_process_group) current_rank_sd = state_dict_list[dp_rank] @@ -421,17 +385,15 @@ class BF16_Optimizer(ZeROOptimizer): self.optimizer.load_state_dict(current_rank_sd[BASE_OPTIMIZER_STATE]) if load_from_fp32_weights: - for current, saved in zip(self.fp32_groups_flat_partition, current_rank_sd[SINGLE_PARTITION_OF_FP32_GROUPS]): + for current, saved in zip(self.fp32_groups_flat_partition, + current_rank_sd[SINGLE_PARTITION_OF_FP32_GROUPS]): src_tensor = _get_padded_tensor(saved, current.numel()) current.data.copy_(src_tensor.data) if load_optimizer_states: self._link_all_hp_params() - def _load_universal_checkpoint(self, - checkpoint_folder, - load_optimizer_states, - load_from_fp32_weights): + def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights): self._load_hp_checkpoint_state(checkpoint_folder) @property @@ -448,11 +410,8 @@ class BF16_Optimizer(ZeROOptimizer): for lp in self.bf16_groups[i]: if lp._hp_mapping is not None: #print(f"Loading {self.param_names[lp]} {tp_rank=} {tp_world_size=}") - lp.load_hp_checkpoint_state( - os.path.join(checkpoint_dir, - self.param_names[lp]), - tp_rank, - tp_world_size) + lp.load_hp_checkpoint_state(os.path.join(checkpoint_dir, self.param_names[lp]), tp_rank, + tp_world_size) def _get_padded_tensor(src_tensor, size): diff --git a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py index 42bb067d0..0863568f8 100644 --- a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py +++ b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py @@ -15,6 +15,7 @@ def _get_tag_from_path(path): class NebulaCheckpointEngine(CheckpointEngine): + def __init__(self, config_params=None): super().__init__(config_params) 
self.checkpoint = None @@ -27,8 +28,7 @@ class NebulaCheckpointEngine(CheckpointEngine): nebula_config_params = { NEBULA_PERSISTENT_STORAGE_PATH: config_params.persistent_storage_path, NEBULA_PERSISTENT_TIME_INTERVAL: config_params.persistent_time_interval, - NEBULA_NUM_OF_VERSION_IN_RETENTION: - config_params.num_of_version_in_retention, + NEBULA_NUM_OF_VERSION_IN_RETENTION: config_params.num_of_version_in_retention, } torch_nebula.init(**nebula_config_params) @@ -54,16 +54,13 @@ class NebulaCheckpointEngine(CheckpointEngine): first_load_flag = self.tag_flag is None or self.tag_flag == tag if not self.enable_nebula_load and first_load_flag: self.tag_flag = tag - logger.info( - f"[Nebula] Disable nebula load. Loading checkpoint from {path} ...") + logger.info(f"[Nebula] Disable nebula load. Loading checkpoint from {path} ...") partition = torch.load(path, map_location=map_location) logger.info(f"[Nebula] Disable nebula load. Loaded checkpoint from {path} .") return partition partititon_name = os.path.basename(path) - logger.info( - f"[Nebula] Loading {path} under tag {tag} from nebula path {self.nebula_load_path}..." - ) + logger.info(f"[Nebula] Loading {path} under tag {tag} from nebula path {self.nebula_load_path}...") checkpoint = None if tag in (None, 'latest', 'latest_universal'): @@ -72,44 +69,36 @@ class NebulaCheckpointEngine(CheckpointEngine): # will try to load the valid latest checkpoint from nebula(tier3 > tier1). So, in summary # when met failure loading for given tag, the loading priority would be like: # nebula tier3 latest > nebula tier1 latest. - checkpoint = torch_nebula.get_latest_checkpoint( - persist_path=self.nebula_load_path) + checkpoint = torch_nebula.get_latest_checkpoint(persist_path=self.nebula_load_path) else: - checkpoint = torch_nebula.get_checkpoint(tag=tag, - persist_path=self.nebula_load_path) + checkpoint = torch_nebula.get_checkpoint(tag=tag, persist_path=self.nebula_load_path) if checkpoint is None or (checkpoint is not None and checkpoint.tag == ''): logger.info( f"Unable to find valid checkpoint tag:{tag} from Nebula, try to get latest checkpoint again from nebula {self.nebula_load_path} path!" ) # nebula tier3 latest - checkpoint = torch_nebula.get_latest_checkpoint( - persist_path=self.nebula_load_path) + checkpoint = torch_nebula.get_latest_checkpoint(persist_path=self.nebula_load_path) if checkpoint is None or (checkpoint is not None and checkpoint.tag == ''): logger.info( f"Unable to find latest checkpoint from Nebula tier3, try to get latest checkpoint again from nebula tier1 path!" ) # nebula tier1 latest checkpoint = torch_nebula.get_latest_checkpoint() - logger.warning( - f"Unable to find valid checkpoint from Nebula under tag:{tag}.") + logger.warning(f"Unable to find valid checkpoint from Nebula under tag:{tag}.") return None tag = checkpoint.tag self.tag_flag = -1 partition = checkpoint.load(partititon_name, map_location=map_location) - logger.info( - f"[Nebula] Loaded {path} under tag {tag} from {self.nebula_load_path}.") + logger.info(f"[Nebula] Loaded {path} under tag {tag} from {self.nebula_load_path}.") return partition def commit(self, tag): # nebula commit will be call when all files under give tag are ready to be persisted in the async way. - logger.info( - f"[Nebula] all files for {tag} are saved in tier1. It is ready to start persisting" - ) + logger.info(f"[Nebula] all files for {tag} are saved in tier1. 
It is ready to start persisting") commit_rls = self.checkpoint.commit() if not commit_rls: - logger.error( - f"[Nebula] failed to commit the checkpoint, please check the log.") + logger.error(f"[Nebula] failed to commit the checkpoint, please check the log.") return False return commit_rls diff --git a/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py index 51633459e..3fc758ec0 100644 --- a/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py +++ b/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py @@ -7,6 +7,7 @@ from deepspeed.runtime.checkpoint_engine.checkpoint_engine import \ class TorchCheckpointEngine(CheckpointEngine): + def __init__(self, config_params=None): super().__init__(config_params) diff --git a/deepspeed/runtime/comm/coalesced_collectives.py b/deepspeed/runtime/comm/coalesced_collectives.py index 07302b1b7..add14924c 100644 --- a/deepspeed/runtime/comm/coalesced_collectives.py +++ b/deepspeed/runtime/comm/coalesced_collectives.py @@ -15,15 +15,8 @@ import torch.nn.functional from deepspeed.utils import instrument_w_nvtx -def _torch_reduce_scatter_fn(input_tensor: Tensor, - output_tensor: Tensor, - group=None, - async_op=False, - prof=False): - return instrument_w_nvtx(dist.reduce_scatter_fn)(output_tensor, - input_tensor, - group=group, - async_op=async_op) +def _torch_reduce_scatter_fn(input_tensor: Tensor, output_tensor: Tensor, group=None, async_op=False, prof=False): + return instrument_w_nvtx(dist.reduce_scatter_fn)(output_tensor, input_tensor, group=group, async_op=async_op) @instrument_w_nvtx @@ -45,13 +38,10 @@ def reduce_scatter_coalesced( flattened_tensor = tensor.view(-1) chunk_sz = math.ceil(tensor.numel() / world_sz) partition_lst_for_each_tensor[tensor_idx] = [ - flattened_tensor[rank * chunk_sz:rank * chunk_sz + chunk_sz] - for rank in range(0, - world_sz) + flattened_tensor[rank * chunk_sz:rank * chunk_sz + chunk_sz] for rank in range(0, world_sz) ] - padded_partition_sz_for_each_tensor = tuple( - math.ceil(t.numel() / world_sz) for t in tensors) + padded_partition_sz_for_each_tensor = tuple(math.ceil(t.numel() / world_sz) for t in tensors) if len(tensors) == 1 and tensors[0].numel() % world_sz == 0: # if there's only one tensor being reduced and we don't need to pad @@ -68,21 +58,15 @@ def reduce_scatter_coalesced( tensor_partitions_lst_with_padding.append(tensor_chunk) # add padding if necessary - padding_sz = padded_partition_sz_for_each_tensor[ - tensor_idx] - tensor_chunk.numel() + padding_sz = padded_partition_sz_for_each_tensor[tensor_idx] - tensor_chunk.numel() if padding_sz > 0: tensor_partitions_lst_with_padding.append( - torch.empty(padding_sz, - dtype=tensor_chunk.dtype, - device=tensor_chunk.device)) + torch.empty(padding_sz, dtype=tensor_chunk.dtype, device=tensor_chunk.device)) - tensor_partition_flat_buffer = instrument_w_nvtx( - torch.cat)(tensor_partitions_lst_with_padding) + tensor_partition_flat_buffer = instrument_w_nvtx(torch.cat)(tensor_partitions_lst_with_padding) tensor_partition_flat_buffer.div_(world_sz) # pre-divide - tensor_partition_buffer_for_each_rank: List[Tensor] = torch.chunk( - tensor_partition_flat_buffer, - world_sz) + tensor_partition_buffer_for_each_rank: List[Tensor] = torch.chunk(tensor_partition_flat_buffer, world_sz) # batched reduce-scatter call _torch_reduce_scatter_fn(tensor_partition_flat_buffer, @@ -95,9 +79,7 @@ def reduce_scatter_coalesced( offset = 0 for tensor_idx in range(len(tensors)): 
output_lst[tensor_idx] = tensor_partition_buffer_for_each_rank[this_rank].narrow( - 0, - offset, - partition_lst_for_each_tensor[tensor_idx][this_rank].numel()) + 0, offset, partition_lst_for_each_tensor[tensor_idx][this_rank].numel()) offset += padded_partition_sz_for_each_tensor[tensor_idx] diff --git a/deepspeed/runtime/comm/mpi.py b/deepspeed/runtime/comm/mpi.py index 9e112bccc..6a3306bcc 100644 --- a/deepspeed/runtime/comm/mpi.py +++ b/deepspeed/runtime/comm/mpi.py @@ -12,6 +12,7 @@ from deepspeed.runtime.compression.cupy import CupyBackend class MpiBackend(object): + def __init__(self, cuda_aware): self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() @@ -31,49 +32,26 @@ class MpiBackend(object): req.append(comm.Isend(sendbuf, dest=root)) return req - def gather_cuda(self, - rank, - world_size, - comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, + def gather_cuda(self, rank, world_size, comm, cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale): # We do in-place operations on cupy buffers so we do not return any buffers requests = [] for idx in range(world_size): - req_sign = self.my_igather(rank, - world_size, - comm, - cupy_sign_list_packed[idx], - cupy_recvbuf_sign, - root=idx) + req_sign = self.my_igather(rank, world_size, comm, cupy_sign_list_packed[idx], cupy_recvbuf_sign, root=idx) requests += req_sign for idx in range(world_size): - req_scale = self.my_igather(rank, - world_size, - comm, - cupy_worker_scale, - cupy_recvbuf_scale, - root=idx) + req_scale = self.my_igather(rank, world_size, comm, cupy_worker_scale, cupy_recvbuf_scale, root=idx) requests += req_scale MPI.Request.Waitall(requests) - def gather_host(self, - rank, - world_size, - comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, + def gather_host(self, rank, world_size, comm, cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale): # In-place operations are not possible for newly created cupy arrays # so we need to return the new buffers - numpy_recvbuf_sign = np.zeros([world_size, - cupy_sign_list_packed[rank].size], + numpy_recvbuf_sign = np.zeros([world_size, cupy_sign_list_packed[rank].size], dtype=cupy_sign_list_packed[0].dtype) numpy_recvbuf_scale = np.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) @@ -101,12 +79,7 @@ class MpiBackend(object): requests += req_sign for idx in range(world_size): - req_scale = self.my_igather(rank, - world_size, - comm, - numpy_worker_scale, - numpy_recvbuf_scale, - root=idx) + req_scale = self.my_igather(rank, world_size, comm, numpy_worker_scale, numpy_recvbuf_scale, root=idx) requests += req_scale MPI.Request.Waitall(requests) @@ -122,30 +95,18 @@ class MpiBackend(object): return cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale - def allgather_cuda(self, - comm, - cupy_server_sign_packed, - cupy_recvbuf_sign_server, - cupy_server_scale, + def allgather_cuda(self, comm, cupy_server_sign_packed, cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server): comm.Allgather(cupy_server_sign_packed, cupy_recvbuf_sign_server) comm.Allgather(cupy_server_scale, cupy_recvbuf_scale_server) - def allgather_host(self, - comm, - cupy_server_sign_packed, - cupy_recvbuf_sign_server, - cupy_server_scale, + def allgather_host(self, comm, cupy_server_sign_packed, cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server): # 1. 
Convert cupy to numpy - numpy_recvbuf_sign_server = np.zeros( - [comm.Get_size(), - cupy_server_sign_packed.size], - dtype=cupy_server_sign_packed.dtype) - numpy_recvbuf_scale_server = np.zeros([comm.Get_size(), - 1], - dtype=cupy_server_scale.dtype) + numpy_recvbuf_sign_server = np.zeros([comm.Get_size(), cupy_server_sign_packed.size], + dtype=cupy_server_sign_packed.dtype) + numpy_recvbuf_scale_server = np.zeros([comm.Get_size(), 1], dtype=cupy_server_scale.dtype) numpy_server_sign_packed = cupy.asnumpy(cupy_server_sign_packed) numpy_recvbuf_sign_server = cupy.asnumpy(cupy_recvbuf_sign_server) @@ -167,11 +128,7 @@ class MpiBackend(object): return cupy_server_sign_packed, cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server - def compressed_allreduce(self, - buffer_m: torch.tensor, - worker_error, - server_error, - local_rank): + def compressed_allreduce(self, buffer_m: torch.tensor, worker_error, server_error, local_rank): all_start_time = time.time() original_shape = buffer_m.size() @@ -182,104 +139,71 @@ class MpiBackend(object): cupy.cuda.Device(local_rank).use() if original_size != worker_error_size: - empty_tensor = torch.zeros(worker_error_size - original_size, - device=buffer_m.device) + empty_tensor = torch.zeros(worker_error_size - original_size, device=buffer_m.device) buffer_m = torch.cat([buffer_m, empty_tensor]) buffer_m.add_(worker_error) worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) - worker_error.set_(buffer_m - worker_scale * - buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + worker_error.set_(buffer_m - worker_scale * buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) cupy_sign_list_packed = self.compression_backend.compress_by_chunk( - self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool()), - self.size) + self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool()), self.size) cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale) - cupy_recvbuf_sign = cupy.zeros( - [self.size, - cupy_sign_list_packed[self.rank].size], - dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_sign = cupy.zeros([self.size, cupy_sign_list_packed[self.rank].size], + dtype=cupy_sign_list_packed[0].dtype) cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) # Communication Phase 1 gather_start = time.time() if self.cuda_aware: - self.gather_cuda(self.rank, - self.size, - self.comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale) + self.gather_cuda(self.rank, self.size, self.comm, cupy_sign_list_packed, cupy_recvbuf_sign, + cupy_worker_scale, cupy_recvbuf_scale) else: - _, cupy_recvbuf_sign, _, cupy_recvbuf_scale = self.gather_host(self.rank, - self.size, - self.comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale) + _, cupy_recvbuf_sign, _, cupy_recvbuf_scale = self.gather_host(self.rank, self.size, self.comm, + cupy_sign_list_packed, cupy_recvbuf_sign, + cupy_worker_scale, cupy_recvbuf_scale) gather_end = time.time() # cupy_sign_list_packed, cupy_worker_scale, worker_scale = None, None, None cupy_sign_list_packed = None compensated_server_m = self.compression_backend.cupy2torch( - (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( - self.size, - -1)).float().add_(-0.5).mul_(2.0).mul_( - self.compression_backend.cupy2torch(cupy_recvbuf_scale).mul_( - 1 / self.size)).sum(0) + (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape(self.size, -1)).float().add_(-0.5).mul_(2.0).mul_( + 
self.compression_backend.cupy2torch(cupy_recvbuf_scale).mul_(1 / self.size)).sum(0) compensated_server_m.add_(server_error) - server_scale = torch.norm(compensated_server_m) / np.sqrt( - compensated_server_m.numel()) - server_error.set_( - compensated_server_m - server_scale * - compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + server_scale = torch.norm(compensated_server_m) / np.sqrt(compensated_server_m.numel()) + server_error.set_(compensated_server_m - + server_scale * compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) cupy_server_scale = self.compression_backend.torch2cupy(server_scale) cupy_server_sign_packed = self.compression_backend.compress_by_chunk( - self.compression_backend.torch2cupy( - compensated_server_m.sign_().add_(1).bool()), - 1) + self.compression_backend.torch2cupy(compensated_server_m.sign_().add_(1).bool()), 1) compensated_server_m = None - cupy_recvbuf_sign_server = cupy.zeros( - [self.size, - cupy_server_sign_packed[0].size], - dtype=cupy_recvbuf_sign.dtype) - cupy_recvbuf_scale_server = cupy.zeros([self.size, - 1], - dtype=cupy_recvbuf_scale.dtype) + cupy_recvbuf_sign_server = cupy.zeros([self.size, cupy_server_sign_packed[0].size], + dtype=cupy_recvbuf_sign.dtype) + cupy_recvbuf_scale_server = cupy.zeros([self.size, 1], dtype=cupy_recvbuf_scale.dtype) # cupy_recvbuf_sign, cupy_recvbuf_scale = None, None cupy_recvbuf_sign = None # Communication Phase 2 if self.cuda_aware: - self.allgather_cuda(self.comm, - cupy_server_sign_packed[0], - cupy_recvbuf_sign_server, - cupy_server_scale, + self.allgather_cuda(self.comm, cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server) else: - _, cupy_recvbuf_sign_server, _, cupy_recvbuf_scale_server = self.allgather_host(self.comm, - cupy_server_sign_packed[0], - cupy_recvbuf_sign_server, - cupy_server_scale, - cupy_recvbuf_scale_server) + _, cupy_recvbuf_sign_server, _, cupy_recvbuf_scale_server = self.allgather_host( + self.comm, cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, + cupy_recvbuf_scale_server) # cupy_server_sign_packed, cupy_server_scale, server_scale = None, None, None cupy_server_sign_packed = None buffer_m.data.copy_( - self.compression_backend.cupy2torch( - (cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape( - self.size, - -1)).float().add_(-0.5).mul_(2.0).mul_( - self.compression_backend.cupy2torch( - cupy_recvbuf_scale_server)).flatten().data) + self.compression_backend.cupy2torch((cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape( + self.size, -1)).float().add_(-0.5).mul_(2.0).mul_( + self.compression_backend.cupy2torch(cupy_recvbuf_scale_server)).flatten().data) if original_size != worker_error_size: buffer_m = buffer_m[0:original_size] if len(original_shape) > 1: diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py index fd99068df..6018627f8 100644 --- a/deepspeed/runtime/comm/nccl.py +++ b/deepspeed/runtime/comm/nccl.py @@ -12,6 +12,7 @@ from deepspeed.accelerator import get_accelerator class NcclBackend(object): + def __init__(self, mpu=None): if mpu is None: self.world_group = dist.new_group(ranks=range(dist.get_world_size())) @@ -49,11 +50,7 @@ class NcclBackend(object): else: dist.send(sendbuf, group=group, dst=root) - def compressed_allreduce(self, - buffer_m: torch.tensor, - worker_error, - server_error, - local_rank): + def compressed_allreduce(self, buffer_m: torch.tensor, worker_error, server_error, local_rank): # all_start_time = 
time.time() original_shape = buffer_m.size() @@ -64,53 +61,41 @@ class NcclBackend(object): cupy.cuda.Device(local_rank).use() if original_size != worker_error_size: - empty_tensor = torch.zeros(worker_error_size - original_size, - device=buffer_m.device) + empty_tensor = torch.zeros(worker_error_size - original_size, device=buffer_m.device) buffer_m = torch.cat([buffer_m, empty_tensor]) buffer_m.add_(worker_error) worker_scale = torch.norm(buffer_m) / np.sqrt(buffer_m.numel()) - worker_error.set_(buffer_m - worker_scale * - buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + worker_error.set_(buffer_m - worker_scale * buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) if self.bool_not_supported: cupy_sign_list_packed = self.compression_backend.compress_by_chunk( - self.compression_backend.torch2cupy( - buffer_m.sign_().add_(1).bool().to(dtype=torch.uint8)), - self.size) + self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool().to(dtype=torch.uint8)), self.size) else: cupy_sign_list_packed = self.compression_backend.compress_by_chunk( - self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool()), - self.size) + self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool()), self.size) cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale) - cupy_recvbuf_sign = cupy.zeros( - [self.size, - cupy_sign_list_packed[self.rank].size], - dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_sign = cupy.zeros([self.size, cupy_sign_list_packed[self.rank].size], + dtype=cupy_sign_list_packed[0].dtype) # cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) sign_list_packed = [ - self.compression_backend.cupy2torch(cupy_sign_list_packed[idx]) - for idx in range(self.size) + self.compression_backend.cupy2torch(cupy_sign_list_packed[idx]) for idx in range(self.size) ] # worker_scale = self.compression_backend.cupy2torch(cupy_worker_scale) recvbuf_sign = self.compression_backend.cupy2torch(cupy_recvbuf_sign) #recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale) recvbuf_scale = [ - torch.zeros(1, - dtype=worker_scale.dtype, - device=torch.device(get_accelerator().device_name(local_rank))) + torch.zeros(1, dtype=worker_scale.dtype, device=torch.device(get_accelerator().device_name(local_rank))) for i in range(self.size) ] # communication phase 1 # gather_start = time.time() # Alltoall for sign - dist.all_to_all_single(recvbuf_sign, - torch.stack(sign_list_packed), - group=self.world_group) + dist.all_to_all_single(recvbuf_sign, torch.stack(sign_list_packed), group=self.world_group) # Allgather for scale dist.all_gather(recvbuf_scale, worker_scale, group=self.world_group) @@ -123,61 +108,44 @@ class NcclBackend(object): #cupy_recvbuf_scale = self.compression_backend.torch2cupy(torch.stack(recvbuf_scale)) compensated_server_m = self.compression_backend.cupy2torch( - (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( - self.size, - -1)).float().add_(-0.5).mul_(2.0).mul_( - torch.stack(recvbuf_scale).mul_(1 / self.size)).sum(0) + (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape(self.size, -1)).float().add_(-0.5).mul_(2.0).mul_( + torch.stack(recvbuf_scale).mul_(1 / self.size)).sum(0) compensated_server_m.add_(server_error) - server_scale = torch.norm(compensated_server_m) / np.sqrt( - compensated_server_m.numel()) - server_error.set_( - compensated_server_m - server_scale * - compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + server_scale = torch.norm(compensated_server_m) 
/ np.sqrt(compensated_server_m.numel()) + server_error.set_(compensated_server_m - + server_scale * compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) # cupy_server_scale = self.compression_backend.torch2cupy(server_scale) if self.bool_not_supported: cupy_server_sign_packed = self.compression_backend.compress_by_chunk( - self.compression_backend.torch2cupy( - compensated_server_m.sign_().add_(1).bool().to(dtype=torch.uint8)), + self.compression_backend.torch2cupy(compensated_server_m.sign_().add_(1).bool().to(dtype=torch.uint8)), 1) else: cupy_server_sign_packed = self.compression_backend.compress_by_chunk( - self.compression_backend.torch2cupy( - compensated_server_m.sign_().add_(1).bool()), - 1) + self.compression_backend.torch2cupy(compensated_server_m.sign_().add_(1).bool()), 1) compensated_server_m = None - cupy_recvbuf_sign_server = cupy.zeros( - [self.size, - cupy_server_sign_packed[0].size], - dtype=cupy_recvbuf_sign.dtype) + cupy_recvbuf_sign_server = cupy.zeros([self.size, cupy_server_sign_packed[0].size], + dtype=cupy_recvbuf_sign.dtype) # cupy_recvbuf_sign, recvbuf_sign = None, None cupy_recvbuf_sign = None - server_sign_packed = [ - self.compression_backend.cupy2torch(cupy_server_sign_packed[0]) - ] + server_sign_packed = [self.compression_backend.cupy2torch(cupy_server_sign_packed[0])] recvbuf_sign_server = [ - self.compression_backend.cupy2torch(cupy_recvbuf_sign_server[idx]) - for idx in range(self.size) + self.compression_backend.cupy2torch(cupy_recvbuf_sign_server[idx]) for idx in range(self.size) ] # server_scale = self.compression_backend.cupy2torch(cupy_server_scale) - cupy_recvbuf_scale_server = cupy.zeros([self.size, - 1], - dtype=cupy_worker_scale.dtype) + cupy_recvbuf_scale_server = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) # cupy_recvbuf_scale, recvbuf_scale = None, None recvbuf_scale_server = [ - self.compression_backend.cupy2torch(cupy_recvbuf_scale_server[idx]) - for idx in range(self.size) + self.compression_backend.cupy2torch(cupy_recvbuf_scale_server[idx]) for idx in range(self.size) ] # Communication Phase 2 - dist.all_gather(recvbuf_sign_server, - server_sign_packed[0], - group=self.world_group) + dist.all_gather(recvbuf_sign_server, server_sign_packed[0], group=self.world_group) dist.all_gather(recvbuf_scale_server, server_scale, group=self.world_group) cupy_server_sign_packed = None @@ -186,16 +154,12 @@ class NcclBackend(object): # dist.all_gather only provides a tensor list as the recv/output buffer recvbuf_sign_server = torch.stack(recvbuf_sign_server) - cupy_recvbuf_sign_server = self.compression_backend.torch2cupy( - recvbuf_sign_server) + cupy_recvbuf_sign_server = self.compression_backend.torch2cupy(recvbuf_sign_server) buffer_m.data.copy_( - self.compression_backend.cupy2torch( - (cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape( - self.size, - -1)).float().add_(-0.5).mul_(2.0).mul_( - self.compression_backend.cupy2torch( - cupy_recvbuf_scale_server)).flatten().data) + self.compression_backend.cupy2torch((cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape( + self.size, -1)).float().add_(-0.5).mul_(2.0).mul_( + self.compression_backend.cupy2torch(cupy_recvbuf_scale_server)).flatten().data) if original_size != worker_error_size: buffer_m = buffer_m[0:original_size] if len(original_shape) > 1: diff --git a/deepspeed/runtime/compression/cupy.py b/deepspeed/runtime/compression/cupy.py index 68e56c68e..8aa454fb8 100644 --- a/deepspeed/runtime/compression/cupy.py +++ 
b/deepspeed/runtime/compression/cupy.py @@ -8,6 +8,7 @@ from torch.utils.dlpack import from_dlpack class CupyBackend(object): + def __init__(self): pass diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 89314e868..572bcd18d 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -72,12 +72,7 @@ ONEBIT_ADAM_OPTIMIZER = 'onebitadam' ZERO_ONE_ADAM_OPTIMIZER = 'zerooneadam' ONEBIT_LAMB_OPTIMIZER = 'onebitlamb' DEEPSPEED_OPTIMIZERS = [ - ADAGRAD_OPTIMIZER, - ADAM_OPTIMIZER, - ADAMW_OPTIMIZER, - LAMB_OPTIMIZER, - ONEBIT_ADAM_OPTIMIZER, - ONEBIT_LAMB_OPTIMIZER, + ADAGRAD_OPTIMIZER, ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, ONEBIT_LAMB_OPTIMIZER, ZERO_ONE_ADAM_OPTIMIZER ] @@ -122,9 +117,7 @@ class DtypeEnum(Enum): def get_pld_enabled(param_dict): if PROGRESSIVE_LAYER_DROP in param_dict.keys(): - return get_scalar_param(param_dict[PROGRESSIVE_LAYER_DROP], - PLD_ENABLED, - PLD_ENABLED_DEFAULT) + return get_scalar_param(param_dict[PROGRESSIVE_LAYER_DROP], PLD_ENABLED, PLD_ENABLED_DEFAULT) else: return False @@ -164,17 +157,13 @@ def get_fp16_enabled(param_dict): def get_bfloat16_enabled(param_dict): for key in [BFLOAT16, BFLOAT16_OLD]: if key in param_dict.keys(): - return get_scalar_param(param_dict[key], - BFLOAT16_ENABLED, - BFLOAT16_ENABLED_DEFAULT) + return get_scalar_param(param_dict[key], BFLOAT16_ENABLED, BFLOAT16_ENABLED_DEFAULT) return False def get_fp16_master_weights_and_grads_enabled(param_dict): if get_fp16_enabled(param_dict): - return get_scalar_param(param_dict[FP16], - FP16_MASTER_WEIGHTS_AND_GRADS, - FP16_MASTER_WEIGHTS_AND_GRADS_DEFAULT) + return get_scalar_param(param_dict[FP16], FP16_MASTER_WEIGHTS_AND_GRADS, FP16_MASTER_WEIGHTS_AND_GRADS_DEFAULT) else: return False @@ -186,9 +175,7 @@ def get_fp16_auto_cast(param_dict): def get_loss_scale(param_dict): if get_fp16_enabled(param_dict): - return get_scalar_param(param_dict[FP16], - FP16_LOSS_SCALE, - FP16_LOSS_SCALE_DEFAULT) + return get_scalar_param(param_dict[FP16], FP16_LOSS_SCALE, FP16_LOSS_SCALE_DEFAULT) elif get_bfloat16_enabled(param_dict): return 1.0 else: @@ -197,8 +184,7 @@ def get_loss_scale(param_dict): def get_initial_dynamic_scale(param_dict): if get_fp16_enabled(param_dict): - initial_scale_power = get_scalar_param(param_dict[FP16], - FP16_INITIAL_SCALE_POWER, + initial_scale_power = get_scalar_param(param_dict[FP16], FP16_INITIAL_SCALE_POWER, FP16_INITIAL_SCALE_POWER_DEFAULT) elif get_bfloat16_enabled(param_dict): initial_scale_power = 0 @@ -219,18 +205,10 @@ def get_dynamic_loss_scale_args(param_dict): FP16_HYSTERESIS, ] if any(arg in list(fp16_dict.keys()) for arg in dynamic_loss_args): - init_scale = get_scalar_param(fp16_dict, - FP16_INITIAL_SCALE_POWER, - FP16_INITIAL_SCALE_POWER_DEFAULT) - scale_window = get_scalar_param(fp16_dict, - FP16_LOSS_SCALE_WINDOW, - FP16_LOSS_SCALE_WINDOW_DEFAULT) - delayed_shift = get_scalar_param(fp16_dict, - FP16_HYSTERESIS, - FP16_HYSTERESIS_DEFAULT) - min_loss_scale = get_scalar_param(fp16_dict, - FP16_MIN_LOSS_SCALE, - FP16_MIN_LOSS_SCALE_DEFAULT) + init_scale = get_scalar_param(fp16_dict, FP16_INITIAL_SCALE_POWER, FP16_INITIAL_SCALE_POWER_DEFAULT) + scale_window = get_scalar_param(fp16_dict, FP16_LOSS_SCALE_WINDOW, FP16_LOSS_SCALE_WINDOW_DEFAULT) + delayed_shift = get_scalar_param(fp16_dict, FP16_HYSTERESIS, FP16_HYSTERESIS_DEFAULT) + min_loss_scale = get_scalar_param(fp16_dict, FP16_MIN_LOSS_SCALE, FP16_MIN_LOSS_SCALE_DEFAULT) loss_scale_args = { INITIAL_LOSS_SCALE: 2**init_scale, SCALE_WINDOW: 
scale_window, @@ -242,9 +220,7 @@ def get_dynamic_loss_scale_args(param_dict): def get_gradient_accumulation_steps(param_dict): - return get_scalar_param(param_dict, - GRADIENT_ACCUMULATION_STEPS, - GRADIENT_ACCUMULATION_STEPS_DEFAULT) + return get_scalar_param(param_dict, GRADIENT_ACCUMULATION_STEPS, GRADIENT_ACCUMULATION_STEPS_DEFAULT) def get_sparse_gradients_enabled(param_dict): @@ -252,9 +228,7 @@ def get_sparse_gradients_enabled(param_dict): def get_communication_data_type(param_dict): - val = get_scalar_param(param_dict, - COMMUNICATION_DATA_TYPE, - COMMUNICATION_DATA_TYPE_DEFAULT) + val = get_scalar_param(param_dict, COMMUNICATION_DATA_TYPE, COMMUNICATION_DATA_TYPE_DEFAULT) val = val.lower() if val is not None else val if val is None: return val # we must determine it by other parameters @@ -265,9 +239,7 @@ def get_communication_data_type(param_dict): elif val == "bfp16": return torch.bfloat16 - raise ValueError( - f"Invalid communication_data_type. Supported data types: ['fp16', 'bfp16', 'fp32']. Got: {val}" - ) + raise ValueError(f"Invalid communication_data_type. Supported data types: ['fp16', 'bfp16', 'fp32']. Got: {val}") def get_prescale_gradients(param_dict): @@ -275,9 +247,7 @@ def get_prescale_gradients(param_dict): def get_gradient_predivide_factor(param_dict): - return get_scalar_param(param_dict, - GRADIENT_PREDIVIDE_FACTOR, - GRADIENT_PREDIVIDE_FACTOR_DEFAULT) + return get_scalar_param(param_dict, GRADIENT_PREDIVIDE_FACTOR, GRADIENT_PREDIVIDE_FACTOR_DEFAULT) def get_steps_per_print(param_dict): @@ -312,8 +282,7 @@ def get_sparse_attention(param_dict): elif mode == SPARSE_BSLONGFORMER_MODE: return get_sparse_bslongformer_config(sparsity) else: - raise NotImplementedError( - f"Given sparsity mode, {mode}, has not been implemented yet!") + raise NotImplementedError(f"Given sparsity mode, {mode}, has not been implemented yet!") else: return None @@ -331,15 +300,9 @@ def get_sparse_fixed_config(sparsity): SPARSE_DIFFERENT_LAYOUT_PER_HEAD, SPARSE_DIFFERENT_LAYOUT_PER_HEAD_DEFAULT, ) - num_local_blocks = get_scalar_param(sparsity, - SPARSE_NUM_LOCAL_BLOCKS, - SPARSE_NUM_LOCAL_BLOCKS_DEFAULT) - num_global_blocks = get_scalar_param(sparsity, - SPARSE_NUM_GLOBAL_BLOCKS, - SPARSE_NUM_GLOBAL_BLOCKS_DEFAULT) - attention = get_scalar_param(sparsity, - SPARSE_ATTENTION_TYPE, - SPARSE_ATTENTION_TYPE_DEFAULT) + num_local_blocks = get_scalar_param(sparsity, SPARSE_NUM_LOCAL_BLOCKS, SPARSE_NUM_LOCAL_BLOCKS_DEFAULT) + num_global_blocks = get_scalar_param(sparsity, SPARSE_NUM_GLOBAL_BLOCKS, SPARSE_NUM_GLOBAL_BLOCKS_DEFAULT) + attention = get_scalar_param(sparsity, SPARSE_ATTENTION_TYPE, SPARSE_ATTENTION_TYPE_DEFAULT) horizontal_global_attention = get_scalar_param( sparsity, SPARSE_HORIZONTAL_GLOBAL_ATTENTION, @@ -370,23 +333,15 @@ def get_sparse_variable_config(sparsity): SPARSE_DIFFERENT_LAYOUT_PER_HEAD, SPARSE_DIFFERENT_LAYOUT_PER_HEAD_DEFAULT, ) - num_random_blocks = get_scalar_param(sparsity, - SPARSE_NUM_RANDOM_BLOCKS, - SPARSE_NUM_RANDOM_BLOCKS_DEFAULT) - local_window_blocks = get_scalar_param(sparsity, - SPARSE_LOCAL_WINDOW_BLOCKS, - SPARSE_LOCAL_WINDOW_BLOCKS_DEFAULT) - global_block_indices = get_scalar_param(sparsity, - SPARSE_GLOBAL_BLOCK_INDICES, - SPARSE_GLOBAL_BLOCK_INDICES_DEFAULT) + num_random_blocks = get_scalar_param(sparsity, SPARSE_NUM_RANDOM_BLOCKS, SPARSE_NUM_RANDOM_BLOCKS_DEFAULT) + local_window_blocks = get_scalar_param(sparsity, SPARSE_LOCAL_WINDOW_BLOCKS, SPARSE_LOCAL_WINDOW_BLOCKS_DEFAULT) + global_block_indices = get_scalar_param(sparsity, 
SPARSE_GLOBAL_BLOCK_INDICES, SPARSE_GLOBAL_BLOCK_INDICES_DEFAULT) global_block_end_indices = get_scalar_param( sparsity, SPARSE_GLOBAL_BLOCK_END_INDICES, SPARSE_GLOBAL_BLOCK_END_INDICES_DEFAULT, ) - attention = get_scalar_param(sparsity, - SPARSE_ATTENTION_TYPE, - SPARSE_ATTENTION_TYPE_DEFAULT) + attention = get_scalar_param(sparsity, SPARSE_ATTENTION_TYPE, SPARSE_ATTENTION_TYPE_DEFAULT) horizontal_global_attention = get_scalar_param( sparsity, SPARSE_HORIZONTAL_GLOBAL_ATTENTION, @@ -413,17 +368,13 @@ def get_sparse_bigbird_config(sparsity): SPARSE_DIFFERENT_LAYOUT_PER_HEAD, SPARSE_DIFFERENT_LAYOUT_PER_HEAD_DEFAULT, ) - num_random_blocks = get_scalar_param(sparsity, - SPARSE_NUM_RANDOM_BLOCKS, - SPARSE_NUM_RANDOM_BLOCKS_DEFAULT) + num_random_blocks = get_scalar_param(sparsity, SPARSE_NUM_RANDOM_BLOCKS, SPARSE_NUM_RANDOM_BLOCKS_DEFAULT) num_sliding_window_blocks = get_scalar_param( sparsity, SPARSE_NUM_SLIDING_WINDOW_BLOCKS, SPARSE_NUM_SLIDING_WINDOW_BLOCKS_DEFAULT, ) - num_global_blocks = get_scalar_param(sparsity, - SPARSE_NUM_GLOBAL_BLOCKS, - SPARSE_NUM_GLOBAL_BLOCKS_DEFAULT) + num_global_blocks = get_scalar_param(sparsity, SPARSE_NUM_GLOBAL_BLOCKS, SPARSE_NUM_GLOBAL_BLOCKS_DEFAULT) return { SPARSE_MODE: SPARSE_BIGBIRD_MODE, @@ -447,9 +398,7 @@ def get_sparse_bslongformer_config(sparsity): SPARSE_NUM_SLIDING_WINDOW_BLOCKS, SPARSE_NUM_SLIDING_WINDOW_BLOCKS_DEFAULT, ) - global_block_indices = get_scalar_param(sparsity, - SPARSE_GLOBAL_BLOCK_INDICES, - SPARSE_GLOBAL_BLOCK_INDICES_DEFAULT) + global_block_indices = get_scalar_param(sparsity, SPARSE_GLOBAL_BLOCK_INDICES, SPARSE_GLOBAL_BLOCK_INDICES_DEFAULT) global_block_end_indices = get_scalar_param( sparsity, SPARSE_GLOBAL_BLOCK_END_INDICES, @@ -502,8 +451,7 @@ def get_optimizer_name(param_dict): def get_optimizer_params(param_dict): - if (get_optimizer_name(param_dict) is not None - and OPTIMIZER_PARAMS in param_dict[OPTIMIZER].keys()): + if (get_optimizer_name(param_dict) is not None and OPTIMIZER_PARAMS in param_dict[OPTIMIZER].keys()): return param_dict[OPTIMIZER][OPTIMIZER_PARAMS] else: return None @@ -525,15 +473,11 @@ def get_optimizer_legacy_fusion(param_dict): def get_zero_allow_untested_optimizer(param_dict): - return get_scalar_param(param_dict, - ZERO_ALLOW_UNTESTED_OPTIMIZER, - ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT) + return get_scalar_param(param_dict, ZERO_ALLOW_UNTESTED_OPTIMIZER, ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT) def get_zero_force_ds_cpu_optimizer(param_dict): - return get_scalar_param(param_dict, - ZERO_FORCE_DS_CPU_OPTIMIZER, - ZERO_FORCE_DS_CPU_OPTIMIZER_DEFAULT) + return get_scalar_param(param_dict, ZERO_FORCE_DS_CPU_OPTIMIZER, ZERO_FORCE_DS_CPU_OPTIMIZER_DEFAULT) def get_scheduler_name(param_dict): @@ -544,8 +488,7 @@ def get_scheduler_name(param_dict): def get_scheduler_params(param_dict): - if (get_scheduler_name(param_dict) is not None - and SCHEDULER_PARAMS in param_dict[SCHEDULER].keys()): + if (get_scheduler_name(param_dict) is not None and SCHEDULER_PARAMS in param_dict[SCHEDULER].keys()): return param_dict[SCHEDULER][SCHEDULER_PARAMS] else: return None @@ -564,9 +507,7 @@ def get_train_micro_batch_size_per_gpu(param_dict): def get_wall_clock_breakdown(param_dict): - return get_scalar_param(param_dict, - WALL_CLOCK_BREAKDOWN, - WALL_CLOCK_BREAKDOWN_DEFAULT) + return get_scalar_param(param_dict, WALL_CLOCK_BREAKDOWN, WALL_CLOCK_BREAKDOWN_DEFAULT) def get_memory_breakdown(param_dict): @@ -602,45 +543,35 @@ def get_eigenvalue_config(param_dict): def get_eigenvalue_enabled(param_dict): if EIGENVALUE in 
param_dict.keys(): - return get_scalar_param(param_dict[EIGENVALUE], - EIGENVALUE_ENABLED, - EIGENVALUE_ENABLED_DEFAULT) + return get_scalar_param(param_dict[EIGENVALUE], EIGENVALUE_ENABLED, EIGENVALUE_ENABLED_DEFAULT) else: return EIGENVALUE_ENABLED_DEFAULT def get_eigenvalue_verbose(param_dict): if EIGENVALUE in param_dict.keys(): - return get_scalar_param(param_dict[EIGENVALUE], - EIGENVALUE_VERBOSE, - EIGENVALUE_VERBOSE_DEFAULT) + return get_scalar_param(param_dict[EIGENVALUE], EIGENVALUE_VERBOSE, EIGENVALUE_VERBOSE_DEFAULT) else: return EIGENVALUE_VERBOSE_DEFAULT def get_eigenvalue_max_iter(param_dict): if EIGENVALUE in param_dict.keys(): - return get_scalar_param(param_dict[EIGENVALUE], - EIGENVALUE_MAX_ITER, - EIGENVALUE_MAX_ITER_DEFAULT) + return get_scalar_param(param_dict[EIGENVALUE], EIGENVALUE_MAX_ITER, EIGENVALUE_MAX_ITER_DEFAULT) else: return EIGENVALUE_MAX_ITER_DEFAULT def get_eigenvalue_tol(param_dict): if EIGENVALUE in param_dict.keys(): - return get_scalar_param(param_dict[EIGENVALUE], - EIGENVALUE_TOL, - EIGENVALUE_TOL_DEFAULT) + return get_scalar_param(param_dict[EIGENVALUE], EIGENVALUE_TOL, EIGENVALUE_TOL_DEFAULT) else: return EIGENVALUE_TOL_DEFAULT def get_eigenvalue_stability(param_dict): if EIGENVALUE in param_dict.keys(): - return get_scalar_param(param_dict[EIGENVALUE], - EIGENVALUE_STABILITY, - EIGENVALUE_STABILITY_DEFAULT) + return get_scalar_param(param_dict[EIGENVALUE], EIGENVALUE_STABILITY, EIGENVALUE_STABILITY_DEFAULT) else: return EIGENVALUE_STABILITY_DEFAULT @@ -658,18 +589,14 @@ def get_eigenvalue_gas_boundary_resolution(param_dict): def get_eigenvalue_layer_name(param_dict): if EIGENVALUE in param_dict.keys(): - return get_scalar_param(param_dict[EIGENVALUE], - EIGENVALUE_LAYER_NAME, - EIGENVALUE_LAYER_NAME_DEFAULT) + return get_scalar_param(param_dict[EIGENVALUE], EIGENVALUE_LAYER_NAME, EIGENVALUE_LAYER_NAME_DEFAULT) else: return EIGENVALUE_LAYER_NAME_DEFAULT def get_eigenvalue_layer_num(param_dict): if EIGENVALUE in param_dict.keys(): - return get_scalar_param(param_dict[EIGENVALUE], - EIGENVALUE_LAYER_NUM, - EIGENVALUE_LAYER_NUM_DEFAULT) + return get_scalar_param(param_dict[EIGENVALUE], EIGENVALUE_LAYER_NUM, EIGENVALUE_LAYER_NUM_DEFAULT) else: return EIGENVALUE_LAYER_NUM_DEFAULT @@ -683,35 +610,29 @@ def get_data_types_params(param_dict): def get_checkpoint_tag_validation_mode(checkpoint_params): - tag_validation_mode = checkpoint_params.get(CHECKPOINT_TAG_VALIDATION, - CHECKPOINT_TAG_VALIDATION_DEFAULT) + tag_validation_mode = checkpoint_params.get(CHECKPOINT_TAG_VALIDATION, CHECKPOINT_TAG_VALIDATION_DEFAULT) tag_validation_mode = tag_validation_mode.upper() if tag_validation_mode in CHECKPOINT_TAG_VALIDATION_MODES: return tag_validation_mode else: raise DeepSpeedConfigError( "Checkpoint config contains invalid tag_validation " - f"value of {tag_validation_mode}, expecting one of {CHECKPOINT_TAG_VALIDATION_MODES}" - ) + f"value of {tag_validation_mode}, expecting one of {CHECKPOINT_TAG_VALIDATION_MODES}") def get_checkpoint_parallel_write_pipeline(checkpoint_params): par_write_params = checkpoint_params.get(CHECKPOINT_PARALLEL_WRITE, {}) - par_write_pipeline = par_write_params.get( - CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE, - CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE_DEFAULT) + par_write_pipeline = par_write_params.get(CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE, + CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE_DEFAULT) if par_write_pipeline in [True, False]: return par_write_pipeline else: - raise DeepSpeedConfigError( - 
"checkpoint::parallel_write::pipeline_stage " - f"value of '{par_write_pipeline}' is invalid, expecting: true or false") + raise DeepSpeedConfigError("checkpoint::parallel_write::pipeline_stage " + f"value of '{par_write_pipeline}' is invalid, expecting: true or false") def get_dataloader_drop_last(param_dict): - return get_scalar_param(param_dict, - DATALOADER_DROP_LAST, - DATALOADER_DROP_LAST_DEFAULT) + return get_scalar_param(param_dict, DATALOADER_DROP_LAST, DATALOADER_DROP_LAST_DEFAULT) '''Write deepspeed config files by modifying basic templates. @@ -719,6 +640,7 @@ Can be used for quickly changing parameters via command line parameters.''' class DeepSpeedConfigWriter: + def __init__(self, data=None): self.data = data if data is not None else {} @@ -726,9 +648,7 @@ class DeepSpeedConfigWriter: self.data[key] = value def load_config(self, filename): - self.data = json.load(open(filename, - "r"), - object_pairs_hook=dict_raise_error_on_duplicate_keys) + self.data = json.load(open(filename, "r"), object_pairs_hook=dict_raise_error_on_duplicate_keys) def write_config(self, filename): with open(filename, "w") as outfile: @@ -736,15 +656,13 @@ class DeepSpeedConfigWriter: class DeepSpeedConfig(object): + def __init__(self, config: Union[str, dict], mpu=None): super(DeepSpeedConfig, self).__init__() if isinstance(config, dict): self._param_dict = config elif os.path.exists(config): - self._param_dict = hjson.load( - open(config, - "r"), - object_pairs_hook=dict_raise_error_on_duplicate_keys) + self._param_dict = hjson.load(open(config, "r"), object_pairs_hook=dict_raise_error_on_duplicate_keys) else: try: config_decoded = base64.urlsafe_b64decode(config).decode('utf-8') @@ -778,24 +696,18 @@ class DeepSpeedConfig(object): # Ensure the resource scheduler saw the same elastic config we are using at runtime ensure_immutable_elastic_config(runtime_elastic_config_dict=elastic_dict) - self.elastic_model_parallel_size = elastic_dict.get( - MODEL_PARLLEL_SIZE, - MODEL_PARLLEL_SIZE_DEFAULT) + self.elastic_model_parallel_size = elastic_dict.get(MODEL_PARLLEL_SIZE, MODEL_PARLLEL_SIZE_DEFAULT) if self.elastic_model_parallel_size < 1: - raise ElasticityConfigError( - "Model-Parallel size cannot be less than 1, " - f"given model-parallel size: {self.elastic_model_parallel_size}") + raise ElasticityConfigError("Model-Parallel size cannot be less than 1, " + f"given model-parallel size: {self.elastic_model_parallel_size}") - self.num_gpus_per_node = elastic_dict.get(NUM_GPUS_PER_NODE, - NUM_GPUS_PER_NODE_DEFAULT) + self.num_gpus_per_node = elastic_dict.get(NUM_GPUS_PER_NODE, NUM_GPUS_PER_NODE_DEFAULT) if self.num_gpus_per_node < 1: - raise ElasticityConfigError( - "NUmber of GPUs per node cannot be less than 1, " - f"given number of GPUs per node: {self.num_gpus_per_node}") + raise ElasticityConfigError("NUmber of GPUs per node cannot be less than 1, " + f"given number of GPUs per node: {self.num_gpus_per_node}") - ignore_non_elastic_batch_info = elastic_dict.get( - IGNORE_NON_ELASTIC_BATCH_INFO, - IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT) + ignore_non_elastic_batch_info = elastic_dict.get(IGNORE_NON_ELASTIC_BATCH_INFO, + IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT) if not ignore_non_elastic_batch_info: batch_params = [ @@ -813,23 +725,17 @@ class DeepSpeedConfig(object): # micro_bsz * world_size * gas = total_batch_size # gas = total_batch_size // (micro_bsz * world_size) - gradient_accu_steps = final_batch_size // (micro_batch_size * - self.world_size) + gradient_accu_steps = final_batch_size // (micro_batch_size 
* self.world_size) if TRAIN_BATCH_SIZE in self._param_dict: - logger.warning( - "[Elasticity] overriding training_batch_size: " - f"{self._param_dict[TRAIN_BATCH_SIZE]} -> {final_batch_size}") + logger.warning("[Elasticity] overriding training_batch_size: " + f"{self._param_dict[TRAIN_BATCH_SIZE]} -> {final_batch_size}") if TRAIN_MICRO_BATCH_SIZE_PER_GPU in self._param_dict: - logger.warning( - "[Elasticity] overriding train_micro_batch_size_per_gpu: " - f"{self._param_dict[TRAIN_MICRO_BATCH_SIZE_PER_GPU]} -> {micro_batch_size}" - ) + logger.warning("[Elasticity] overriding train_micro_batch_size_per_gpu: " + f"{self._param_dict[TRAIN_MICRO_BATCH_SIZE_PER_GPU]} -> {micro_batch_size}") if GRADIENT_ACCUMULATION_STEPS in self._param_dict: - logger.warning( - "[Elasticity] overriding gradient_accumulation_steps: " - f"{self._param_dict[GRADIENT_ACCUMULATION_STEPS]} -> {gradient_accu_steps}" - ) + logger.warning("[Elasticity] overriding gradient_accumulation_steps: " + f"{self._param_dict[GRADIENT_ACCUMULATION_STEPS]} -> {gradient_accu_steps}") logger.info(f"[Elasticity] valid GPU counts: {valid_gpus}") @@ -845,8 +751,7 @@ class DeepSpeedConfig(object): def _initialize_params(self, param_dict): self.train_batch_size = get_train_batch_size(param_dict) #print(f"beginning get_train_batch_size = {get_train_batch_size}") - self.train_micro_batch_size_per_gpu = get_train_micro_batch_size_per_gpu( - param_dict) + self.train_micro_batch_size_per_gpu = get_train_micro_batch_size_per_gpu(param_dict) self.gradient_accumulation_steps = get_gradient_accumulation_steps(param_dict) self.steps_per_print = get_steps_per_print(param_dict) self.dump_state = get_dump_state(param_dict) @@ -861,8 +766,7 @@ class DeepSpeedConfig(object): self.zero_optimization_stage = self.zero_config.stage self.zero_enabled = self.zero_optimization_stage > 0 - self.activation_checkpointing_config = DeepSpeedActivationCheckpointingConfig( - param_dict) + self.activation_checkpointing_config = DeepSpeedActivationCheckpointingConfig(param_dict) self.comms_config = DeepSpeedCommsConfig(param_dict) self.monitor_config = get_monitor_config(param_dict) @@ -871,9 +775,9 @@ class DeepSpeedConfig(object): self.fp16_enabled = get_fp16_enabled(param_dict) self.fp16_auto_cast = get_fp16_auto_cast(param_dict) self.bfloat16_enabled = get_bfloat16_enabled(param_dict) - assert not (self.fp16_enabled and self.bfloat16_enabled), 'bfloat16 and fp16 modes cannot be simultaneously enabled' - self.fp16_master_weights_and_gradients = get_fp16_master_weights_and_grads_enabled( - param_dict) + assert not (self.fp16_enabled + and self.bfloat16_enabled), 'bfloat16 and fp16 modes cannot be simultaneously enabled' + self.fp16_master_weights_and_gradients = get_fp16_master_weights_and_grads_enabled(param_dict) self.amp_enabled = get_amp_enabled(param_dict) self.amp_params = get_amp_params(param_dict) self.loss_scale = get_loss_scale(param_dict) @@ -883,15 +787,13 @@ class DeepSpeedConfig(object): self.compression_config = get_compression_config(param_dict) self.optimizer_name = get_optimizer_name(param_dict) - if (self.optimizer_name is not None - and self.optimizer_name.lower() in DEEPSPEED_OPTIMIZERS): + if (self.optimizer_name is not None and self.optimizer_name.lower() in DEEPSPEED_OPTIMIZERS): self.optimizer_name = self.optimizer_name.lower() self.optimizer_params = get_optimizer_params(param_dict) self.optimizer_legacy_fusion = get_optimizer_legacy_fusion(param_dict) - self.zero_allow_untested_optimizer = get_zero_allow_untested_optimizer( - param_dict) + 
self.zero_allow_untested_optimizer = get_zero_allow_untested_optimizer(param_dict) self.zero_force_ds_cpu_optimizer = get_zero_force_ds_cpu_optimizer(param_dict) @@ -899,8 +801,7 @@ class DeepSpeedConfig(object): self.scheduler_params = get_scheduler_params(param_dict) self.flops_profiler_config = DeepSpeedFlopsProfilerConfig(param_dict) - self.wall_clock_breakdown = (get_wall_clock_breakdown(param_dict) - | self.flops_profiler_config.enabled) + self.wall_clock_breakdown = (get_wall_clock_breakdown(param_dict) | self.flops_profiler_config.enabled) self.memory_breakdown = get_memory_breakdown(param_dict) self.autotuning_config = DeepSpeedAutotuningConfig(param_dict) @@ -929,20 +830,16 @@ class DeepSpeedConfig(object): checkpoint_params = get_checkpoint_params(param_dict) validation_mode = get_checkpoint_tag_validation_mode(checkpoint_params) - self.checkpoint_tag_validation_enabled = (validation_mode != - ValidationMode.IGNORE) + self.checkpoint_tag_validation_enabled = (validation_mode != ValidationMode.IGNORE) self.checkpoint_tag_validation_fail = validation_mode == ValidationMode.FAIL - self.load_universal_checkpoint = checkpoint_params.get( - LOAD_UNIVERSAL_CHECKPOINT, - LOAD_UNIVERSAL_CHECKPOINT_DEFAULT) + self.load_universal_checkpoint = checkpoint_params.get(LOAD_UNIVERSAL_CHECKPOINT, + LOAD_UNIVERSAL_CHECKPOINT_DEFAULT) - self.use_node_local_storage = checkpoint_params.get( - USE_NODE_LOCAL_STORAGE_CHECKPOINT, - USE_NODE_LOCAL_STORAGE_CHECKPOINT_DEFAULT) + self.use_node_local_storage = checkpoint_params.get(USE_NODE_LOCAL_STORAGE_CHECKPOINT, + USE_NODE_LOCAL_STORAGE_CHECKPOINT_DEFAULT) data_types_params = get_data_types_params(param_dict) - self.grad_accum_dtype = data_types_params.get(GRAD_ACCUM_DTYPE, - GRAD_ACCUM_DTYPE_DEFAULT) + self.grad_accum_dtype = data_types_params.get(GRAD_ACCUM_DTYPE, GRAD_ACCUM_DTYPE_DEFAULT) par_write_pipe = get_checkpoint_parallel_write_pipeline(checkpoint_params) self.checkpoint_parallel_write_pipeline = par_write_pipe @@ -959,23 +856,16 @@ class DeepSpeedConfig(object): micro_batch = self.train_micro_batch_size_per_gpu grad_acc = self.gradient_accumulation_steps - assert ( - train_batch > 0 - ), f"Train batch size: {train_batch} has to be greater than 0" + assert (train_batch > 0), f"Train batch size: {train_batch} has to be greater than 0" - assert ( - micro_batch > 0 - ), f"Micro batch size per gpu: {micro_batch} has to be greater than 0" + assert (micro_batch > 0), f"Micro batch size per gpu: {micro_batch} has to be greater than 0" - assert ( - grad_acc > 0 - ), f"Gradient accumulation steps: {grad_acc} has to be greater than 0" + assert (grad_acc > 0), f"Gradient accumulation steps: {grad_acc} has to be greater than 0" assert train_batch == micro_batch * grad_acc * self.world_size, ( f"Check batch related parameters. 
train_batch_size is not equal " "to micro_batch_per_gpu * gradient_acc_step * world_size " - f"{train_batch} != {micro_batch} * {grad_acc} * {self.world_size}" - ) + f"{train_batch} != {micro_batch} * {grad_acc} * {self.world_size}") def _set_batch_related_parameters(self): @@ -1038,8 +928,7 @@ class DeepSpeedConfig(object): sort_keys=True, indent=4, cls=ScientificNotationEncoder, - separators=(",", - ":"), + separators=(",", ":"), ))) def print(self, name): @@ -1052,20 +941,16 @@ class DeepSpeedConfig(object): self.print_user_config() def _do_error_check(self): - assert ( - self.train_micro_batch_size_per_gpu - ), "DeepSpeedConfig: {} is not defined".format(TRAIN_MICRO_BATCH_SIZE_PER_GPU) + assert (self.train_micro_batch_size_per_gpu + ), "DeepSpeedConfig: {} is not defined".format(TRAIN_MICRO_BATCH_SIZE_PER_GPU) assert ( - self.gradient_accumulation_steps - ), "DeepSpeedConfig: {} is not defined".format(GRADIENT_ACCUMULATION_STEPS) + self.gradient_accumulation_steps), "DeepSpeedConfig: {} is not defined".format(GRADIENT_ACCUMULATION_STEPS) if self.zero_enabled: - assert ( - self.zero_optimization_stage <= ZeroStageEnum.max_stage - ), "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format( - ZeroStageEnum.max_stage - ) + assert (self.zero_optimization_stage <= + ZeroStageEnum.max_stage), "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format( + ZeroStageEnum.max_stage) if self.fp16_master_weights_and_gradients: assert self.zero_enabled and self.zero_optimization_stage == ZeroStageEnum.gradients, "Fp16_master_weights_and_grads is only supported with ZeRO Stage 2 for now." @@ -1076,19 +961,15 @@ class DeepSpeedConfig(object): vocabulary_size = self._param_dict.get(VOCABULARY_SIZE, VOCABULARY_SIZE_DEFAULT) if vocabulary_size and vocabulary_size % TENSOR_CORE_ALIGN_SIZE != 0: logger.warning( - "DeepSpeedConfig: vocabulary size {} is not aligned to {}, may import tensor core utilization." 
- .format(vocabulary_size, - TENSOR_CORE_ALIGN_SIZE)) + "DeepSpeedConfig: vocabulary size {} is not aligned to {}, may import tensor core utilization.".format( + vocabulary_size, TENSOR_CORE_ALIGN_SIZE)) - if (self.optimizer_params is not None - and MAX_GRAD_NORM in self.optimizer_params.keys() + if (self.optimizer_params is not None and MAX_GRAD_NORM in self.optimizer_params.keys() and self.optimizer_params[MAX_GRAD_NORM] > 0): if fp16_enabled: if self.global_rank == 0: - logger.warning( - "DeepSpeedConfig: In FP16 mode, DeepSpeed will pass {}:{} to FP16 wrapper" - .format(MAX_GRAD_NORM, - self.optimizer_params[MAX_GRAD_NORM])) + logger.warning("DeepSpeedConfig: In FP16 mode, DeepSpeed will pass {}:{} to FP16 wrapper".format( + MAX_GRAD_NORM, self.optimizer_params[MAX_GRAD_NORM])) else: if self.global_rank == 0: logger.warning( diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py index cd262f14a..c4a4f86a6 100755 --- a/deepspeed/runtime/config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -50,15 +50,10 @@ class DeepSpeedConfigModel(BaseModel): new_param='my_new_field', new_param_fn=(lambda x: int(x))) """ + def __init__(self, strict=False, **data): - if ( - not strict - ): # This is temporary until we refactor all DS configs, allows HF to load models - data = { - k: v - for k, - v in data.items() if (v != "auto" or k == "replace_method") - } + if (not strict): # This is temporary until we refactor all DS configs, allows HF to load models + data = {k: v for k, v in data.items() if (v != "auto" or k == "replace_method")} super().__init__(**data) self._deprecated_fields_check(self) @@ -73,8 +68,7 @@ class DeepSpeedConfigModel(BaseModel): dep_msg = kwargs.get("deprecated_msg", "") if dep_param in fields_set: logger.warning(f"Config parameter {dep_param} is deprecated" + - (f" use {new_param} instead" if new_param else "") + - (f". {dep_msg}" if dep_msg else "")) + (f" use {new_param} instead" if new_param else "") + (f". {dep_msg}" if dep_msg else "")) # Check if there is a new param and if it should be set with a value if new_param and kwargs.get("set_new_param", True): # Remove the deprecate field if there is a replacing field @@ -89,9 +83,7 @@ class DeepSpeedConfigModel(BaseModel): if len(new_param_nested) > 1: # If the new param exists in a subconfig, we need to get # the fields set for that subconfig - pydantic_config = reduce(getattr, - new_param_nested[:-1], - pydantic_config) + pydantic_config = reduce(getattr, new_param_nested[:-1], pydantic_config) fields_set = pydantic_config.__fields_set__ new_param_name = new_param_nested[-1] assert ( @@ -101,9 +93,7 @@ class DeepSpeedConfigModel(BaseModel): try: setattr(pydantic_config, new_param_name, param_value) except Exception as e: - logger.error( - f"Tried setting value for '{new_param}' with value from deprecated '{dep_param}'" - ) + logger.error(f"Tried setting value for '{new_param}' with value from deprecated '{dep_param}'") raise e def _deprecated_fields_check(self, pydantic_config): @@ -127,6 +117,7 @@ class pp_int(int): string of the integer. For example, print(pp_int(1e5)) will return "10,000". This is useful mainly for auto-generated documentation purposes. 
""" + def __new__(cls, val, custom_print_str=None): inst = super().__new__(cls, val) inst.custom_print_str = custom_print_str @@ -148,6 +139,7 @@ class ScientificNotationEncoder(json.JSONEncoder): Just pass ``cls=ScientificNotationEncoder`` to ``json.dumps`` to activate it """ + def iterencode(self, o, _one_shot=False, level=0): indent = self.indent if self.indent is not None else 4 prefix_close = " " * level * indent @@ -161,10 +153,7 @@ class ScientificNotationEncoder(json.JSONEncoder): else: return f"{o}" elif isinstance(o, collections.abc.Mapping): - x = [ - f'\n{prefix}"{k}": {self.iterencode(v, level=level)}' for k, - v in o.items() - ] + x = [f'\n{prefix}"{k}": {self.iterencode(v, level=level)}' for k, v in o.items()] return "{" + ", ".join(x) + f"\n{prefix_close}" + "}" elif isinstance(o, collections.abc.Sequence) and not isinstance(o, str): return f"[{ f', '.join(map(self.iterencode, o)) }]" @@ -175,6 +164,7 @@ class DeepSpeedConfigObject(object): """ For json serialization """ + def repr(self): return self.__dict__ diff --git a/deepspeed/runtime/constants.py b/deepspeed/runtime/constants.py index 450847126..2787e49d8 100755 --- a/deepspeed/runtime/constants.py +++ b/deepspeed/runtime/constants.py @@ -370,11 +370,7 @@ class ValidationMode: CHECKPOINT = "checkpoint" CHECKPOINT_TAG_VALIDATION = "tag_validation" CHECKPOINT_TAG_VALIDATION_DEFAULT = ValidationMode.WARN -CHECKPOINT_TAG_VALIDATION_MODES = [ - ValidationMode.WARN, - ValidationMode.IGNORE, - ValidationMode.FAIL -] +CHECKPOINT_TAG_VALIDATION_MODES = [ValidationMode.WARN, ValidationMode.IGNORE, ValidationMode.FAIL] LOAD_UNIVERSAL_CHECKPOINT = "load_universal" LOAD_UNIVERSAL_CHECKPOINT_DEFAULT = False diff --git a/deepspeed/runtime/data_pipeline/config.py b/deepspeed/runtime/data_pipeline/config.py index eefa1402e..e326f2dd6 100644 --- a/deepspeed/runtime/data_pipeline/config.py +++ b/deepspeed/runtime/data_pipeline/config.py @@ -24,18 +24,14 @@ def get_data_efficiency_config(param_dict): def get_data_efficiency_enabled(param_dict): if DATA_EFFICIENCY in param_dict.keys(): - return get_scalar_param(param_dict[DATA_EFFICIENCY], - DATA_EFFICIENCY_ENABLED, - DATA_EFFICIENCY_ENABLED_DEFAULT) + return get_scalar_param(param_dict[DATA_EFFICIENCY], DATA_EFFICIENCY_ENABLED, DATA_EFFICIENCY_ENABLED_DEFAULT) else: return False def get_data_efficiency_seed(param_dict): if DATA_EFFICIENCY in param_dict.keys(): - return get_scalar_param(param_dict[DATA_EFFICIENCY], - DATA_EFFICIENCY_SEED, - DATA_EFFICIENCY_SEED_DEFAULT) + return get_scalar_param(param_dict[DATA_EFFICIENCY], DATA_EFFICIENCY_SEED, DATA_EFFICIENCY_SEED_DEFAULT) else: return DATA_EFFICIENCY_SEED_DEFAULT @@ -55,26 +51,21 @@ def get_data_sampling(param_dict): def get_data_sampling_enabled(param_dict): if DATA_SAMPLING in param_dict.keys(): - return get_scalar_param(param_dict[DATA_SAMPLING], - DATA_SAMPLING_ENABLED, - DATA_SAMPLING_ENABLED_DEFAULT) + return get_scalar_param(param_dict[DATA_SAMPLING], DATA_SAMPLING_ENABLED, DATA_SAMPLING_ENABLED_DEFAULT) else: return False def get_data_sampling_num_epochs(param_dict): if DATA_SAMPLING in param_dict.keys(): - return get_scalar_param(param_dict[DATA_SAMPLING], - DATA_SAMPLING_NUM_EPOCHS, - DATA_SAMPLING_NUM_EPOCHS_DEFAULT) + return get_scalar_param(param_dict[DATA_SAMPLING], DATA_SAMPLING_NUM_EPOCHS, DATA_SAMPLING_NUM_EPOCHS_DEFAULT) else: return DATA_SAMPLING_NUM_EPOCHS_DEFAULT def get_data_sampling_num_workers(param_dict): if DATA_SAMPLING in param_dict.keys(): - return get_scalar_param(param_dict[DATA_SAMPLING], - 
DATA_SAMPLING_NUM_WORKERS, + return get_scalar_param(param_dict[DATA_SAMPLING], DATA_SAMPLING_NUM_WORKERS, DATA_SAMPLING_NUM_WORKERS_DEFAULT) else: return DATA_SAMPLING_NUM_WORKERS_DEFAULT @@ -87,7 +78,8 @@ def get_curriculum_learning(param_dict): param_dict[CURRICULUM_LEARNING] = {} sub_param_dict = param_dict[CURRICULUM_LEARNING] if output[CURRICULUM_LEARNING_ENABLED]: - assert CURRICULUM_LEARNING_METRICS in sub_param_dict.keys(), f"Curriculum learning is enabled, {CURRICULUM_LEARNING_METRICS} must be specified" + assert CURRICULUM_LEARNING_METRICS in sub_param_dict.keys( + ), f"Curriculum learning is enabled, {CURRICULUM_LEARNING_METRICS} must be specified" for key, val in get_curriculum_learning_params(param_dict).items(): output[key] = val return output @@ -95,8 +87,7 @@ def get_curriculum_learning(param_dict): def get_curriculum_learning_enabled(param_dict): if CURRICULUM_LEARNING in param_dict.keys(): - return get_scalar_param(param_dict[CURRICULUM_LEARNING], - CURRICULUM_LEARNING_ENABLED, + return get_scalar_param(param_dict[CURRICULUM_LEARNING], CURRICULUM_LEARNING_ENABLED, CURRICULUM_LEARNING_ENABLED_DEFAULT) else: return False @@ -113,8 +104,7 @@ def get_curriculum_learning_params(param_dict): def get_curriculum_enabled_legacy(param_dict): if CURRICULUM_LEARNING_LEGACY in param_dict.keys(): - return get_scalar_param(param_dict[CURRICULUM_LEARNING_LEGACY], - CURRICULUM_ENABLED_LEGACY, + return get_scalar_param(param_dict[CURRICULUM_LEARNING_LEGACY], CURRICULUM_ENABLED_LEGACY, CURRICULUM_ENABLED_DEFAULT_LEGACY) else: return False @@ -142,9 +132,7 @@ def get_data_routing(param_dict): def get_data_routing_enabled(param_dict): if DATA_ROUTING in param_dict.keys(): - return get_scalar_param(param_dict[DATA_ROUTING], - DATA_ROUTING_ENABLED, - DATA_ROUTING_ENABLED_DEFAULT) + return get_scalar_param(param_dict[DATA_ROUTING], DATA_ROUTING_ENABLED, DATA_ROUTING_ENABLED_DEFAULT) else: return False @@ -164,9 +152,7 @@ def get_random_ltd(param_dict): def get_random_ltd_enabled(param_dict): if RANDOM_LTD in param_dict.keys(): - return get_scalar_param(param_dict[RANDOM_LTD], - RANDOM_LTD_ENABLED, - RANDOM_LTD_ENABLED_DEFAULT) + return get_scalar_param(param_dict[RANDOM_LTD], RANDOM_LTD_ENABLED, RANDOM_LTD_ENABLED_DEFAULT) else: return False diff --git a/deepspeed/runtime/data_pipeline/curriculum_scheduler.py b/deepspeed/runtime/data_pipeline/curriculum_scheduler.py index b4cb18c44..e8e7c807a 100644 --- a/deepspeed/runtime/data_pipeline/curriculum_scheduler.py +++ b/deepspeed/runtime/data_pipeline/curriculum_scheduler.py @@ -7,6 +7,7 @@ from .constants import * class CurriculumScheduler(object): + def __init__(self, config): super().__init__() self.state = {} @@ -16,17 +17,12 @@ class CurriculumScheduler(object): f"Curriculum learning requires the config '{CURRICULUM_LEARNING_MAX_DIFFICULTY}'" assert CURRICULUM_LEARNING_SCHEDULE_TYPE in config, \ f"Curriculum learning requires the config '{CURRICULUM_LEARNING_SCHEDULE_TYPE}'" - self.state[CURRICULUM_LEARNING_MIN_DIFFICULTY] = config[ - CURRICULUM_LEARNING_MIN_DIFFICULTY] - self.state[CURRICULUM_LEARNING_MAX_DIFFICULTY] = config[ - CURRICULUM_LEARNING_MAX_DIFFICULTY] - self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] = config[ - CURRICULUM_LEARNING_MIN_DIFFICULTY] - self.state[CURRICULUM_LEARNING_SCHEDULE_TYPE] = config[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] + self.state[CURRICULUM_LEARNING_MIN_DIFFICULTY] = config[CURRICULUM_LEARNING_MIN_DIFFICULTY] + self.state[CURRICULUM_LEARNING_MAX_DIFFICULTY] = config[CURRICULUM_LEARNING_MAX_DIFFICULTY] + 
self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] = config[CURRICULUM_LEARNING_MIN_DIFFICULTY] + self.state[CURRICULUM_LEARNING_SCHEDULE_TYPE] = config[CURRICULUM_LEARNING_SCHEDULE_TYPE] self.first_step = True - if config[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_DISCRETE: + if config[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_DISCRETE: """ The schedule_config is a list of difficulty and a list of max step belonging to each difficulty. Example json config: @@ -43,18 +39,12 @@ class CurriculumScheduler(object): f"Curriculum learning with fixed_discrete schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY}'" assert CURRICULUM_LEARNING_SCHEDULE_MAX_STEP in config[CURRICULUM_LEARNING_SCHEDULE_CONFIG], \ f"Curriculum learning with fixed_discrete schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_MAX_STEP}'" - assert len(config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] - [CURRICULUM_LEARNING_SCHEDULE_MAX_STEP]) > 0 - assert len(config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] - [CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY]) > 0 - assert len(config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] - [CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY]) == len( - config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] - [CURRICULUM_LEARNING_SCHEDULE_MAX_STEP]) + 1 - self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[ - CURRICULUM_LEARNING_SCHEDULE_CONFIG] - elif config[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_ROOT: + assert len(config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_MAX_STEP]) > 0 + assert len(config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY]) > 0 + assert len(config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY]) == len( + config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_MAX_STEP]) + 1 + self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] + elif config[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_ROOT: """ The schedule_config includes: total_curriculum_step: how many steps the curriculum learning takes to go @@ -79,15 +69,12 @@ class CurriculumScheduler(object): f"Curriculum learning with fixed_root schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP}'" assert CURRICULUM_LEARNING_SCHEDULE_ROOT_DEGREE in config[CURRICULUM_LEARNING_SCHEDULE_CONFIG], \ f"Curriculum learning with fixed_root schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_ROOT_DEGREE}'" - if config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][ - CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP] % 8 != 0: + if config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP] % 8 != 0: logger.warning( f'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.' 
) - self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[ - CURRICULUM_LEARNING_SCHEDULE_CONFIG] - elif config[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_LINEAR: + self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] + elif config[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_LINEAR: """ The schedule_config is the same as CURRICULUM_LEARNING_SCHEDULE_FIXED_ROOT but without the root_degree. @@ -100,15 +87,12 @@ class CurriculumScheduler(object): f"Curriculum learning with fixed_linear schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_TOTAL_STEP}'" assert CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP in config[CURRICULUM_LEARNING_SCHEDULE_CONFIG], \ f"Curriculum learning with fixed_linear schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP}'" - if config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][ - CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP] % 8 != 0: + if config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP] % 8 != 0: logger.warning( f'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.' ) - self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[ - CURRICULUM_LEARNING_SCHEDULE_CONFIG] - elif config[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_CUSTOM: + self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] + elif config[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_CUSTOM: """ Fully customized schedule. 
User need to provide a custom schedule function by using the set_custom_curriculum_learning_schedule API @@ -145,38 +129,28 @@ class CurriculumScheduler(object): s_state = self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] if root_degree is None: root_degree = s_state[CURRICULUM_LEARNING_SCHEDULE_ROOT_DEGREE] - next_difficulty = (float(global_steps) / - s_state[CURRICULUM_LEARNING_SCHEDULE_TOTAL_STEP])**( - 1.0 / root_degree) - next_difficulty = math.floor(next_difficulty * - (self.state[CURRICULUM_LEARNING_MAX_DIFFICULTY] - - self.state[CURRICULUM_LEARNING_MIN_DIFFICULTY]) + - self.state[CURRICULUM_LEARNING_MIN_DIFFICULTY]) - next_difficulty -= (next_difficulty % - s_state[CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP]) - next_difficulty = min(next_difficulty, - self.state[CURRICULUM_LEARNING_MAX_DIFFICULTY]) + next_difficulty = (float(global_steps) / s_state[CURRICULUM_LEARNING_SCHEDULE_TOTAL_STEP])**(1.0 / root_degree) + next_difficulty = math.floor( + next_difficulty * + (self.state[CURRICULUM_LEARNING_MAX_DIFFICULTY] - self.state[CURRICULUM_LEARNING_MIN_DIFFICULTY]) + + self.state[CURRICULUM_LEARNING_MIN_DIFFICULTY]) + next_difficulty -= (next_difficulty % s_state[CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP]) + next_difficulty = min(next_difficulty, self.state[CURRICULUM_LEARNING_MAX_DIFFICULTY]) return next_difficulty def get_difficulty(self, global_steps): - if self.state[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_DISCRETE: + if self.state[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_DISCRETE: return self.__fixed_discrete_get_difficulty(global_steps) - elif self.state[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_LINEAR: + elif self.state[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_LINEAR: return self.__fixed_root_get_difficulty(global_steps, 1) - elif self.state[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_ROOT: + elif self.state[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_ROOT: return self.__fixed_root_get_difficulty(global_steps) - elif self.state[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_CUSTOM: + elif self.state[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_CUSTOM: return self.custom_get_difficulty(global_steps) else: raise RuntimeError('Unsupported curriculum schedule type') def update_difficulty(self, global_steps): - if self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] < self.state[ - CURRICULUM_LEARNING_MAX_DIFFICULTY]: - self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] = self.get_difficulty( - global_steps) + if self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] < self.state[CURRICULUM_LEARNING_MAX_DIFFICULTY]: + self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] = self.get_difficulty(global_steps) return self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] diff --git a/deepspeed/runtime/data_pipeline/data_routing/basic_layer.py b/deepspeed/runtime/data_pipeline/data_routing/basic_layer.py index 436da9538..10023e68a 100644 --- a/deepspeed/runtime/data_pipeline/data_routing/basic_layer.py +++ b/deepspeed/runtime/data_pipeline/data_routing/basic_layer.py @@ -14,6 +14,7 @@ class RandomLayerTokenDrop(Module): """ A layer wrapper for random LTD """ + def __init__(self, layer: Module): super(RandomLayerTokenDrop, self).__init__() self.random_ltd_layer = layer @@ -52,9 +53,7 @@ class RandomLayerTokenDrop(Module): elif self.model_type == 'decoder': 
self.index_generator = gpt_sample_tokens else: - logger.warning( - "************For now, we only support encoder-only or decoder-only models************" - ) + logger.warning("************For now, we only support encoder-only or decoder-only models************") raise NotImplementedError def get_bsh(self, hidden_stats): @@ -78,40 +77,36 @@ class RandomLayerTokenDrop(Module): self.curr_micro_batch, \ self.random_ltd_num_layer, \ hidden_states.device, mask) - self.random_ltd_scheduler.state[ - RANDOM_LTD_SAMPLE_INDEX] = sampled_indices - self.random_ltd_scheduler.state[ - RANDOM_LTD_ATTENTION_MASK] = part_attention_mask + self.random_ltd_scheduler.state[RANDOM_LTD_SAMPLE_INDEX] = sampled_indices + self.random_ltd_scheduler.state[RANDOM_LTD_ATTENTION_MASK] = part_attention_mask else: - sampled_indices = self.random_ltd_scheduler.state[ - RANDOM_LTD_SAMPLE_INDEX] - part_attention_mask = self.random_ltd_scheduler.state[ - RANDOM_LTD_ATTENTION_MASK] + sampled_indices = self.random_ltd_scheduler.state[RANDOM_LTD_SAMPLE_INDEX] + part_attention_mask = self.random_ltd_scheduler.state[RANDOM_LTD_ATTENTION_MASK] - - hidden_states, part_hidden_states = GatherTokens.apply(hidden_states, sampled_indices[self.random_ltd_layer_id,:,:], self.batch_first) + hidden_states, part_hidden_states = GatherTokens.apply(hidden_states, + sampled_indices[self.random_ltd_layer_id, :, :], + self.batch_first) if self.mask_name is not None: if self.model_type == 'encoder': - kwargs[self.mask_name] = part_attention_mask[ - self.random_ltd_layer_id] + kwargs[self.mask_name] = part_attention_mask[self.random_ltd_layer_id] else: kwargs[self.mask_name] = part_attention_mask outputs = self.random_ltd_layer(part_hidden_states, **kwargs) if isinstance(outputs, tuple): - hidden_states = ScatterTokens.apply(hidden_states, outputs[0], sampled_indices[self.random_ltd_layer_id,:,:], self.batch_first) + hidden_states = ScatterTokens.apply(hidden_states, outputs[0], + sampled_indices[self.random_ltd_layer_id, :, :], self.batch_first) my_list = list(outputs) my_list[0] = hidden_states return tuple(my_list) elif isinstance(outputs, Tensor): - hidden_states = ScatterTokens.apply(hidden_states, outputs, sampled_indices[self.random_ltd_layer_id,:,:], self.batch_first) + hidden_states = ScatterTokens.apply(hidden_states, outputs, + sampled_indices[self.random_ltd_layer_id, :, :], self.batch_first) return hidden_states else: - logger.warning( - "************For now, we only support tuple and tensor output. \ - You need to adjust the output according to the layer in your model************" - ) + logger.warning("************For now, we only support tuple and tensor output. 
\ + You need to adjust the output according to the layer in your model************") raise NotImplementedError else: return self.random_ltd_layer(hidden_states, **kwargs) diff --git a/deepspeed/runtime/data_pipeline/data_routing/scheduler.py b/deepspeed/runtime/data_pipeline/data_routing/scheduler.py index db0a7d4bc..58f1587f5 100644 --- a/deepspeed/runtime/data_pipeline/data_routing/scheduler.py +++ b/deepspeed/runtime/data_pipeline/data_routing/scheduler.py @@ -12,6 +12,7 @@ from ..constants import * class BaseScheduler(object): + def __init__(self): self.state = {} @@ -19,12 +20,9 @@ class BaseScheduler(object): s_state = self.state[RANDOM_LTD_SCHEDULE_CONFIG] if root_degree is None: root_degree = s_state['root_degree'] - next_seq = (float(global_steps) / - s_state[RANDOM_LTD_REQUIRE_STEP])**(1.0 / root_degree) - next_seq = math.floor( - next_seq * - (self.state[RANDOM_LTD_MAX_VALUE] - self.state[RANDOM_LTD_MIN_VALUE]) + - self.state[RANDOM_LTD_MIN_VALUE]) + next_seq = (float(global_steps) / s_state[RANDOM_LTD_REQUIRE_STEP])**(1.0 / root_degree) + next_seq = math.floor(next_seq * (self.state[RANDOM_LTD_MAX_VALUE] - self.state[RANDOM_LTD_MIN_VALUE]) + + self.state[RANDOM_LTD_MIN_VALUE]) next_seq -= (next_seq % s_state[RANDOM_LTD_INCREASE_STEP]) next_seq = min(next_seq, self.state[RANDOM_LTD_MAX_VALUE]) return next_seq @@ -37,6 +35,7 @@ class BaseScheduler(object): class RandomLTDScheduler(BaseScheduler): + def __init__(self, config): super().__init__() self.model_layer_num = config[RANDOM_LTD_TOTAL_LAYER_NUM] @@ -61,12 +60,9 @@ class RandomLTDScheduler(BaseScheduler): if self.config_schedule is not None: self.state[RANDOM_LTD_MIN_VALUE] = self.config_schedule[RANDOM_LTD_MIN_VALUE] self.state[RANDOM_LTD_MAX_VALUE] = self.config_schedule[RANDOM_LTD_MAX_VALUE] - self.state[RANDOM_LTD_CURRENT_VALUE] = self.config_schedule[ - RANDOM_LTD_MIN_VALUE] - self.state[RANDOM_LTD_SCHEDULE_CONFIG] = self.config_schedule[ - RANDOM_LTD_SCHEDULE_CONFIG] - self.state[RANDOM_LTD_SCHEDULER_TYPE] = self.config_schedule[ - RANDOM_LTD_SCHEDULER_TYPE] + self.state[RANDOM_LTD_CURRENT_VALUE] = self.config_schedule[RANDOM_LTD_MIN_VALUE] + self.state[RANDOM_LTD_SCHEDULE_CONFIG] = self.config_schedule[RANDOM_LTD_SCHEDULE_CONFIG] + self.state[RANDOM_LTD_SCHEDULER_TYPE] = self.config_schedule[RANDOM_LTD_SCHEDULER_TYPE] self.state[RANDOM_LTD_CONSUMED_LAYER_TOKENS] = 0 self.state[RANDOM_LTD_CURR_STEP] = -1 @@ -95,8 +91,7 @@ class RandomLTDScheduler(BaseScheduler): def state_dict(self): return { - RANDOM_LTD_CONSUMED_LAYER_TOKENS: - self.state[RANDOM_LTD_CONSUMED_LAYER_TOKENS], + RANDOM_LTD_CONSUMED_LAYER_TOKENS: self.state[RANDOM_LTD_CONSUMED_LAYER_TOKENS], RANDOM_LTD_CURR_STEP: self.state[RANDOM_LTD_CURR_STEP], RANDOM_LTD_CURRENT_VALUE: self.state[RANDOM_LTD_CURRENT_VALUE], RANDOM_LTD_MIN_VALUE: self.state[RANDOM_LTD_MIN_VALUE], @@ -104,8 +99,7 @@ class RandomLTDScheduler(BaseScheduler): } def load_state_dict(self, state_dict): - self.state[RANDOM_LTD_CONSUMED_LAYER_TOKENS] = state_dict[ - RANDOM_LTD_CONSUMED_LAYER_TOKENS] + self.state[RANDOM_LTD_CONSUMED_LAYER_TOKENS] = state_dict[RANDOM_LTD_CONSUMED_LAYER_TOKENS] self.state[RANDOM_LTD_CURR_STEP] = state_dict[RANDOM_LTD_CURR_STEP] self.state[RANDOM_LTD_CURRENT_VALUE] = state_dict[RANDOM_LTD_CURRENT_VALUE] self.state[RANDOM_LTD_MIN_VALUE] = state_dict[RANDOM_LTD_MIN_VALUE] diff --git a/deepspeed/runtime/data_pipeline/data_routing/utils.py b/deepspeed/runtime/data_pipeline/data_routing/utils.py index 8b7bd501c..52a8e222e 100644 --- 
a/deepspeed/runtime/data_pipeline/data_routing/utils.py +++ b/deepspeed/runtime/data_pipeline/data_routing/utils.py @@ -10,8 +10,7 @@ def bsh_decoder_gather(reserved_length, hidden_states, mask): rand_list = [] part_hidden_states = [] # batch, seq, hidden ## different from megatron for k in range(hidden_states.size(0)): - B_tmp = torch.randperm(hidden_states.size(1), - device=hidden_states.device)[:reserved_length] + B_tmp = torch.randperm(hidden_states.size(1), device=hidden_states.device)[:reserved_length] B = B_tmp.sort()[0] rand_list.append(B) part_hidden_states.append(hidden_states[k:k + 1, B, :]) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index efd198f1f..a431035b4 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -16,6 +16,7 @@ from .utils import split_dataset, split_index, create_mmap_dataset_builder, clos class DataAnalyzer(object): + def __init__(self, dataset, num_workers=1, @@ -53,25 +54,19 @@ class DataAnalyzer(object): self.custom_map_finalize = custom_map_finalize self.custom_reduce = custom_reduce - def init_metric_results(self, - thread_id, - metric_names, - metric_types, - metric_dtypes, - save_path, - worker_id): + def init_metric_results(self, thread_id, metric_names, metric_types, metric_dtypes, save_path, worker_id): metric_results = [] for m_idx in range(len(metric_names)): metric_name, metric_type, metric_dtype = metric_names[m_idx], \ metric_types[m_idx], metric_dtypes[m_idx] - assert metric_dtype not in [np.float64, np.double], "Currently floating point metric values are not supported. Please change your metric into integer values (and potentially multiply a larger coefficient to keep the precision)." + assert metric_dtype not in [ + np.float64, np.double + ], "Currently floating point metric values are not supported. Please change your metric into integer values (and potentially multiply a larger coefficient to keep the precision)." 
metric_save_path = f"{save_path}/{metric_name}/worker{worker_id}_thread{thread_id}/" os.makedirs(metric_save_path, exist_ok=True) if metric_type == 'single_value_per_sample': sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" - sample_to_metric_builder = create_mmap_dataset_builder( - sample_to_metric_fname, - metric_dtype) + sample_to_metric_builder = create_mmap_dataset_builder(sample_to_metric_fname, metric_dtype) metric_to_sample_fname = f"{metric_save_path}/{metric_name}_metric_to_sample" os.system(f"rm -rf {metric_to_sample_fname}*") metric_to_sample_dict = defaultdict(list) @@ -84,34 +79,25 @@ class DataAnalyzer(object): elif metric_type == 'accumulate_value_over_samples': metric_value = None metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" - metric_results.append({ - "metric_value": metric_value, - "metric_value_fname": metric_value_fname - }) + metric_results.append({"metric_value": metric_value, "metric_value_fname": metric_value_fname}) return metric_results - def update_metric_results(self, - data, - metric_types, - metric_functions, - metric_results): + def update_metric_results(self, data, metric_types, metric_functions, metric_results): for m_idx in range(len(metric_types)): metric_type, metric_function, metric_result = metric_types[m_idx], \ metric_functions[m_idx], metric_results[m_idx] if metric_type == 'single_value_per_sample': metric_values = metric_function(data) for row in range(metric_values.size()[0]): - metric_result["sample_to_metric_builder"].add_item( - metric_values[row].reshape(-1)) - metric_result["metric_to_sample_dict"][ - metric_values[row].item()].append(data['index'][row][0].item()) + metric_result["sample_to_metric_builder"].add_item(metric_values[row].reshape(-1)) + metric_result["metric_to_sample_dict"][metric_values[row].item()].append( + data['index'][row][0].item()) for m_value in metric_result["metric_to_sample_dict"]: if len(metric_result["metric_to_sample_dict"][m_value]) > 100: metric_fname = metric_result["metric_to_sample_fname"] with open(f"{metric_fname}_{m_value}.csv", 'a') as f: writer = csv.writer(f) - writer.writerows( - [metric_result["metric_to_sample_dict"][m_value]]) + writer.writerows([metric_result["metric_to_sample_dict"][m_value]]) metric_result["metric_to_sample_dict"][m_value] = [] elif metric_type == 'accumulate_value_over_samples': metric_values = metric_function(data) @@ -126,25 +112,20 @@ class DataAnalyzer(object): metric_dtypes[m_idx], metric_results[m_idx] if metric_type == 'single_value_per_sample': metric_fname = metric_result["sample_to_metric_fname"] - close_mmap_dataset_builder(metric_result["sample_to_metric_builder"], - metric_fname) + close_mmap_dataset_builder(metric_result["sample_to_metric_builder"], metric_fname) for m_value in metric_result["metric_to_sample_dict"]: if len(metric_result["metric_to_sample_dict"][m_value]) > 0: metric_fname = metric_result["metric_to_sample_fname"] with open(f"{metric_fname}_{m_value}.csv", 'a') as f: writer = csv.writer(f) - writer.writerows( - [metric_result["metric_to_sample_dict"][m_value]]) + writer.writerows([metric_result["metric_to_sample_dict"][m_value]]) metric_result["metric_to_sample_dict"][m_value] = [] elif metric_type == 'accumulate_value_over_samples': if metric_result["metric_value"] is not None: - metric_value_builder = create_mmap_dataset_builder( - metric_result["metric_value_fname"], - metric_dtype) - metric_value_builder.add_item( - metric_result["metric_value"].reshape(-1)) - 
close_mmap_dataset_builder(metric_value_builder, - metric_result["metric_value_fname"]) + metric_value_builder = create_mmap_dataset_builder(metric_result["metric_value_fname"], + metric_dtype) + metric_value_builder.add_item(metric_result["metric_value"].reshape(-1)) + close_mmap_dataset_builder(metric_value_builder, metric_result["metric_value_fname"]) def run_map_helper(self, thread_id): start_idx, end_idx = self.thread_splits[thread_id][0], \ @@ -152,15 +133,9 @@ class DataAnalyzer(object): logger.info(f"worker {self.worker_id} thread {thread_id}: start working " \ f"on data subset {start_idx} to {end_idx}") thread_dataset = Subset(self.dataset, list(range(start_idx, end_idx))) - sampler = BatchSampler(SequentialSampler(thread_dataset), - batch_size=self.batch_size, - drop_last=False) + sampler = BatchSampler(SequentialSampler(thread_dataset), batch_size=self.batch_size, drop_last=False) if self.collate_fn is None: - iterator = iter( - DataLoader(thread_dataset, - batch_sampler=sampler, - num_workers=0, - pin_memory=False)) + iterator = iter(DataLoader(thread_dataset, batch_sampler=sampler, num_workers=0, pin_memory=False)) else: iterator = iter( DataLoader(thread_dataset, @@ -169,19 +144,11 @@ class DataAnalyzer(object): collate_fn=self.collate_fn, pin_memory=False)) if self.custom_map_init is None: - metric_results = self.init_metric_results(thread_id, - self.metric_names, - self.metric_types, - self.metric_dtypes, - self.save_path, - self.worker_id) + metric_results = self.init_metric_results(thread_id, self.metric_names, self.metric_types, + self.metric_dtypes, self.save_path, self.worker_id) else: - metric_results = self.custom_map_init(thread_id, - self.metric_names, - self.metric_types, - self.metric_dtypes, - self.save_path, - self.worker_id) + metric_results = self.custom_map_init(thread_id, self.metric_names, self.metric_types, self.metric_dtypes, + self.save_path, self.worker_id) total_sample = len(thread_dataset) processed_sample = 0 start = time.time() @@ -189,15 +156,9 @@ class DataAnalyzer(object): try: data = next(iterator) if self.custom_map_update is None: - self.update_metric_results(data, - self.metric_types, - self.metric_functions, - metric_results) + self.update_metric_results(data, self.metric_types, self.metric_functions, metric_results) else: - self.custom_map_update(data, - self.metric_types, - self.metric_functions, - metric_results) + self.custom_map_update(data, self.metric_types, self.metric_functions, metric_results) processed_sample += self.batch_size duration = (time.time() - start) / 3600.0 remain_duration = duration * total_sample / processed_sample - duration @@ -206,22 +167,17 @@ class DataAnalyzer(object): f"out of {total_sample} processed in {duration:.2f} hr, " \ f"estimated to finish in {remain_duration:.2f} hr") except StopIteration: - logger.info( - f"worker {self.worker_id} thread {thread_id}: reach end of file") + logger.info(f"worker {self.worker_id} thread {thread_id}: reach end of file") break if self.custom_map_finalize is None: - self.finalize_metric_results(self.metric_types, - self.metric_dtypes, - metric_results) + self.finalize_metric_results(self.metric_types, self.metric_dtypes, metric_results) else: - self.custom_map_finalize(self.metric_types, - self.metric_dtypes, - metric_results) + self.custom_map_finalize(self.metric_types, self.metric_dtypes, metric_results) logger.info(f"worker {self.worker_id} thread {thread_id}: finished") def run_map(self): - self.worker_splits, self.thread_splits = split_dataset(self.dataset, - 
self.num_workers, self.worker_id, self.num_threads) + self.worker_splits, self.thread_splits = split_dataset(self.dataset, self.num_workers, self.worker_id, + self.num_threads) if len(self.specific_threads) > 0: threads_to_run = self.specific_threads else: @@ -238,81 +194,50 @@ class DataAnalyzer(object): assert self.num_threads == 1 self.run_map_helper(0) - def get_metric_value_percentiles(self, - metric_name, - num_sample_per_value, - total_num_samples): + def get_metric_value_percentiles(self, metric_name, num_sample_per_value, total_num_samples): logger.info(f"Checking the value percentiles of metric {metric_name}...") processed_samples = 0 current_percentile = 5 for key in sorted(num_sample_per_value.keys()): processed_samples += num_sample_per_value[key] if processed_samples >= total_num_samples * current_percentile / 100.0: - logger.info( - f"Metric {metric_name} {current_percentile}th percentile: {key}") + logger.info(f"Metric {metric_name} {current_percentile}th percentile: {key}") current_percentile += 5 - def merge_gather_map_stats(self, - num_workers, - num_threads, - num_threads_reduce, - t_idx_reduce, - metric_save_path, - metric_name, - return_dict): + def merge_gather_map_stats(self, num_workers, num_threads, num_threads_reduce, t_idx_reduce, metric_save_path, + metric_name, return_dict): results = [] for w_idx in range(num_workers): for t_idx in range(num_threads): if (w_idx * num_threads + t_idx) % num_threads_reduce == t_idx_reduce: w_metric_save_path = f"{metric_save_path}/worker{w_idx}_thread{t_idx}/" w_sample_to_metric_fname = f"{w_metric_save_path}/{metric_name}_sample_to_metric" - w_sample_to_metric = MMapIndexedDataset(w_sample_to_metric_fname, - skip_warmup=True) + w_sample_to_metric = MMapIndexedDataset(w_sample_to_metric_fname, skip_warmup=True) unique_v = list(np.unique(w_sample_to_metric)) sample_to_metric_count = len(w_sample_to_metric) - logger.info( - f"Finished gathering map stats from worker {w_idx} thread {t_idx}." 
- ) + logger.info(f"Finished gathering map stats from worker {w_idx} thread {t_idx}.") results.append([unique_v, sample_to_metric_count]) return_dict[t_idx_reduce] = results - def merge_sample_to_metric(self, - t_idx_reduce, - metric_save_path, - metric_name, - metric_value_dtype, + def merge_sample_to_metric(self, t_idx_reduce, metric_save_path, metric_name, metric_value_dtype, map_worker_thread): sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric_thread{t_idx_reduce}" - sample_to_metric_builder = create_mmap_dataset_builder( - sample_to_metric_fname, - metric_value_dtype) + sample_to_metric_builder = create_mmap_dataset_builder(sample_to_metric_fname, metric_value_dtype) for w_t in map_worker_thread: w_metric_save_path = f"{metric_save_path}/worker{w_t[0]}_thread{w_t[1]}/" w_sample_to_metric_fname = f"{w_metric_save_path}/{metric_name}_sample_to_metric" w_data = MMapIndexedDataset(w_sample_to_metric_fname, skip_warmup=True) for row in range(len(w_data)): - sample_to_metric_builder.add_item( - torch.tensor(w_data[row].astype(np.int64), - dtype=torch.long)) - logger.info( - f"Finished merge_sample_to_metric from worker {w_t[0]} thread {w_t[1]}.") + sample_to_metric_builder.add_item(torch.tensor(w_data[row].astype(np.int64), dtype=torch.long)) + logger.info(f"Finished merge_sample_to_metric from worker {w_t[0]} thread {w_t[1]}.") close_mmap_dataset_builder(sample_to_metric_builder, sample_to_metric_fname) - def merge_metric_to_sample(self, - t_idx_reduce, - metric_save_path, - metric_name, - sample_idx_dtype, - metric_value_dtype, - unique_metric_values, - num_workers, - num_threads): + def merge_metric_to_sample(self, t_idx_reduce, metric_save_path, metric_name, sample_idx_dtype, metric_value_dtype, + unique_metric_values, num_workers, num_threads): index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample_thread{t_idx_reduce}" - index_to_sample_builder = create_mmap_dataset_builder(index_to_sample_fname, - sample_idx_dtype) + index_to_sample_builder = create_mmap_dataset_builder(index_to_sample_fname, sample_idx_dtype) index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric_thread{t_idx_reduce}" - index_to_metric_builder = create_mmap_dataset_builder(index_to_metric_fname, - metric_value_dtype) + index_to_metric_builder = create_mmap_dataset_builder(index_to_metric_fname, metric_value_dtype) for unique_v in unique_metric_values: samples = [] for w_idx in range(num_workers): @@ -330,13 +255,7 @@ class DataAnalyzer(object): close_mmap_dataset_builder(index_to_sample_builder, index_to_sample_fname) close_mmap_dataset_builder(index_to_metric_builder, index_to_metric_fname) - def merge_map_results(self, - dataset, - metric_names, - metric_types, - save_path, - num_workers, - num_threads, + def merge_map_results(self, dataset, metric_names, metric_types, save_path, num_workers, num_threads, num_threads_reduce): total_num_samples = len(dataset) sample_idx_dtype = find_fit_int_dtype(0, total_num_samples - 1) @@ -385,9 +304,7 @@ class DataAnalyzer(object): for w_idx in range(num_workers): for t_idx in range(num_threads): map_worker_thread.append([w_idx, t_idx]) - thread_splits = split_index(0, - len(map_worker_thread), - num_threads_reduce) + thread_splits = split_index(0, len(map_worker_thread), num_threads_reduce) p = [] for t_idx_reduce in range(num_threads_reduce): start_idx, end_idx = thread_splits[t_idx_reduce][0], thread_splits[t_idx_reduce][1] @@ -405,24 +322,18 @@ class DataAnalyzer(object): p[t_idx_reduce].join() 
sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" - sample_to_metric_builder = create_mmap_dataset_builder( - sample_to_metric_fname, - metric_value_dtype) + sample_to_metric_builder = create_mmap_dataset_builder(sample_to_metric_fname, metric_value_dtype) for t_idx_reduce in range(num_threads_reduce): chunk_fname = f"{metric_save_path}/{metric_name}_sample_to_metric_thread{t_idx_reduce}" logger.info(f"Merging file {chunk_fname}") sample_to_metric_builder.merge_file_(chunk_fname) - close_mmap_dataset_builder(sample_to_metric_builder, - sample_to_metric_fname) - sample_to_metric = MMapIndexedDataset(sample_to_metric_fname, - skip_warmup=True) + close_mmap_dataset_builder(sample_to_metric_builder, sample_to_metric_fname) + sample_to_metric = MMapIndexedDataset(sample_to_metric_fname, skip_warmup=True) assert len(sample_to_metric) == total_num_samples # metric_to_sample unique_metric_values = list(sorted(unique_metric_values)) - thread_splits = split_index(0, - len(unique_metric_values), - num_threads_reduce) + thread_splits = split_index(0, len(unique_metric_values), num_threads_reduce) p = [] for t_idx_reduce in range(num_threads_reduce): start_idx, end_idx = thread_splits[t_idx_reduce][0], thread_splits[t_idx_reduce][1] @@ -442,13 +353,9 @@ class DataAnalyzer(object): for t_idx_reduce in range(num_threads_reduce): p[t_idx_reduce].join() index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" - index_to_sample_builder = create_mmap_dataset_builder( - index_to_sample_fname, - sample_idx_dtype) + index_to_sample_builder = create_mmap_dataset_builder(index_to_sample_fname, sample_idx_dtype) index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" - index_to_metric_builder = create_mmap_dataset_builder( - index_to_metric_fname, - metric_value_dtype) + index_to_metric_builder = create_mmap_dataset_builder(index_to_metric_fname, metric_value_dtype) for t_idx_reduce in range(num_threads_reduce): chunk_is_fname = f"{metric_save_path}/{metric_name}_index_to_sample_thread{t_idx_reduce}" logger.info(f"Merging file {chunk_is_fname}") @@ -456,43 +363,29 @@ class DataAnalyzer(object): chunk_im_fname = f"{metric_save_path}/{metric_name}_index_to_metric_thread{t_idx_reduce}" logger.info(f"Merging file {chunk_im_fname}") index_to_metric_builder.merge_file_(chunk_im_fname) - close_mmap_dataset_builder(index_to_sample_builder, - index_to_sample_fname) - close_mmap_dataset_builder(index_to_metric_builder, - index_to_metric_fname) + close_mmap_dataset_builder(index_to_sample_builder, index_to_sample_fname) + close_mmap_dataset_builder(index_to_metric_builder, index_to_metric_fname) num_sample_per_value = {} - index_to_sample = MMapIndexedDataset(index_to_sample_fname, - skip_warmup=True) - index_to_metric = MMapIndexedDataset(index_to_metric_fname, - skip_warmup=True) + index_to_sample = MMapIndexedDataset(index_to_sample_fname, skip_warmup=True) + index_to_metric = MMapIndexedDataset(index_to_metric_fname, skip_warmup=True) index_to_sample_merged_fname = f"{metric_save_path}/{metric_name}_index_to_sample_percentile_merged" - index_to_sample_merged_builder = create_mmap_dataset_builder( - index_to_sample_merged_fname, - sample_idx_dtype) + index_to_sample_merged_builder = create_mmap_dataset_builder(index_to_sample_merged_fname, + sample_idx_dtype) for v_idx in range(len(index_to_sample)): if v_idx > 0: assert index_to_metric[v_idx] > index_to_metric[v_idx - 1] - num_sample_per_value[index_to_metric[v_idx][0]] = len( - index_to_sample[v_idx]) + 
num_sample_per_value[index_to_metric[v_idx][0]] = len(index_to_sample[v_idx]) assert sum(num_sample_per_value.values()) == total_num_samples merge_step = len(index_to_sample) // 100 for v_idx in range(0, len(index_to_sample), merge_step): merged_samples = np.copy( - np.concatenate( - index_to_sample[v_idx:min(len(index_to_sample), - (v_idx + merge_step))], - axis=None)) + np.concatenate(index_to_sample[v_idx:min(len(index_to_sample), (v_idx + merge_step))], + axis=None)) index_to_sample_merged_builder.add_item( - torch.tensor(merged_samples.astype(np.int64), - dtype=torch.long)) - logger.info( - f"Finished merging index_to_sample {v_idx} to {v_idx+merge_step}." - ) - close_mmap_dataset_builder(index_to_sample_merged_builder, - index_to_sample_merged_fname) - self.get_metric_value_percentiles(metric_name, - num_sample_per_value, - total_num_samples) + torch.tensor(merged_samples.astype(np.int64), dtype=torch.long)) + logger.info(f"Finished merging index_to_sample {v_idx} to {v_idx+merge_step}.") + close_mmap_dataset_builder(index_to_sample_merged_builder, index_to_sample_merged_fname) + self.get_metric_value_percentiles(metric_name, num_sample_per_value, total_num_samples) elif metric_type == 'accumulate_value_over_samples': metric_save_path = f"{save_path}/{metric_name}/" metric_value = None @@ -500,8 +393,7 @@ class DataAnalyzer(object): for t_idx in range(num_threads): w_metric_save_path = f"{metric_save_path}/worker{w_idx}_thread{t_idx}/" w_metric_value_fname = f"{w_metric_save_path}/{metric_name}_metric_value" - w_metric_value = MMapIndexedDataset(w_metric_value_fname, - skip_warmup=True) + w_metric_value = MMapIndexedDataset(w_metric_value_fname, skip_warmup=True) if metric_value is None: metric_value = np.copy(w_metric_value[0]) else: @@ -510,28 +402,14 @@ class DataAnalyzer(object): value_min = int(min(metric_value)) metric_value_dtype = find_fit_int_dtype(value_min, value_max) metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" - metric_value_builder = create_mmap_dataset_builder( - metric_value_fname, - metric_value_dtype) - metric_value_builder.add_item( - torch.tensor(metric_value.astype(np.int64), - dtype=torch.long)) + metric_value_builder = create_mmap_dataset_builder(metric_value_fname, metric_value_dtype) + metric_value_builder.add_item(torch.tensor(metric_value.astype(np.int64), dtype=torch.long)) close_mmap_dataset_builder(metric_value_builder, metric_value_fname) def run_reduce(self): if self.custom_reduce is None: - self.merge_map_results(self.dataset, - self.metric_names, - self.metric_types, - self.save_path, - self.num_workers, - self.num_threads, - self.num_threads_reduce) + self.merge_map_results(self.dataset, self.metric_names, self.metric_types, self.save_path, + self.num_workers, self.num_threads, self.num_threads_reduce) else: - self.custom_reduce(self.dataset, - self.metric_names, - self.metric_types, - self.save_path, - self.num_workers, - self.num_threads, - self.num_threads_reduce) + self.custom_reduce(self.dataset, self.metric_names, self.metric_types, self.save_path, self.num_workers, + self.num_threads, self.num_threads_reduce) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_sampler.py b/deepspeed/runtime/data_pipeline/data_sampling/data_sampler.py index 24dbcfa93..42b02e582 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_sampler.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_sampler.py @@ -31,6 +31,7 @@ from .utils import create_mmap_dataset_builder, close_mmap_dataset_builder, find class 
DeepSpeedDataSampler(object): + def __init__(self, data_efficiency_config, one_epoch_total_samples, @@ -45,8 +46,8 @@ class DeepSpeedDataSampler(object): self.data_efficiency_config = data_efficiency_config self.one_epoch_total_samples = one_epoch_total_samples self.index_dtype = find_fit_int_dtype(0, one_epoch_total_samples) - self.total_samples = one_epoch_total_samples * self.data_efficiency_config[ - DATA_SAMPLING][DATA_SAMPLING_NUM_EPOCHS] + self.total_samples = one_epoch_total_samples * self.data_efficiency_config[DATA_SAMPLING][ + DATA_SAMPLING_NUM_EPOCHS] self.micro_batch_size = micro_batch_size self.data_parallel_rank = data_parallel_rank self.data_parallel_group = data_parallel_group @@ -57,13 +58,11 @@ class DeepSpeedDataSampler(object): self.gradient_accumulation_steps self.global_rank = global_rank self.drop_last = drop_last - self.np_rng = np.random.default_rng( - self.data_efficiency_config[DATA_EFFICIENCY_SEED]) + self.np_rng = np.random.default_rng(self.data_efficiency_config[DATA_EFFICIENCY_SEED]) self.state = {} self.batch = [] self.consumed_samples = 0 - if self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ - CURRICULUM_LEARNING_ENABLED]: + if self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][CURRICULUM_LEARNING_ENABLED]: self.curriculum_step = 0 self.current_difficulties = {} self.data_cluster_paths = [] @@ -77,33 +76,26 @@ class DeepSpeedDataSampler(object): if self.global_rank == 0: self.data_clusters = [] self.data_cluster_sizes = [] - cluster_path = self.data_efficiency_config[DATA_SAMPLING][ - CURRICULUM_LEARNING][CURRICULUM_LEARNING_CLUSTER_PATH] + cluster_path = self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ + CURRICULUM_LEARNING_CLUSTER_PATH] if not os.path.exists(cluster_path): os.makedirs(cluster_path) - for metric in self.data_efficiency_config[DATA_SAMPLING][ - CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS]: + for metric in self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS]: self.curriculum_schedulers[metric] = CurriculumScheduler( - data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING] - [CURRICULUM_LEARNING_METRICS][metric]) - self.difficulty_type[metric] = data_efficiency_config[DATA_SAMPLING][ - CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS][metric][ - CURRICULUM_LEARNING_DIFFICULTY_TYPE] - self.clustering_type[metric] = data_efficiency_config[DATA_SAMPLING][ - CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS][metric][ - CURRICULUM_LEARNING_CLUSTERING_TYPE] + data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS][metric]) + self.difficulty_type[metric] = data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ + CURRICULUM_LEARNING_METRICS][metric][CURRICULUM_LEARNING_DIFFICULTY_TYPE] + self.clustering_type[metric] = data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ + CURRICULUM_LEARNING_METRICS][metric][CURRICULUM_LEARNING_CLUSTERING_TYPE] if self.global_rank == 0: if self.clustering_type[metric] != CURRICULUM_LEARNING_SINGLE_CLUSTER: self.curriculum_index_to_sample[metric] = MMapIndexedDataset( - data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING] - [CURRICULUM_LEARNING_METRICS][metric] - [CURRICULUM_LEARNING_SAMPLE_PATH], + data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS] + [metric][CURRICULUM_LEARNING_SAMPLE_PATH], skip_warmup=True) - if self.difficulty_type[ - metric] == CURRICULUM_LEARNING_VALUE_BASED: + if self.difficulty_type[metric] == 
CURRICULUM_LEARNING_VALUE_BASED: self.curriculum_index_to_metric[metric] = MMapIndexedDataset( - data_efficiency_config[DATA_SAMPLING] - [CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS] + data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS] [metric][CURRICULUM_LEARNING_METRIC_PATH], skip_warmup=True) @@ -122,8 +114,7 @@ class DeepSpeedDataSampler(object): def set_custom_curriculum_learning_schedule(self, schedule_func_dict): for metric in self.curriculum_schedulers: if metric in schedule_func_dict: - self.curriculum_schedulers[metric].set_custom_get_difficulty( - schedule_func_dict[metric]) + self.curriculum_schedulers[metric].set_custom_get_difficulty(schedule_func_dict[metric]) def get_start_end_idx(self): start_idx = self.data_parallel_rank * self.micro_batch_size @@ -133,26 +124,19 @@ class DeepSpeedDataSampler(object): def get_sample_based_on_metric_value(self, metric, value_start, value_end): new_samples = None for row in range(len(self.curriculum_index_to_sample[metric])): - if self.curriculum_index_to_metric[metric][ - row] <= value_end and self.curriculum_index_to_metric[metric][ - row] > value_start: + if self.curriculum_index_to_metric[metric][row] <= value_end and self.curriculum_index_to_metric[metric][ + row] > value_start: row_samples = np.copy(self.curriculum_index_to_sample[metric][row]) new_samples = row_samples if new_samples is None else np.concatenate( - (new_samples, - row_samples), - axis=None) + (new_samples, row_samples), axis=None) return new_samples - def get_sample_based_on_metric_percentile(self, - metric, - percentile_start, - percentile_end): + def get_sample_based_on_metric_percentile(self, metric, percentile_start, percentile_end): new_samples = None if self.data_1epoch_size is None: - self.data_1epoch_size = sum( - len(x) for x in self.curriculum_index_to_sample[metric]) - max_percentile = self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ - CURRICULUM_LEARNING_METRICS][metric][CURRICULUM_LEARNING_MAX_DIFFICULTY] + self.data_1epoch_size = sum(len(x) for x in self.curriculum_index_to_sample[metric]) + max_percentile = self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS][ + metric][CURRICULUM_LEARNING_MAX_DIFFICULTY] sample_per_percentile = self.data_1epoch_size // max_percentile start_count = sample_per_percentile * percentile_start end_count = sample_per_percentile * percentile_end @@ -167,12 +151,9 @@ class DeepSpeedDataSampler(object): row_end = row_size else: row_end = end_count - current_count - row_samples = np.copy( - self.curriculum_index_to_sample[metric][row][row_start:row_end]) + row_samples = np.copy(self.curriculum_index_to_sample[metric][row][row_start:row_end]) new_samples = row_samples if new_samples is None else np.concatenate( - (new_samples, - row_samples), - axis=None) + (new_samples, row_samples), axis=None) current_count += row_size if current_count >= end_count: break @@ -193,63 +174,42 @@ class DeepSpeedDataSampler(object): need_clustering += 1 if need_clustering > 1: for metric in self.curriculum_schedulers: - if self.clustering_type[ - metric] == CURRICULUM_LEARNING_SINGLE_CLUSTER: + if self.clustering_type[metric] == CURRICULUM_LEARNING_SINGLE_CLUSTER: metric_cluster = np.arange(start=0, stop=self.one_epoch_total_samples, step=1, dtype=self.index_dtype) else: - if self.difficulty_type[ - metric] == CURRICULUM_LEARNING_VALUE_BASED: - metric_cluster = self.get_sample_based_on_metric_value( - metric, - float('-inf'), - 
self.current_difficulties[metric]) - elif self.difficulty_type[ - metric] == CURRICULUM_LEARNING_PERCENTILE_BASED: + if self.difficulty_type[metric] == CURRICULUM_LEARNING_VALUE_BASED: + metric_cluster = self.get_sample_based_on_metric_value(metric, float('-inf'), + self.current_difficulties[metric]) + elif self.difficulty_type[metric] == CURRICULUM_LEARNING_PERCENTILE_BASED: metric_cluster = self.get_sample_based_on_metric_percentile( - metric, - 0, - self.current_difficulties[metric]) + metric, 0, self.current_difficulties[metric]) new_cluster = metric_cluster if new_cluster is None else \ np.intersect1d(new_cluster, metric_cluster, assume_unique=True) for cluster in self.data_clusters: - new_cluster = np.setdiff1d(new_cluster, - cluster[0], - assume_unique=True) + new_cluster = np.setdiff1d(new_cluster, cluster[0], assume_unique=True) else: if len(self.data_clusters) == 0: - new_cluster = np.arange(start=0, - stop=self.one_epoch_total_samples, - step=1, - dtype=self.index_dtype) + new_cluster = np.arange(start=0, stop=self.one_epoch_total_samples, step=1, dtype=self.index_dtype) for metric in self.curriculum_schedulers: if self.clustering_type[metric] != CURRICULUM_LEARNING_SINGLE_CLUSTER: - if self.difficulty_type[ - metric] == CURRICULUM_LEARNING_VALUE_BASED: - new_cluster = self.get_sample_based_on_metric_value( - metric, - previous_difficulties[metric], - self.current_difficulties[metric]) - elif self.difficulty_type[ - metric] == CURRICULUM_LEARNING_PERCENTILE_BASED: + if self.difficulty_type[metric] == CURRICULUM_LEARNING_VALUE_BASED: + new_cluster = self.get_sample_based_on_metric_value(metric, previous_difficulties[metric], + self.current_difficulties[metric]) + elif self.difficulty_type[metric] == CURRICULUM_LEARNING_PERCENTILE_BASED: new_cluster = self.get_sample_based_on_metric_percentile( - metric, - previous_difficulties[metric], - self.current_difficulties[metric]) + metric, previous_difficulties[metric], self.current_difficulties[metric]) if new_cluster is not None and len(new_cluster) > 0: logger.info( f"new data cluster (previous_difficulties {previous_difficulties}, current_difficulties {self.current_difficulties}) with size {len(new_cluster)} generated." 
) self.np_rng.shuffle(new_cluster) - cluster_builder = create_mmap_dataset_builder(cluster_path, - self.index_dtype) + cluster_builder = create_mmap_dataset_builder(cluster_path, self.index_dtype) cluster_builder.add_item_numpy(new_cluster) close_mmap_dataset_builder(cluster_builder, cluster_path) - self.data_clusters.append( - MMapIndexedDataset(cluster_path, - skip_warmup=True)) + self.data_clusters.append(MMapIndexedDataset(cluster_path, skip_warmup=True)) self.data_cluster_sizes.append(len(self.data_clusters[-1][0])) else: logger.info( @@ -264,10 +224,7 @@ class DeepSpeedDataSampler(object): num_clusters = len(self.data_clusters) weight_sum = sum(self.data_cluster_sizes) weights = [x / weight_sum for x in self.data_cluster_sizes] - samples = self.np_rng.choice(num_clusters, - self.global_batch_size, - replace=True, - p=weights) + samples = self.np_rng.choice(num_clusters, self.global_batch_size, replace=True, p=weights) samples = np.bincount(samples, minlength=num_clusters) return samples @@ -285,8 +242,7 @@ class DeepSpeedDataSampler(object): def get_sample_from_cluster(self, cidx, num_samples): start_idx = self.data_cluster_current_position[cidx] - samples = list( - np.copy(self.data_clusters[cidx][0][start_idx:(start_idx + num_samples)])) + samples = list(np.copy(self.data_clusters[cidx][0][start_idx:(start_idx + num_samples)])) self.data_cluster_current_position[cidx] += num_samples if len(samples) < num_samples: num_samples_remained = num_samples - len(samples) @@ -297,14 +253,12 @@ class DeepSpeedDataSampler(object): return samples def get_next_global_batch(self): - if self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ - CURRICULUM_LEARNING_ENABLED]: + if self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][CURRICULUM_LEARNING_ENABLED]: self.curriculum_step += 1 new_cluster = False previous_difficulties = {} for metric in self.curriculum_schedulers: - next_difficulty = self.curriculum_schedulers[metric].update_difficulty( - self.curriculum_step) + next_difficulty = self.curriculum_schedulers[metric].update_difficulty(self.curriculum_step) if metric not in self.current_difficulties or \ next_difficulty != self.current_difficulties[metric]: new_cluster = True @@ -313,8 +267,7 @@ class DeepSpeedDataSampler(object): else: if self.difficulty_type[metric] == CURRICULUM_LEARNING_VALUE_BASED: previous_difficulties[metric] = float('-inf') - elif self.difficulty_type[ - metric] == CURRICULUM_LEARNING_PERCENTILE_BASED: + elif self.difficulty_type[metric] == CURRICULUM_LEARNING_PERCENTILE_BASED: previous_difficulties[metric] = 0 self.current_difficulties[metric] = next_difficulty if new_cluster: @@ -323,12 +276,9 @@ class DeepSpeedDataSampler(object): samples_per_cluster = self.sample_from_clusters() batch = [] for cidx in range(len(samples_per_cluster)): - batch += self.get_sample_from_cluster(cidx, - samples_per_cluster[cidx]) + batch += self.get_sample_from_cluster(cidx, samples_per_cluster[cidx]) self.np_rng.shuffle(batch) - batch = torch.tensor(batch, - device=get_accelerator().current_device_name(), - dtype=torch.long).view(-1) + batch = torch.tensor(batch, device=get_accelerator().current_device_name(), dtype=torch.long).view(-1) else: batch = torch.empty(self.global_batch_size, device=get_accelerator().current_device_name(), @@ -356,8 +306,7 @@ class DeepSpeedDataSampler(object): CURRICULUM_LEARNING_STEP: self.curriculum_step, CURRICULUM_LEARNING_CURRENT_DIFFICULTIES: self.current_difficulties, CURRICULUM_LEARNING_DATA_CLUSTER_PATHS: self.data_cluster_paths, 
- CURRICULUM_LEARNING_DATA_CLUSTER_CURRENT_POSITION: - self.data_cluster_current_position, + CURRICULUM_LEARNING_DATA_CLUSTER_CURRENT_POSITION: self.data_cluster_current_position, CURRICULUM_LEARNING_NP_RNG_STATE: np.random.get_state() } @@ -367,11 +316,10 @@ class DeepSpeedDataSampler(object): self.curriculum_step = state_dict[CURRICULUM_LEARNING_STEP] self.current_difficulties = state_dict[CURRICULUM_LEARNING_CURRENT_DIFFICULTIES] self.data_cluster_paths = state_dict[CURRICULUM_LEARNING_DATA_CLUSTER_PATHS] - self.data_cluster_current_position = state_dict[ - CURRICULUM_LEARNING_DATA_CLUSTER_CURRENT_POSITION] + self.data_cluster_current_position = state_dict[CURRICULUM_LEARNING_DATA_CLUSTER_CURRENT_POSITION] np.random.set_state(state_dict[CURRICULUM_LEARNING_NP_RNG_STATE]) - cluster_root_path = self.data_efficiency_config[DATA_SAMPLING][ - CURRICULUM_LEARNING][CURRICULUM_LEARNING_CLUSTER_PATH] + cluster_root_path = self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ + CURRICULUM_LEARNING_CLUSTER_PATH] # Backward compatibility: previously data_cluster_paths were stored as # absolute paths. Now we changed it to just the file name so that even # if user moved the cluster files, the checkpoint loading still works @@ -379,12 +327,9 @@ class DeepSpeedDataSampler(object): # in deepspeed json config. for idx in range(len(self.data_cluster_paths)): if '/' in self.data_cluster_paths[idx]: - self.data_cluster_paths[idx] = self.data_cluster_paths[idx].split( - '/')[-1] + self.data_cluster_paths[idx] = self.data_cluster_paths[idx].split('/')[-1] if self.global_rank == 0: for cluster_fname in self.data_cluster_paths: cluster_path = f"{cluster_root_path}/{cluster_fname}" - self.data_clusters.append( - MMapIndexedDataset(cluster_path, - skip_warmup=True)) + self.data_clusters.append(MMapIndexedDataset(cluster_path, skip_warmup=True)) self.data_cluster_sizes.append(len(self.data_clusters[-1][0])) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py index b2a31663e..af7fd418b 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py @@ -50,16 +50,13 @@ def infer_dataset_impl(path): return None else: print(f"Dataset does not exist: {path}") - print( - "Path should be a basename that both .idx and .bin can be appended to get full filenames." - ) + print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") return None def make_builder(out_file, impl, vocab_size=None): if impl == 'mmap': - return MMapIndexedDatasetBuilder(out_file, - dtype=__best_fitting_dtype(vocab_size)) + return MMapIndexedDatasetBuilder(out_file, dtype=__best_fitting_dtype(vocab_size)) else: return IndexedDatasetBuilder(out_file) @@ -67,9 +64,7 @@ def make_builder(out_file, impl, vocab_size=None): def make_dataset(path, impl, skip_warmup=False): if not IndexedDataset.exists(path): print(f"Dataset does not exist: {path}") - print( - "Path should be a basename that both .idx and .bin can be appended to get full filenames." 
- ) + print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") return None if impl == 'infer': impl = infer_dataset_impl(path) @@ -150,10 +145,8 @@ class IndexedDataset(torch.utils.data.Dataset): def read_index(self, path): with open(index_file_path(path), 'rb') as f: magic = f.read(8) - assert magic == self._HDR_MAGIC, ( - 'Index file doesn\'t match expected format. ' - 'Make sure that --dataset-impl is configured properly.' - ) + assert magic == self._HDR_MAGIC, ('Index file doesn\'t match expected format. ' + 'Make sure that --dataset-impl is configured properly.') version = f.read(8) assert struct.unpack('= 0: if data_sampler is None: - data_sampler = DistributedSampler( - dataset=dataset, - num_replicas=data_parallel_world_size, - rank=data_parallel_rank) + data_sampler = DistributedSampler(dataset=dataset, + num_replicas=data_parallel_world_size, + rank=data_parallel_rank) device_count = 1 else: if data_sampler is None: diff --git a/deepspeed/runtime/eigenvalue.py b/deepspeed/runtime/eigenvalue.py index 618ac00ca..5c78e2e54 100755 --- a/deepspeed/runtime/eigenvalue.py +++ b/deepspeed/runtime/eigenvalue.py @@ -7,6 +7,7 @@ import logging class Eigenvalue(object): + def __init__(self, verbose=False, max_iter=100, @@ -77,8 +78,7 @@ class Eigenvalue(object): ] else: v = [ - torch.randn(p.size(), - device=device) for p in model_block.parameters() + torch.randn(p.size(), device=device) for p in model_block.parameters() if p.grad is not None and p.grad.grad_fn is not None ] torch.random.set_rng_state(rng_state) @@ -100,24 +100,18 @@ class Eigenvalue(object): # Disable eigenvalue if the model doesn't support second order gradients computation, # e.g. when enabling DS transformer kernel. if len(grads) == 0 or len(params) == 0: - log_dist(f'The model does NOT support eigenvalue computation.', - ranks=[0], - level=logging.WARNING) + log_dist(f'The model does NOT support eigenvalue computation.', ranks=[0], level=logging.WARNING) return [] i = 0 eigenvalue_current, eigenvalue_previous = 1., 0. 
while (i < self.max_iter) and abs(eigenvalue_current) > 0 and (abs( - (eigenvalue_current - eigenvalue_previous) / - eigenvalue_current) >= self.tol): # test convergence criteria + (eigenvalue_current - eigenvalue_previous) / eigenvalue_current) >= + self.tol): # test convergence criteria eigenvalue_previous = eigenvalue_current - Hv = torch.autograd.grad(grads, - params, - grad_outputs=v, - only_inputs=True, - retain_graph=True) + Hv = torch.autograd.grad(grads, params, grad_outputs=v, only_inputs=True, retain_graph=True) #Hv = [hv.float() for hv in Hv] Hv = [self.nan_to_num(hv).float() for hv in Hv] @@ -131,9 +125,7 @@ class Eigenvalue(object): block_eigenvalue.append(eigenvalue_current) if self.verbose: - log_dist( - f'block: {block}, power iteration: {i}, eigenvalue: {eigenvalue_current}', - ranks=[0]) + log_dist(f'block: {block}, power iteration: {i}, eigenvalue: {eigenvalue_current}', ranks=[0]) block_eigenvalue = self.post_process(block_eigenvalue) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index fdd014668..cdcc2e478 100644 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -117,10 +117,8 @@ except ImportError: def split_half_float_double_sparse(tensors): device_type = get_accelerator().device_name() supported_types = [ - "torch.{}.HalfTensor".format(device_type), - "torch.{}.FloatTensor".format(device_type), - "torch.{}.DoubleTensor".format(device_type), - "torch.{}.BFloat16Tensor".format(device_type), + "torch.{}.HalfTensor".format(device_type), "torch.{}.FloatTensor".format(device_type), + "torch.{}.DoubleTensor".format(device_type), "torch.{}.BFloat16Tensor".format(device_type), SparseTensor.type() ] @@ -149,6 +147,7 @@ STEP_GLOBAL_TIMER = 'step' class EngineTimers(object): r"""Wallclock timers for DeepSpeedEngine""" + def __init__(self, enable_micro_timers, enable_global_timers): self.forward_timers = [] self.backward_timers = [] @@ -165,10 +164,7 @@ class EngineTimers(object): self.backward_reduce_timers += [BACKWARD_REDUCE_MICRO_TIMER] self.step_timers += [STEP_MICRO_TIMER] self.micro_timers += [ - FORWARD_MICRO_TIMER, - BACKWARD_MICRO_TIMER, - BACKWARD_INNER_MICRO_TIMER, - BACKWARD_REDUCE_MICRO_TIMER, + FORWARD_MICRO_TIMER, BACKWARD_MICRO_TIMER, BACKWARD_INNER_MICRO_TIMER, BACKWARD_REDUCE_MICRO_TIMER, STEP_MICRO_TIMER ] @@ -179,16 +175,14 @@ class EngineTimers(object): self.backward_reduce_timers += [BACKWARD_REDUCE_GLOBAL_TIMER] self.step_timers += [STEP_GLOBAL_TIMER] self.global_timers += [ - FORWARD_GLOBAL_TIMER, - BACKWARD_GLOBAL_TIMER, - BACKWARD_INNER_GLOBAL_TIMER, - BACKWARD_REDUCE_GLOBAL_TIMER, + FORWARD_GLOBAL_TIMER, BACKWARD_GLOBAL_TIMER, BACKWARD_INNER_GLOBAL_TIMER, BACKWARD_REDUCE_GLOBAL_TIMER, STEP_GLOBAL_TIMER ] class DeepSpeedEngine(Module): r"""DeepSpeed engine for training.""" + def __init__( self, args, @@ -255,8 +249,7 @@ class DeepSpeedEngine(Module): from deepspeed.comm import supported_torch_version # This supported_torch_version check is for torch1.2 compatibility only if supported_torch_version: - dist.init_distributed(dist_backend=self.dist_backend, - dist_init_required=dist_init_required) + dist.init_distributed(dist_backend=self.dist_backend, dist_init_required=dist_init_required) else: if dist_init_required is None: dist_init_required = not dist.is_initialized() @@ -272,14 +265,12 @@ class DeepSpeedEngine(Module): self._do_args_sanity_check(args) self._configure_with_arguments(args, mpu) self._do_sanity_check() - see_memory_usage(f"DeepSpeed Engine: After args sanity test", - 
force=self.memory_breakdown()) + see_memory_usage(f"DeepSpeed Engine: After args sanity test", force=self.memory_breakdown()) if mpu is not None: if self.elasticity_enabled(): if not self.is_elastic_model_parallel_supported(): - assert not self.elasticity_enabled(), ( - "Elasticity is not currently supported" " with model parallelism." - ) + assert not self.elasticity_enabled(), ("Elasticity is not currently supported" + " with model parallelism.") self._set_distributed_vars(args) @@ -310,8 +301,7 @@ class DeepSpeedEngine(Module): monitor_memory=False, ) - log_dist(f"DeepSpeed Flops Profiler Enabled: {self.flops_profiler_enabled()}", - ranks=[0]) + log_dist(f"DeepSpeed Flops Profiler Enabled: {self.flops_profiler_enabled()}", ranks=[0]) if self.flops_profiler_enabled(): self.flops_profiler = FlopsProfiler(self.module, self) @@ -351,12 +341,9 @@ class DeepSpeedEngine(Module): self.sparse_tensor_module_names = set() # if self.sparse_gradients_enabled(): for name, module in self.module.named_modules(): - if isinstance(module, - (torch.nn.Embedding, - torch.nn.EmbeddingBag)) and self.sparse_gradients_enabled(): + if isinstance(module, (torch.nn.Embedding, torch.nn.EmbeddingBag)) and self.sparse_gradients_enabled(): self.sparse_tensor_module_names.add(name + ".weight") - logger.info( - "Will convert {} to sparse tensor during training".format(name)) + logger.info("Will convert {} to sparse tensor during training".format(name)) self.save_non_zero_checkpoint = False self.save_zero_checkpoint = False @@ -370,23 +357,19 @@ class DeepSpeedEngine(Module): self.progressive_layer_drop = self._configure_progressive_layer_drop() if self.curriculum_enabled_legacy(): - self.curriculum_scheduler_legacy = self._configure_curriculum_scheduler_legacy( - ) + self.curriculum_scheduler_legacy = self._configure_curriculum_scheduler_legacy() if self.random_ltd_enabled(): random_ltd_config = self.random_ltd_config() random_ltd_config[RANDOM_LTD_GLOBAL_BATCH_SIZE] = self.train_batch_size() - random_ltd_config[ - RANDOM_LTD_MICRO_BATCH_SIZE] = self.train_micro_batch_size_per_gpu() - self.random_ltd_scheduler = self._configure_random_ltd_scheduler( - random_ltd_config) + random_ltd_config[RANDOM_LTD_MICRO_BATCH_SIZE] = self.train_micro_batch_size_per_gpu() + self.random_ltd_scheduler = self._configure_random_ltd_scheduler(random_ltd_config) # Engine timers - self.engine_timers = EngineTimers( - enable_micro_timers=self.wall_clock_breakdown(), - enable_global_timers=self.wall_clock_breakdown() - or self.flops_profiler_enabled()) + self.engine_timers = EngineTimers(enable_micro_timers=self.wall_clock_breakdown(), + enable_global_timers=self.wall_clock_breakdown() + or self.flops_profiler_enabled()) if self.global_rank == 0: self._config.print("DeepSpeedEngine configuration") @@ -419,10 +402,8 @@ class DeepSpeedEngine(Module): if p.requires_grad: trainable_num_params += n if self.global_rank == 0: - self.autotuning_model_info[ - "num_params"] = num_params * self.mp_world_size - self.autotuning_model_info[ - "trainable_num_params"] = trainable_num_params * self.mp_world_size + self.autotuning_model_info["num_params"] = num_params * self.mp_world_size + self.autotuning_model_info["trainable_num_params"] = trainable_num_params * self.mp_world_size logger.info(f"model parameter = {num_params}") @@ -452,13 +433,10 @@ class DeepSpeedEngine(Module): ValueError: if ``train_batch_size`` is not divisible by the configured micro-batch size and data parallelism. 
""" - if train_batch_size % (self.train_micro_batch_size_per_gpu() * - self.dp_world_size) != 0: + if train_batch_size % (self.train_micro_batch_size_per_gpu() * self.dp_world_size) != 0: #print(f'{train_batch_size=} {self.train_micro_batch_size_per_gpu()=} {self.dp_world_size=}') - raise ValueError( - f'Train batch size must be divisible by micro-batch data parallelism') - new_gas = train_batch_size // (self.train_micro_batch_size_per_gpu() * - self.dp_world_size) + raise ValueError(f'Train batch size must be divisible by micro-batch data parallelism') + new_gas = train_batch_size // (self.train_micro_batch_size_per_gpu() * self.dp_world_size) # overwrite config self._config.train_batch_size = train_batch_size self._config.gradient_accumulation_steps = new_gas @@ -469,8 +447,7 @@ class DeepSpeedEngine(Module): def set_custom_curriculum_learning_schedule(self, schedule_func_dict): if self.training_dataloader is not None and self.curriculum_learning_enabled(): - self.training_dataloader.data_sampler.set_custom_curriculum_learning_schedule( - schedule_func_dict) + self.training_dataloader.data_sampler.set_custom_curriculum_learning_schedule(schedule_func_dict) def get_global_grad_norm(self) -> float: """Return the 2-norm of all gradients. If there is model parallelism, @@ -497,8 +474,7 @@ class DeepSpeedEngine(Module): elif name in dir(_module): return getattr(_module, name) else: - raise AttributeError( - f"'{type(self).__name__}' object has no attribute '{name}'") + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") def checkpoint_tag_validation_enabled(self): return self._config.checkpoint_tag_validation_enabled @@ -572,15 +548,13 @@ class DeepSpeedEngine(Module): return self._config.data_efficiency_config[DATA_SAMPLING] def curriculum_learning_enabled(self): - return self._config.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ - CURRICULUM_LEARNING_ENABLED] + return self._config.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][CURRICULUM_LEARNING_ENABLED] def curriculum_learning_config(self): return self._config.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING] def random_ltd_enabled(self): - return self._config.data_efficiency_config[DATA_ROUTING][RANDOM_LTD][ - RANDOM_LTD_ENABLED] + return self._config.data_efficiency_config[DATA_ROUTING][RANDOM_LTD][RANDOM_LTD_ENABLED] def random_ltd_config(self): return self._config.data_efficiency_config[DATA_ROUTING][RANDOM_LTD] @@ -588,26 +562,20 @@ class DeepSpeedEngine(Module): def random_ltd_initialize(self): assert self.random_ltd_enabled() random_ltd_config = self.random_ltd_config() - random_ltd_queue = deque( - [x for x in sorted(random_ltd_config[RANDOM_LTD_LAYER_ID])]) + random_ltd_queue = deque([x for x in sorted(random_ltd_config[RANDOM_LTD_LAYER_ID])]) count = 0 for name, layer in self.module.named_modules(): if isinstance(layer, RandomLayerTokenDrop): - if len(random_ltd_queue) != 0 and str( - random_ltd_queue[0]) in name: ###[1,2,3] - layer.init_config(random_ltd_config, - self.random_ltd_scheduler, - count) + if len(random_ltd_queue) != 0 and str(random_ltd_queue[0]) in name: ###[1,2,3] + layer.init_config(random_ltd_config, self.random_ltd_scheduler, count) random_ltd_queue.popleft() count += 1 if random_ltd_config[RANDOM_LTD_LAYER_NUM] != count: - raise ValueError( - f'random_ltd_layer_num {random_ltd_config[RANDOM_LTD_LAYER_NUM]} must be \ + raise ValueError(f'random_ltd_layer_num {random_ltd_config[RANDOM_LTD_LAYER_NUM]} must be \ equivalent to the len of 
random_ltd_layer_id {count}') - if random_ltd_config[RANDOM_LTD_LAYER_TOKEN_LR_SCHEDULE][ - RANDOM_LTD_LAYER_TOKEN_LR_ENABLED]: + if random_ltd_config[RANDOM_LTD_LAYER_TOKEN_LR_SCHEDULE][RANDOM_LTD_LAYER_TOKEN_LR_ENABLED]: assert self.client_lr_scheduler is None raise ValueError(f'not yet support') #self.lr_scheduler = lr_schedules.WarmupLayerTokenDecayLR(self.optimizer, self.random_ltd_scheduler) @@ -668,8 +636,7 @@ class DeepSpeedEngine(Module): def autotuning_profile_model_info(self): return self.autotuning_enabled( ) and self._config.autotuning_config.model_info and self._config.autotuning_config.model_info.get( - "profile", - False) + "profile", False) def sparse_gradients_enabled(self): return self._config.sparse_gradients_enabled @@ -681,8 +648,7 @@ class DeepSpeedEngine(Module): return self._config.train_micro_batch_size_per_gpu def optimizer_name(self): - return (self.client_optimizer.__class__.__name__ - if self.client_optimizer else self._config.optimizer_name) + return (self.client_optimizer.__class__.__name__ if self.client_optimizer else self._config.optimizer_name) def optimizer_params(self): return self._config.optimizer_params @@ -700,22 +666,15 @@ class DeepSpeedEngine(Module): return ( self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] [WEIGHT_QUANTIZE_IN_FORWARD_ENABLED], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_ENABLED], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_GROUPS], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_ENABLED], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_GROUPS], self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] [WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_CHANGE_RATIO], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_TYPE], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_ROUNDING], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_VERBOSE], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_KERNEL], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_CHANGE_RATIO], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_TYPE], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_ROUNDING], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_VERBOSE], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_KERNEL], ) def zero_optimization(self): @@ -741,10 +700,7 @@ class DeepSpeedEngine(Module): def zero_use_cpu_optimizer(self): if self._config.zero_config.offload_optimizer is not None: - return self._config.zero_config.offload_optimizer.device in [ - OffloadDeviceEnum.cpu, - OffloadDeviceEnum.nvme - ] + return self._config.zero_config.offload_optimizer.device in [OffloadDeviceEnum.cpu, OffloadDeviceEnum.nvme] return False def zero_cpu_offload(self): @@ -905,14 +861,11 @@ class DeepSpeedEngine(Module): # First check for scheduler in json configuration lr_scheduler = self._scheduler_from_config(self.optimizer) if lr_scheduler: - log_dist( - f"DeepSpeed using configured LR scheduler = 
{self.scheduler_name()}", - ranks=[0]) + log_dist(f"DeepSpeed using configured LR scheduler = {self.scheduler_name()}", ranks=[0]) self.lr_scheduler = lr_scheduler else: if isinstance(client_lr_scheduler, Callable): - log_dist('DeepSpeed using client callable to create LR scheduler', - ranks=[0]) + log_dist('DeepSpeed using client callable to create LR scheduler', ranks=[0]) self.lr_scheduler = client_lr_scheduler(self.basic_optimizer) else: log_dist('DeepSpeed using client LR scheduler', ranks=[0]) @@ -927,12 +880,9 @@ class DeepSpeedEngine(Module): try: from deepspeed.runtime.checkpoint_engine.nebula_checkpoint_engine import \ NebulaCheckpointEngine - self.checkpoint_engine = NebulaCheckpointEngine( - config_params=self._config.nebula_config) + self.checkpoint_engine = NebulaCheckpointEngine(config_params=self._config.nebula_config) except ImportError as err: - logger.error( - f"No torch_nebula was found! Will fall back to torch.save. Details: {err}" - ) + logger.error(f"No torch_nebula was found! Will fall back to torch.save. Details: {err}") self.checkpoint_engine = TorchCheckpointEngine() dp_rank = self.global_rank @@ -944,8 +894,7 @@ class DeepSpeedEngine(Module): # only the first data parallel process needs to store the model checkpoint # if you want to use node local storage this must be done by rank 0 on each # node - self.save_non_zero_checkpoint = ( - rank == 0) or self.zero_optimization_partition_weights() + self.save_non_zero_checkpoint = (rank == 0) or self.zero_optimization_partition_weights() if self.zero_optimization() or self.bfloat16_enabled(): param_rank = dist.get_rank(group=self.optimizer.dp_process_group) @@ -960,9 +909,8 @@ class DeepSpeedEngine(Module): if hasattr(lr_schedules, scheduler_name): scheduler = getattr(lr_schedules, scheduler_name) else: - assert hasattr( - torch.optim.lr_scheduler, scheduler_name - ), f"DeepSpeed does not recognize LR scheduler {scheduler_name}" + assert hasattr(torch.optim.lr_scheduler, + scheduler_name), f"DeepSpeed does not recognize LR scheduler {scheduler_name}" scheduler = getattr(torch.optim.lr_scheduler, scheduler_name) @@ -973,9 +921,7 @@ class DeepSpeedEngine(Module): return None def _set_distributed_vars(self, args): - device_rank = args.device_rank if args is not None and hasattr( - args, - 'device_rank') else self.local_rank + device_rank = args.device_rank if args is not None and hasattr(args, 'device_rank') else self.local_rank if device_rank >= 0: get_accelerator().set_device(device_rank) self.device = torch.device(get_accelerator().device_name(), device_rank) @@ -1005,21 +951,16 @@ class DeepSpeedEngine(Module): args.local_rank = self.local_rank if self.config is None: - self.config = (args.deepspeed_config - if hasattr(args, - "deepspeed_config") else None) + self.config = (args.deepspeed_config if hasattr(args, "deepspeed_config") else None) self._config = DeepSpeedConfig(self.config, mpu) # Validate command line arguments def _do_args_sanity_check(self, args): if hasattr(args, "deepscale_config") and args.deepscale_config is not None: - logger.warning( - "************ --deepscale_config is deprecated, please use --deepspeed_config ************" - ) + logger.warning("************ --deepscale_config is deprecated, please use --deepspeed_config ************") if hasattr(args, "deepspeed_config"): - assert ( - args.deepspeed_config is None - ), "Not sure how to proceed, we were given both a deepscale_config and deepspeed_config" + assert (args.deepspeed_config is + None), "Not sure how to proceed, we were given both a 
deepscale_config and deepspeed_config" args.deepspeed_config = args.deepscale_config assert "LOCAL_RANK" in os.environ or "OMPI_COMM_WORLD_LOCAL_RANK" in os.environ, "DeepSpeed requires the LOCAL_RANK environment " \ @@ -1027,8 +968,8 @@ class DeepSpeedEngine(Module): "different launcher please ensure LOCAL_RANK is set prior to initializing deepspeed." if hasattr(args, 'local_rank') and args.local_rank != None: - assert isinstance( - args.local_rank, int), f"args.local_rank of {args.local_rank} is an unknown type {type(args.local_rank)}" + assert isinstance(args.local_rank, + int), f"args.local_rank of {args.local_rank} is an unknown type {type(args.local_rank)}" if args.local_rank >= 0: env_local_rank = int(os.environ.get("LOCAL_RANK")) assert ( @@ -1036,16 +977,11 @@ class DeepSpeedEngine(Module): ), f"Mismatch in local rank setting, args.local_rank={args.local_rank} but env['LOCAL_RANK']={env_local_rank}." if self.config is None: - assert ( - hasattr( - args, "deepspeed_config") and args.deepspeed_config is not None - ), "DeepSpeed requires --deepspeed_config to specify configuration file" + assert (hasattr(args, "deepspeed_config") and args.deepspeed_config + is not None), "DeepSpeed requires --deepspeed_config to specify configuration file" def _is_supported_optimizer(self, optimizer_name): - return (optimizer_name in DEEPSPEED_OPTIMIZERS - or getattr(torch.optim, - optimizer_name, - None) is not None) + return (optimizer_name in DEEPSPEED_OPTIMIZERS or getattr(torch.optim, optimizer_name, None) is not None) def _supported_optims(self): FairseqOptimizer = None @@ -1070,18 +1006,11 @@ class DeepSpeedEngine(Module): if not self.client_optimizer: if self.optimizer_name() is not None: assert self._is_supported_optimizer( - self.optimizer_name() - ), "{} is not a supported DeepSpeed Optimizer".format( - self.optimizer_name() - ) + self.optimizer_name()), "{} is not a supported DeepSpeed Optimizer".format(self.optimizer_name()) - if (self.optimizer_name() == LAMB_OPTIMIZER - or self.optimizer_name() == ONEBIT_LAMB_OPTIMIZER): - assert ( - self.dynamic_loss_scale() - ), "DeepSpeed {} optimizer requires dynamic loss scaling".format( - self.optimizer_name() - ) + if (self.optimizer_name() == LAMB_OPTIMIZER or self.optimizer_name() == ONEBIT_LAMB_OPTIMIZER): + assert (self.dynamic_loss_scale()), "DeepSpeed {} optimizer requires dynamic loss scaling".format( + self.optimizer_name()) # Detect invalid combinations of client optimizer and client scheduler if isinstance(self.client_lr_scheduler, _LRScheduler): @@ -1089,6 +1018,7 @@ class DeepSpeedEngine(Module): f'Client Optimizer (type = {type(self.client_optimizer)} is not instantiated but Client LR Scheduler is instantiated' def _broadcast_model(self): + def is_replicated(p): if hasattr(p, "ds_status") and p.ds_status is not ZeroParamStatus.AVAILABLE: return False @@ -1103,20 +1033,15 @@ class DeepSpeedEngine(Module): group=self.expert_data_parallel_group[p.group_name]) else: if torch.is_tensor(p) and is_replicated(p): - dist.broadcast(p, - groups._get_broadcast_src_rank(), - group=self.data_parallel_group) + dist.broadcast(p, groups._get_broadcast_src_rank(), group=self.data_parallel_group) @staticmethod def __check_params(model: Module, dtype: torch.dtype) -> None: return - if not all(param.dtype == dtype - for param in model.parameters()) and dist.get_rank() == 0: - raise ValueError( - f"{dtype} is enabled but the following parameters have dtype that is " - f"not {dtype}: " - f"{[(n, p.dtype) for n, p in model.named_parameters() if p.dtype != 
dtype]}" - ) + if not all(param.dtype == dtype for param in model.parameters()) and dist.get_rank() == 0: + raise ValueError(f"{dtype} is enabled but the following parameters have dtype that is " + f"not {dtype}: " + f"{[(n, p.dtype) for n, p in model.named_parameters() if p.dtype != dtype]}") def _set_client_model(self, model): # register client model in _modules so that nn.module methods work correctly @@ -1130,14 +1055,12 @@ class DeepSpeedEngine(Module): if self.fp16_enabled(): if self.zero_optimization_partition_weights() and any( - [hasattr(param, - "ds_id") for param in self.module.parameters()]): + [hasattr(param, "ds_id") for param in self.module.parameters()]): self.__check_params(self.module, torch.half) self.module.half() elif self.bfloat16_enabled(): if self.zero_optimization_partition_weights() and any( - hasattr(param, - 'ds_id') for param in self.module.parameters()): + hasattr(param, 'ds_id') for param in self.module.parameters()): self.__check_params(self.module, torch.bfloat16) self.module.bfloat16() else: @@ -1191,8 +1114,7 @@ class DeepSpeedEngine(Module): return [id(param) for param in group] occurrence = sum([ - ids_list(group['params']).count(param_id) - if param_id in ids_list(group['params']) else 0 + ids_list(group['params']).count(param_id) if param_id in ids_list(group['params']) else 0 for group in optimizer.param_groups ]) assert occurrence <= 1, f"Parameter with name: {name} occurs multiple times in optimizer.param_groups. Make sure it only appears once to prevent undefined behaviour." @@ -1212,9 +1134,7 @@ class DeepSpeedEngine(Module): ), 'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.' if self.global_rank == 0: - logger.warning( - "**** You are using ZeRO with an untested optimizer, proceed with caution *****" - ) + logger.warning("**** You are using ZeRO with an untested optimizer, proceed with caution *****") if model_dtype == torch.bfloat16 and grad_accum_dtype == torch.float32 and self.zero_optimization_stage( ) == 1: @@ -1222,23 +1142,19 @@ class DeepSpeedEngine(Module): if model_dtype != grad_accum_dtype: raise NotImplementedError( - "Model data type and gradient accumulation data type must be equal to use ZeRO" - ) + "Model data type and gradient accumulation data type must be equal to use ZeRO") return ZERO_OPTIMIZATION elif amp_enabled: if model_dtype != grad_accum_dtype: raise NotImplementedError( - "Model data type and gradient accumulation data type must be equal to use Amp" - ) + "Model data type and gradient accumulation data type must be equal to use Amp") if model_dtype == torch.bfloat16 or model_dtype == torch.float16: - raise NotImplementedError( - "Cannot enable both amp with (legacy) fp16 or bfloat16 mode") + raise NotImplementedError("Cannot enable both amp with (legacy) fp16 or bfloat16 mode") try: logger.info("Initializing Apex amp from: {}".format(amp.__path__)) except NameError: # If apex/amp is available it will be imported above - raise RuntimeError( - "Unable to import apex/amp, please make sure it is installed") + raise RuntimeError("Unable to import apex/amp, please make sure it is installed") return AMP # data type checks elif model_dtype == grad_accum_dtype: @@ -1252,8 +1168,7 @@ class DeepSpeedEngine(Module): elif model_dtype == torch.bfloat16 and grad_accum_dtype == torch.float32: return BFLOAT16 else: - raise NotImplementedError( - "unsupported mix of model dtype and gradient accummulation type") + raise NotImplementedError("unsupported 
mix of model dtype and gradient accummulation type") return None @@ -1264,9 +1179,7 @@ class DeepSpeedEngine(Module): client_optimizer.param_groups[:] = [ pg for pg in client_optimizer.param_groups if len(pg["params"]) != 0 ] - log_dist( - "Removing param_group that has no 'params' in the client Optimizer", - ranks=[0]) + log_dist("Removing param_group that has no 'params' in the client Optimizer", ranks=[0]) basic_optimizer = client_optimizer log_dist('Using client Optimizer as basic optimizer', ranks=[0]) @@ -1274,24 +1187,18 @@ class DeepSpeedEngine(Module): basic_optimizer = client_optimizer(model_parameters) log_dist('Using client callable to create basic optimizer', ranks=[0]) - if self.zero_use_cpu_optimizer() and not isinstance( - basic_optimizer, - deepspeed.ops.adam.DeepSpeedCPUAdam): + if self.zero_use_cpu_optimizer() and not isinstance(basic_optimizer, deepspeed.ops.adam.DeepSpeedCPUAdam): if self.zero_force_ds_cpu_optimizer(): msg = f'You are using ZeRO-Offload with a client provided optimizer ({type(basic_optimizer)}) which in most cases will yield poor performance. Please either use deepspeed.ops.adam.DeepSpeedCPUAdam or set an optimizer in your ds-config (https://www.deepspeed.ai/docs/config-json/#optimizer-parameters). If you really want to use a custom optimizer w. ZeRO-Offload and understand the performance impacts you can also set <"zero_force_ds_cpu_optimizer": false> in your configuration file.' raise ZeRORuntimeException(msg) else: basic_optimizer = self._configure_basic_optimizer(model_parameters) - log_dist( - f"Using DeepSpeed Optimizer param name {self.optimizer_name()} as basic optimizer", - ranks=[0]) + log_dist(f"Using DeepSpeed Optimizer param name {self.optimizer_name()} as basic optimizer", ranks=[0]) self._check_for_duplicates(basic_optimizer) self.basic_optimizer = basic_optimizer - log_dist("DeepSpeed Basic Optimizer = {}".format( - basic_optimizer.__class__.__name__), - ranks=[0]) + log_dist("DeepSpeed Basic Optimizer = {}".format(basic_optimizer.__class__.__name__), ranks=[0]) optimizer_wrapper = self._do_optimizer_sanity_check(basic_optimizer) @@ -1300,9 +1207,7 @@ class DeepSpeedEngine(Module): elif optimizer_wrapper == AMP: amp_params = self.amp_params() log_dist(f"Initializing AMP with these params: {amp_params}", ranks=[0]) - model, self.optimizer = amp.initialize( - self.module, basic_optimizer, **amp_params - ) + model, self.optimizer = amp.initialize(self.module, basic_optimizer, **amp_params) self._set_client_model(model) self._broadcast_model() # TODO: maybe need to broadcast experts differently? 
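(Illustrative aside, not part of the diff: the hunk above reflows the call `model, self.optimizer = amp.initialize(self.module, basic_optimizer, **amp_params)`. For context, a minimal standalone Apex AMP loop follows; it is a sketch only, assuming Apex is installed, and `MyModel` and `get_batches()` are placeholders for a real module and data source.)

    import torch
    from apex import amp

    model = MyModel().cuda()                     # placeholder nn.Module
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    # opt_level "O1" = mixed precision with dynamic loss scaling
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    for batch in get_batches():                  # placeholder data source
        optimizer.zero_grad()
        loss = model(batch)
        # scale_loss applies AMP's loss scaling before the backward pass
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()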
@@ -1313,8 +1218,7 @@ class DeepSpeedEngine(Module): else: self.optimizer = basic_optimizer - log_dist("DeepSpeed Final Optimizer = {}".format(self.optimizer_name()), - ranks=[0]) + log_dist("DeepSpeed Final Optimizer = {}".format(self.optimizer_name()), ranks=[0]) self.compression_scheduler = self._configure_compression_scheduler() self.quantizer = self._configure_quantization() @@ -1334,22 +1238,18 @@ class DeepSpeedEngine(Module): adam_w_mode = optimizer_parameters.pop(ADAM_W_MODE, ADAM_W_MODE_DEFAULT) # Optimizer name of Adam forces AdamW logic unless adam_w_mode is explicitly set - effective_adam_w_mode = self.optimizer_name( - ) == ADAMW_OPTIMIZER or adam_w_mode + effective_adam_w_mode = self.optimizer_name() == ADAMW_OPTIMIZER or adam_w_mode if torch_adam: if not effective_adam_w_mode: - optimizer = torch.optim.Adam(model_parameters, - **optimizer_parameters) + optimizer = torch.optim.Adam(model_parameters, **optimizer_parameters) else: - optimizer = torch.optim.AdamW(model_parameters, - **optimizer_parameters) + optimizer = torch.optim.AdamW(model_parameters, **optimizer_parameters) else: if self.zero_use_cpu_optimizer(): if self.optimizer_name() == ADAGRAD_OPTIMIZER: from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad - optimizer = DeepSpeedCPUAdagrad(model_parameters, - **optimizer_parameters) + optimizer = DeepSpeedCPUAdagrad(model_parameters, **optimizer_parameters) else: from deepspeed.ops.adam import DeepSpeedCPUAdam optimizer = DeepSpeedCPUAdam(model_parameters, @@ -1374,26 +1274,21 @@ class DeepSpeedEngine(Module): optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters) if not self.fp16_enabled(): - logger.warning( - f"Currently the convergence of 1-bit Adam is only verified under FP16" - ) + logger.warning(f"Currently the convergence of 1-bit Adam is only verified under FP16") elif self.optimizer_name() == ZERO_ONE_ADAM_OPTIMIZER: assert not self.zero_optimization(), "0/1 Adam is not compatible with ZeRO" from deepspeed.runtime.fp16.onebit.zoadam import ZeroOneAdam optimizer = ZeroOneAdam(model_parameters, self, **optimizer_parameters) if not self.fp16_enabled(): - logger.warning( - f'Currently the convergence of 0/1 Adam is only verified under FP16') + logger.warning(f'Currently the convergence of 0/1 Adam is only verified under FP16') elif self.optimizer_name() == ONEBIT_LAMB_OPTIMIZER: assert not self.zero_optimization(), "1bit-Lamb is not compatible with ZeRO" from deepspeed.runtime.fp16.onebit.lamb import OnebitLamb optimizer = OnebitLamb(model_parameters, self, **optimizer_parameters) if not self.fp16_enabled(): - logger.warning( - f"Currently the convergence of 1-bit Lamb is only verified under FP16" - ) + logger.warning(f"Currently the convergence of 1-bit Lamb is only verified under FP16") else: torch_optimizer = getattr(torch.optim, self.optimizer_name()) optimizer = torch_optimizer(model_parameters, **optimizer_parameters) @@ -1418,7 +1313,8 @@ class DeepSpeedEngine(Module): use_quantizer_kernel, ) = self.quantize_training() if quantize_enabled and not quantize_weight_in_forward: - assert self.fp16_enabled(), "MoQ (quantize in optimization step) weight quantization is only supported for FP16" + assert self.fp16_enabled( + ), "MoQ (quantize in optimization step) weight quantization is only supported for FP16" quantizer = None if quantize_enabled and not quantize_weight_in_forward: from deepspeed.runtime.quantize import Quantizer @@ -1462,9 +1358,7 @@ class DeepSpeedEngine(Module): has_moe_layers=self.has_moe_layers, ) else: - log_dist( - 
f'Creating fp16 optimizer with static loss scale: {self.loss_scale()}', - ranks=[0]) + log_dist(f'Creating fp16 optimizer with static loss scale: {self.loss_scale()}', ranks=[0]) optimizer = FP16_Optimizer( optimizer, deepspeed=self, @@ -1475,8 +1369,7 @@ class DeepSpeedEngine(Module): has_moe_layers=self.has_moe_layers, ) else: - log_dist(f'Creating fp16 unfused optimizer with dynamic loss scale', - ranks=[0]) + log_dist(f'Creating fp16 unfused optimizer with dynamic loss scale', ranks=[0]) optimizer = FP16_UnfusedOptimizer( optimizer, deepspeed=self, @@ -1499,14 +1392,13 @@ class DeepSpeedEngine(Module): log_dist('Creating BF16 optimizer', ranks=[0]) timers = self.timers if self.wall_clock_breakdown() else None - optimizer = BF16_Optimizer( - optimizer, - self.param_names, - mpu=self.mpu, - clip_grad=clip_grad, - allgather_bucket_size=self.zero_allgather_bucket_size(), - dp_process_group=self.data_parallel_group, - timers=timers) + optimizer = BF16_Optimizer(optimizer, + self.param_names, + mpu=self.mpu, + clip_grad=clip_grad, + allgather_bucket_size=self.zero_allgather_bucket_size(), + dp_process_group=self.data_parallel_group, + timers=timers) return optimizer @@ -1529,8 +1421,7 @@ class DeepSpeedEngine(Module): round_robin_gradients = self.zero_round_robin_gradients() assert not isinstance(optimizer, DummyOptim), "zero stage {} requires an optimizer".format(zero_stage) - log_dist(f'Creating {model_dtype} ZeRO stage {zero_stage} optimizer', - ranks=[0]) + log_dist(f'Creating {model_dtype} ZeRO stage {zero_stage} optimizer', ranks=[0]) # Overlap and contiguous grads are meaningless in stage 1 and are ignored if zero_stage == ZeroStageEnum.optimizer_states: overlap_comm = False @@ -1541,9 +1432,7 @@ class DeepSpeedEngine(Module): if isinstance(self.module, PipelineModule): if overlap_comm: - logger.warning( - "Pipeline parallelism does not support overlapped communication, will be disabled." 
- ) + logger.warning("Pipeline parallelism does not support overlapped communication, will be disabled.") overlap_comm = False optimizer = DeepSpeedZeroOptimizer( optimizer, @@ -1557,10 +1446,8 @@ class DeepSpeedEngine(Module): reduce_bucket_size=self.zero_reduce_bucket_size(), allgather_bucket_size=self.zero_allgather_bucket_size(), dp_process_group=self.data_parallel_group, - expert_parallel_group=self.expert_parallel_group - if self.has_moe_layers else None, - expert_data_parallel_group=self.expert_data_parallel_group - if self.has_moe_layers else None, + expert_parallel_group=self.expert_parallel_group if self.has_moe_layers else None, + expert_data_parallel_group=self.expert_data_parallel_group if self.has_moe_layers else None, reduce_scatter=self.zero_reduce_scatter(), overlap_comm=overlap_comm, cpu_offload=self.zero_cpu_offload(), @@ -1572,8 +1459,7 @@ class DeepSpeedEngine(Module): partition_grads=zero_stage == ZeroStageEnum.gradients, round_robin_gradients=round_robin_gradients, has_moe_layers=self.has_moe_layers, - fp16_master_weights_and_gradients=self.fp16_master_weights_and_gradients( - ), + fp16_master_weights_and_gradients=self.fp16_master_weights_and_gradients(), communication_data_type=self.communication_data_type, elastic_checkpoint=self.zero_elastic_checkpoint()) @@ -1581,21 +1467,19 @@ class DeepSpeedEngine(Module): assert not self.has_moe_layers, "MoE not supported with Stage 3" if isinstance(optimizer, DummyOptim): log_dist("Creating ZeRO Offload", ranks=[0]) - optimizer = DeepSpeedZeRoOffload( - self.module, - timers=timers, - ds_config=self.config, - overlap_comm=self.zero_overlap_comm(), - prefetch_bucket_size=self.zero_prefetch_bucket_size(), - max_reuse_distance=self.zero_max_reuse_distance(), - max_live_parameters=self.zero_max_live_parameters(), - param_persistence_threshold=self.zero_param_persistence_threshold(), - model_persistence_threshold=self.zero_model_persistence_threshold(), - offload_param_config=self.zero_offload_param(), - mpu=self.mpu) + optimizer = DeepSpeedZeRoOffload(self.module, + timers=timers, + ds_config=self.config, + overlap_comm=self.zero_overlap_comm(), + prefetch_bucket_size=self.zero_prefetch_bucket_size(), + max_reuse_distance=self.zero_max_reuse_distance(), + max_live_parameters=self.zero_max_live_parameters(), + param_persistence_threshold=self.zero_param_persistence_threshold(), + model_persistence_threshold=self.zero_model_persistence_threshold(), + offload_param_config=self.zero_offload_param(), + mpu=self.mpu) else: - log_dist(f'Creating {model_dtype} ZeRO stage {zero_stage} optimizer', - ranks=[0]) + log_dist(f'Creating {model_dtype} ZeRO stage {zero_stage} optimizer', ranks=[0]) from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 optimizer = DeepSpeedZeroOptimizer_Stage3( self.module, @@ -1659,9 +1543,7 @@ class DeepSpeedEngine(Module): @staticmethod def is_iterable_style_dataset(obj): - return isinstance(obj, - torch.utils.data.IterableDataset - ) # hasattr(obj, "__iter__") should work as well + return isinstance(obj, torch.utils.data.IterableDataset) # hasattr(obj, "__iter__") should work as well def dataloader_drop_last(self): return self._config.dataloader_drop_last @@ -1684,8 +1566,7 @@ class DeepSpeedEngine(Module): data_sampler=None, collate_fn=None, num_local_io_workers=None): - if not (self.is_map_style_dataset(dataset) - or self.is_iterable_style_dataset(dataset)): + if not (self.is_map_style_dataset(dataset) or self.is_iterable_style_dataset(dataset)): raise ValueError("Training data must be a 
torch Dataset") if batch_size is None: @@ -1717,33 +1598,26 @@ class DeepSpeedEngine(Module): deepspeed_dataloader_config = {} if self.curriculum_learning_enabled(): deepspeed_dataloader_config = { - CURRICULUM_LEARNING: - self.curriculum_learning_enabled(), - DATA_EFFICIENCY: - self.data_efficiency_config(), - DATA_PARALLEL_GROUP: - self.data_parallel_group, - GRADIENT_ACCUMULATION_STEPS: - self.gradient_accumulation_steps(), - GLOBAL_RANK: - self.global_rank, - DATA_SAMPLING_NUM_WORKERS: - self.data_sampling_config()[DATA_SAMPLING_NUM_WORKERS] + CURRICULUM_LEARNING: self.curriculum_learning_enabled(), + DATA_EFFICIENCY: self.data_efficiency_config(), + DATA_PARALLEL_GROUP: self.data_parallel_group, + GRADIENT_ACCUMULATION_STEPS: self.gradient_accumulation_steps(), + GLOBAL_RANK: self.global_rank, + DATA_SAMPLING_NUM_WORKERS: self.data_sampling_config()[DATA_SAMPLING_NUM_WORKERS] } - return DeepSpeedDataLoader( - dataset=dataset, - batch_size=batch_size, - pin_memory=pin_memory, - collate_fn=collate_fn, - local_rank=self.local_rank, - tput_timer=deepspeed_io_timer, - num_local_io_workers=num_local_io_workers, - data_sampler=data_sampler, - data_parallel_world_size=data_parallel_world_size, - data_parallel_rank=data_parallel_rank, - dataloader_drop_last=self.dataloader_drop_last(), - deepspeed_dataloader_config=deepspeed_dataloader_config) + return DeepSpeedDataLoader(dataset=dataset, + batch_size=batch_size, + pin_memory=pin_memory, + collate_fn=collate_fn, + local_rank=self.local_rank, + tput_timer=deepspeed_io_timer, + num_local_io_workers=num_local_io_workers, + data_sampler=data_sampler, + data_parallel_world_size=data_parallel_world_size, + data_parallel_rank=data_parallel_rank, + dataloader_drop_last=self.dataloader_drop_last(), + deepspeed_dataloader_config=deepspeed_dataloader_config) def train(self, mode=True): r"""""" @@ -1770,9 +1644,7 @@ class DeepSpeedEngine(Module): else: scaled_loss = prescaled_loss if self.warn_unscaled_loss: - logger.warning( - f"DeepSpeed unable to scale loss because of type: {type(prescaled_loss)}" - ) + logger.warning(f"DeepSpeed unable to scale loss because of type: {type(prescaled_loss)}") self.warn_unscaled_loss = False return scaled_loss @@ -1790,9 +1662,8 @@ class DeepSpeedEngine(Module): else: see_memory_usage("Engine before forward", force=self.memory_breakdown()) - flops_profiler_active = (self.flops_profiler_enabled() and self.global_steps - == self.flops_profiler_profile_step() - and self.global_rank == 0) + flops_profiler_active = (self.flops_profiler_enabled() + and self.global_steps == self.flops_profiler_profile_step() and self.global_rank == 0) # used to check quantization happens at step 0! 
if self.global_steps == 0 and hasattr(self, "compression_scheduler"): @@ -1821,10 +1692,7 @@ class DeepSpeedEngine(Module): if self.module.training and self.curriculum_enabled_legacy(): self.curriculum_scheduler_legacy.update_difficulty(self.global_steps + 1) if self.curriculum_params_legacy()["curriculum_type"] == "seqlen": - kwargs.update({ - "curriculum_seqlen": - self.curriculum_scheduler_legacy.get_current_difficulty() - }) + kwargs.update({"curriculum_seqlen": self.curriculum_scheduler_legacy.get_current_difficulty()}) if self.module.training and self.random_ltd_enabled(): self.random_ltd_scheduler.update_seq(self.global_steps) @@ -1859,9 +1727,7 @@ class DeepSpeedEngine(Module): if self.autotuning_profile_model_info(): activation_mem = get_ma_status() - ma self.autotuning_model_info["activation_mem_per_gpu"] = activation_mem - print_json_dist(self.autotuning_model_info, - [0], - path=self.autotuning_model_info_path()) + print_json_dist(self.autotuning_model_info, [0], path=self.autotuning_model_info_path()) exit() else: see_memory_usage("Engine after forward", force=self.memory_breakdown()) @@ -1912,29 +1778,21 @@ class DeepSpeedEngine(Module): f'allreduce_gradients() is not valid when bfloat+pipeline_parallelism is enabled' # Pass (PP) gas boundary flag to optimizer (required for zero) - self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary( - ) + self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary() # ZeRO stage >= 2 communicates during non gradient accumulation boundaries as well if self.zero_optimization_partition_gradients(): self.optimizer.overlapping_partition_gradients_reduce_epilogue() # Communicate only at gradient accumulation boundaries elif self.is_gradient_accumulation_boundary(): - if self.zero_optimization_stage( - ) == ZeroStageEnum.optimizer_states and hasattr(self.optimizer, - 'reduce_gradients'): - self.optimizer.reduce_gradients( - pipeline_parallel=self.pipeline_parallelism) + if self.zero_optimization_stage() == ZeroStageEnum.optimizer_states and hasattr( + self.optimizer, 'reduce_gradients'): + self.optimizer.reduce_gradients(pipeline_parallel=self.pipeline_parallelism) else: self.buffered_allreduce_fallback(elements_per_buffer=bucket_size) @instrument_w_nvtx - def backward(self, - loss, - allreduce_gradients=True, - release_loss=False, - retain_graph=False, - scale_wrt_gas=True): + def backward(self, loss, allreduce_gradients=True, release_loss=False, retain_graph=False, scale_wrt_gas=True): r"""Execute backward pass on the loss Arguments: loss: Torch tensor on which to execute backward propagation @@ -1949,9 +1807,7 @@ class DeepSpeedEngine(Module): scale_wrt_gas = self.scale_wrt_gas if not allreduce_gradients: - logger.warning( - f"Argument `allreduce_gradients` is deprecated, ignored, and will soon be removed" - ) + logger.warning(f"Argument `allreduce_gradients` is deprecated, ignored, and will soon be removed") # scale loss w.r.t. 
gradient accumulation if needed if self.gradient_accumulation_steps() > 1 and scale_wrt_gas: @@ -1976,16 +1832,13 @@ class DeepSpeedEngine(Module): self._start_timers(self.engine_timers.backward_inner_timers) if self.zero_optimization(): - self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary( - ) + self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary() self.optimizer.backward(loss, retain_graph=retain_graph) elif self.amp_enabled(): # AMP requires delaying unscale when inside gradient accumulation boundaries # https://nvidia.github.io/apex/advanced.html#gradient-accumulation-across-iterations delay_unscale = not self.is_gradient_accumulation_boundary() - with amp.scale_loss(loss, - self.optimizer, - delay_unscale=delay_unscale) as scaled_loss: + with amp.scale_loss(loss, self.optimizer, delay_unscale=delay_unscale) as scaled_loss: scaled_loss.backward(retain_graph=retain_graph) elif self.fp16_enabled(): if self.eigenvalue_enabled(): @@ -2068,22 +1921,17 @@ class DeepSpeedEngine(Module): param.grad = None def clip_fp32_gradients(self): - clip_grad_norm_(parameters=self.module.parameters(), - max_norm=self.gradient_clipping(), - mpu=self.mpu) + clip_grad_norm_(parameters=self.module.parameters(), max_norm=self.gradient_clipping(), mpu=self.mpu) def _take_model_step(self, lr_kwargs, block_eigenvalue={}): if self.gradient_clipping() > 0.0: - if not (self.fp16_enabled() or self.bfloat16_enabled() or self.amp_enabled() - or self.zero_optimization()): + if not (self.fp16_enabled() or self.bfloat16_enabled() or self.amp_enabled() or self.zero_optimization()): self.clip_fp32_gradients() elif self.amp_enabled(): # AMP's recommended way of doing clipping # https://nvidia.github.io/apex/advanced.html#gradient-clipping master_params = amp.master_params(self.optimizer) - clip_grad_norm_(parameters=master_params, - max_norm=self.gradient_clipping(), - mpu=self.mpu) + clip_grad_norm_(parameters=master_params, max_norm=self.gradient_clipping(), mpu=self.mpu) self.optimizer.step() if hasattr(self.optimizer, '_global_grad_norm'): @@ -2149,8 +1997,7 @@ class DeepSpeedEngine(Module): # Check early because self.global_steps is incremented at some point here. # TODO: Delay self.global_steps increment until very end of this function. 
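A rough sketch of the clipping dispatch in _take_model_step above, assuming plain torch and omitting the engine's wrappers: fp32 training clips the module's own gradients directly, while the fp16/bf16/AMP/ZeRO paths leave clipping to the wrapped optimizer that owns the master gradients.

import torch

def clip_if_fp32(module: torch.nn.Module, max_norm: float,
                 fp16: bool = False, bf16: bool = False, amp: bool = False, zero: bool = False) -> None:
    # plain fp32 training clips the module's own grads; the other modes defer to their wrapped optimizers
    if max_norm > 0.0 and not (fp16 or bf16 or amp or zero):
        torch.nn.utils.clip_grad_norm_(module.parameters(), max_norm)

m = torch.nn.Linear(8, 8)
m(torch.randn(2, 8)).sum().backward()
clip_if_fp32(m, max_norm=1.0)  # clips here because no mixed-precision/ZeRO wrapper is involved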
flops_profiler_active = self.flops_profiler_enabled( - ) and self.global_steps == self.flops_profiler_profile_step( - ) and self.global_rank == 0 + ) and self.global_steps == self.flops_profiler_profile_step() and self.global_rank == 0 self._start_timers(self.engine_timers.step_timers) @@ -2165,20 +2012,16 @@ class DeepSpeedEngine(Module): if self.is_gradient_accumulation_boundary(): self.gas_boundary_ctr += 1 - if (self.eigenvalue_enabled() and - (self.gas_boundary_ctr % self.eigenvalue_gas_boundary_resolution() == 0) + if (self.eigenvalue_enabled() and (self.gas_boundary_ctr % self.eigenvalue_gas_boundary_resolution() == 0) and self.quantizer.any_precision_switch()): log_dist(f"computing eigenvalue...", ranks=[0]) - self.block_eigenvalue = self.eigenvalue.compute_eigenvalue( - self.module, - self.device, - self.optimizer.cur_scale) + self.block_eigenvalue = self.eigenvalue.compute_eigenvalue(self.module, self.device, + self.optimizer.cur_scale) if self.progressive_layer_drop: self.progressive_layer_drop.update_state(self.global_steps) - if (self.eigenvalue_enabled() and not self.gas_boundary_ctr % - self.eigenvalue_gas_boundary_resolution() + if (self.eigenvalue_enabled() and not self.gas_boundary_ctr % self.eigenvalue_gas_boundary_resolution() and self.quantizer.any_precision_switch()): self._take_model_step(lr_kwargs, self.block_eigenvalue) else: @@ -2186,8 +2029,7 @@ class DeepSpeedEngine(Module): report_progress = self.global_rank == 0 if self.global_rank else True - self.tput_timer.stop(global_step=self.is_gradient_accumulation_boundary(), - report_speed=report_progress) + self.tput_timer.stop(global_step=self.is_gradient_accumulation_boundary(), report_speed=report_progress) self._stop_timers(self.engine_timers.step_timers) @@ -2195,9 +2037,7 @@ class DeepSpeedEngine(Module): if self.monitor.enabled: if self.is_gradient_accumulation_boundary(): if self.global_rank == 0: - self.summary_events = [(f"Train/Samples/lr", - self.get_lr()[0], - self.global_samples)] + self.summary_events = [(f"Train/Samples/lr", self.get_lr()[0], self.global_samples)] if self.fp16_enabled() and hasattr(self.optimizer, "cur_scale"): self.summary_events.append(( @@ -2206,8 +2046,8 @@ class DeepSpeedEngine(Module): self.global_samples, )) - if (self.eigenvalue_enabled() and not self.gas_boundary_ctr % - self.eigenvalue_gas_boundary_resolution()): + if (self.eigenvalue_enabled() + and not self.gas_boundary_ctr % self.eigenvalue_gas_boundary_resolution()): ev_values = self.block_eigenvalue.values() for i in range(len(ev_values)): self.summary_events.append(( @@ -2231,14 +2071,12 @@ class DeepSpeedEngine(Module): ) self.flops_profiler.end_profile() - if self.autotuning_enabled() and self.global_steps == ( - self.autotuning_end_profile_step() + 1): + if self.autotuning_enabled() and self.global_steps == (self.autotuning_end_profile_step() + 1): self._autotuning_exit() if self.wall_clock_breakdown(): # Log micro timing and reset - self.timers.log(names=self.engine_timers.micro_timers, - memory_breakdown=self.memory_breakdown()) + self.timers.log(names=self.engine_timers.micro_timers, memory_breakdown=self.memory_breakdown()) if self.wall_clock_breakdown() or self.flops_profiler_enabled(): # Log global timing and reset @@ -2272,13 +2110,10 @@ class DeepSpeedEngine(Module): FORWARD_GLOBAL_TIMER, BACKWARD_GLOBAL_TIMER, STEP_GLOBAL_TIMER, - ], - reset=False) - titer = msg[FORWARD_GLOBAL_TIMER] + msg[BACKWARD_GLOBAL_TIMER] + msg[ - STEP_GLOBAL_TIMER] + ], reset=False) + titer = msg[FORWARD_GLOBAL_TIMER] + 
msg[BACKWARD_GLOBAL_TIMER] + msg[STEP_GLOBAL_TIMER] msg["latency"] = titer - msg["FLOPS_per_gpu"] = self.flops * 1_000_000 * self.gradient_accumulation_steps( - ) / titer + msg["FLOPS_per_gpu"] = self.flops * 1_000_000 * self.gradient_accumulation_steps() / titer msg["throughput"] = self.train_batch_size() * 1_000_000 / \ msg["latency"] print_json_dist(msg, [0], path=self.autotuning_metric_path()) @@ -2352,8 +2187,7 @@ class DeepSpeedEngine(Module): def _report_progress(self, step): lr = self.get_lr() mom = self.get_mom() - log_dist(f"step={step}, skipped={self.skipped_steps}, lr={lr}, mom={mom}", - ranks=[0]) + log_dist(f"step={step}, skipped={self.skipped_steps}, lr={lr}, mom={mom}", ranks=[0]) def allreduce_bucket(self, bucket, dp_group): tensor = self.flatten(bucket) @@ -2369,10 +2203,8 @@ class DeepSpeedEngine(Module): dist.all_reduce(tensor_to_allreduce, group=dp_group) if self.gradient_average: - if self.gradient_predivide_factor() != dist.get_world_size( - group=dp_group): - tensor_to_allreduce.mul_(self.gradient_predivide_factor() / - dist.get_world_size(group=dp_group)) + if self.gradient_predivide_factor() != dist.get_world_size(group=dp_group): + tensor_to_allreduce.mul_(self.gradient_predivide_factor() / dist.get_world_size(group=dp_group)) else: tensor_to_allreduce.mul_(1. / dist.get_world_size(group=dp_group)) dist.all_reduce(tensor_to_allreduce, group=dp_group) @@ -2414,9 +2246,7 @@ class DeepSpeedEngine(Module): # rank is reducing the same size. In some cases it may make # sense in the future to support the ability to average not # w.r.t. world size but with a different value. - param.grad = torch.zeros(param.size(), - dtype=param.dtype, - device=param.device) + param.grad = torch.zeros(param.size(), dtype=param.dtype, device=param.device) grad_data = param.grad.data if param_name in self.sparse_tensor_module_names or grad_data.is_sparse: @@ -2443,9 +2273,7 @@ class DeepSpeedEngine(Module): if bucket_type == SparseTensor.type(): self.sparse_allreduce_no_retain(bucket, dp_group=dp_group) else: - self.allreduce_no_retain(bucket, - dp_group=dp_group, - numel_per_bucket=elements_per_buffer) + self.allreduce_no_retain(bucket, dp_group=dp_group, numel_per_bucket=elements_per_buffer) def _reduce_expert_gradients(self, expert_grads, elements_per_buffer): for ep_name, expert_grads_group in expert_grads.items(): @@ -2453,15 +2281,12 @@ class DeepSpeedEngine(Module): for i, bucket_tuple in enumerate(expert_split_buckets): bucket_type, bucket = bucket_tuple if bucket_type == SparseTensor.type(): - self.sparse_allreduce_no_retain( - bucket, - groups._get_expert_data_parallel_group(ep_name)) + self.sparse_allreduce_no_retain(bucket, groups._get_expert_data_parallel_group(ep_name)) else: # Separate between diff groups - self.allreduce_no_retain( - bucket, - dp_group=groups._get_expert_data_parallel_group(ep_name), - numel_per_bucket=elements_per_buffer) + self.allreduce_no_retain(bucket, + dp_group=groups._get_expert_data_parallel_group(ep_name), + numel_per_bucket=elements_per_buffer) def buffered_allreduce_fallback(self, grads=None, elements_per_buffer=500000000): if grads is None: @@ -2504,8 +2329,7 @@ class DeepSpeedEngine(Module): if self.postscale_gradients(): if self.gradient_average: - values.mul_(self.gradient_predivide_factor() / - dist.get_world_size(group=dp_group)) + values.mul_(self.gradient_predivide_factor() / dist.get_world_size(group=dp_group)) else: values.mul_(1. 
/ dist.get_world_size(group=dp_group)) @@ -2526,36 +2350,25 @@ class DeepSpeedEngine(Module): if value.dim() == 1: if fill_size > 0: value = torch.cat([value, value.new_empty(fill_size)]) - tensor_list = [ - value.new_empty(max_size) - for _ in range(dist.get_world_size(group=dp_group)) - ] + tensor_list = [value.new_empty(max_size) for _ in range(dist.get_world_size(group=dp_group))] else: if fill_size > 0: value = torch.cat([value, value.new_empty(fill_size, value.size()[1])]) tensor_list = [ value.new_empty(max_size, - value.size()[1]) - for _ in range(dist.get_world_size(group=dp_group)) + value.size()[1]) for _ in range(dist.get_world_size(group=dp_group)) ] dist.all_gather(tensor_list, value, group=dp_group) tensors = [] for dev_idx, t in enumerate(tensor_list): size = all_sizes[dev_idx][0] - tensors.append( - t.index_select(0, - torch.arange(size, - dtype=torch.long, - device=self.device))) + tensors.append(t.index_select(0, torch.arange(size, dtype=torch.long, device=self.device))) return tensors def all_gather_scalar(self, value, dp_group): - tensor_list = [ - value.new_zeros(value.size()) - for _ in range(dist.get_world_size(group=dp_group)) - ] + tensor_list = [value.new_zeros(value.size()) for _ in range(dist.get_world_size(group=dp_group))] dist.all_gather(tensor_list, value, group=dp_group) return tensor_list @@ -2575,20 +2388,19 @@ class DeepSpeedEngine(Module): num_experts=1, checkpoint_engine=TorchCheckpointEngine()): if old_moe_load: - expp_rank = groups._get_expert_data_parallel_rank( - groups._get_max_expert_size_name()) + expp_rank = groups._get_expert_data_parallel_rank(groups._get_max_expert_size_name()) - num_local_experts = max( - num_experts) // groups._get_expert_parallel_world_size( - groups._get_max_expert_size_name()) + num_local_experts = max(num_experts) // groups._get_expert_parallel_world_size( + groups._get_max_expert_size_name()) for local_expert_id in range(num_local_experts): global_expert_id = expp_rank * num_local_experts + local_expert_id - expert_state_dict = checkpoint_engine.load(DeepSpeedEngine._get_expert_ckpt_name( - checkpoint_path, - -1, # -1 means ignore layer_id - global_expert_id, - tag, - mpu), + expert_state_dict = checkpoint_engine.load( + DeepSpeedEngine._get_expert_ckpt_name( + checkpoint_path, + -1, # -1 means ignore layer_id + global_expert_id, + tag, + mpu), map_location=torch.device('cpu')) # Updating global -> local expert ids @@ -2609,21 +2421,15 @@ class DeepSpeedEngine(Module): # loop all local_experts for local_expert_id in range(num_local_experts): global_expert_id = expp_rank * num_local_experts + local_expert_id - expert_state_dict = checkpoint_engine.load( - DeepSpeedEngine._get_expert_ckpt_name( - checkpoint_path, - moe_layer_id, - global_expert_id, - tag, - mpu), - map_location=torch.device('cpu')) + expert_state_dict = checkpoint_engine.load(DeepSpeedEngine._get_expert_ckpt_name( + checkpoint_path, moe_layer_id, global_expert_id, tag, mpu), + map_location=torch.device('cpu')) # print(expert_state_dict.keys()) # Updating global -> local expert ids moe_str_prefix = '.deepspeed_moe.experts.deepspeed_experts.' 
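The sparse all_gather path reformatted above pads each rank's variable-length tensor to the common maximum, gathers fixed-size buffers, and trims each gathered buffer back to its true length. A hedged single-process stand-in for that pattern (helper names are illustrative; the real code exchanges the buffers with dist.all_gather):

import torch

def pad_to(value: torch.Tensor, max_size: int) -> torch.Tensor:
    fill = max_size - value.numel()
    return torch.cat([value, value.new_empty(fill)]) if fill > 0 else value

def trim(gathered: torch.Tensor, true_size: int) -> torch.Tensor:
    return gathered.index_select(0, torch.arange(true_size, dtype=torch.long))

# single-process stand-in for two ranks holding values of length 3 and 5
values, sizes = [torch.arange(3.0), torch.arange(5.0)], [3, 5]
max_size = max(sizes)
padded = [pad_to(v, max_size) for v in values]        # fixed-size buffers an all_gather could exchange
restored = [trim(p, s) for p, s in zip(padded, sizes)]
assert [r.numel() for r in restored] == sizes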
for key in list(expert_state_dict.keys()): - local_key = key.replace( - f'{moe_str_prefix}{global_expert_id}', - f'{moe_str_prefix}{local_expert_id}') + local_key = key.replace(f'{moe_str_prefix}{global_expert_id}', + f'{moe_str_prefix}{local_expert_id}') expert_state_dict[local_key] = expert_state_dict.pop(key) state_dict.update(expert_state_dict) moe_layer_id += 1 @@ -2632,18 +2438,14 @@ class DeepSpeedEngine(Module): if custom_load_fn: custom_load_fn(src=state_dict, dst=self.module) else: - self.module.load_state_dict(state_dict, # TODO - strict=strict) + self.module.load_state_dict( + state_dict, # TODO + strict=strict) def _get_zero_ckpt_prefix(self, dp_rank, bf16_mode): return f'{"bf16_" if bf16_mode else ""}zero_pp_rank_{dp_rank}' - def _get_rank_zero_ckpt_name(self, - checkpoints_path, - tag, - mp_rank, - dp_rank, - bf16_mode): + def _get_rank_zero_ckpt_name(self, checkpoints_path, tag, mp_rank, dp_rank, bf16_mode): file_prefix = self._get_zero_ckpt_prefix(dp_rank, bf16_mode=bf16_mode) zero_ckpt_name = os.path.join( checkpoints_path, @@ -2656,11 +2458,7 @@ class DeepSpeedEngine(Module): mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() pp_rank = dist.get_rank(group=self.optimizer.dp_process_group) bf16_mode = self.bfloat16_enabled() - return self._get_rank_zero_ckpt_name(checkpoints_path, - tag, - mp_rank, - pp_rank, - bf16_mode) + return self._get_rank_zero_ckpt_name(checkpoints_path, tag, mp_rank, pp_rank, bf16_mode) def _get_ckpt_name(self, checkpoints_path, tag, mp_placeholder=None): if mp_placeholder is not None: @@ -2670,8 +2468,7 @@ class DeepSpeedEngine(Module): mp_rank_str = f"{mp_rank:02d}" if self.zero_optimization_partition_weights(): - filename = "zero_pp_rank_{}".format( - dist.get_rank(group=self.optimizer.dp_process_group)) + filename = "zero_pp_rank_{}".format(dist.get_rank(group=self.optimizer.dp_process_group)) ckpt_name = os.path.join( checkpoints_path, str(tag), @@ -2687,10 +2484,8 @@ class DeepSpeedEngine(Module): def _get_optimizer_ckpt_name(self, checkpoints_path, tag, expp_rank): mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() - ckpt_name = os.path.join( - checkpoints_path, - str(tag), - f'expp_rank_{expp_rank}_mp_rank_{mp_rank:02d}_optim_states.pt') + ckpt_name = os.path.join(checkpoints_path, str(tag), + f'expp_rank_{expp_rank}_mp_rank_{mp_rank:02d}_optim_states.pt') return ckpt_name @staticmethod @@ -2698,24 +2493,17 @@ class DeepSpeedEngine(Module): mp_rank = 0 if mpu is None else mpu.get_model_parallel_rank() if layer_id <= -1: # Used to support old checkpoint loading - ckpt_name = os.path.join( - checkpoints_path, - '' if tag is None else str(tag), - f'expert_{expert_id}_mp_rank_{mp_rank:02d}_model_states.pt') + ckpt_name = os.path.join(checkpoints_path, '' if tag is None else str(tag), + f'expert_{expert_id}_mp_rank_{mp_rank:02d}_model_states.pt') else: # Used to support new checkpoint loading - ckpt_name = os.path.join( - checkpoints_path, - '' if tag is None else str(tag), - f'layer_{layer_id}_expert_{expert_id}_mp_rank_{mp_rank:02d}_model_states.pt' - ) + ckpt_name = os.path.join(checkpoints_path, '' if tag is None else str(tag), + f'layer_{layer_id}_expert_{expert_id}_mp_rank_{mp_rank:02d}_model_states.pt') return ckpt_name def _get_all_ckpt_names(self, checkpoints_path, tag): # It is required that (checkpoints_path, tag) are consistent among all ranks. 
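The MoE checkpoint loading above rewrites expert keys saved under a global expert id to the local id owned by the current expert-parallel rank before merging them into the module state_dict. A minimal sketch of that remapping (the prefix is taken from the code above; other names are illustrative):

MOE_STR_PREFIX = '.deepspeed_moe.experts.deepspeed_experts.'

def remap_expert_keys(expert_sd: dict, global_id: int, local_id: int) -> dict:
    # per the loader above: global_expert_id = expp_rank * num_local_experts + local_expert_id
    return {k.replace(f'{MOE_STR_PREFIX}{global_id}', f'{MOE_STR_PREFIX}{local_id}'): v
            for k, v in expert_sd.items()}

sd = {f'layer.0{MOE_STR_PREFIX}7.fc.weight': 'tensor'}
assert f'layer.0{MOE_STR_PREFIX}1.fc.weight' in remap_expert_keys(sd, global_id=7, local_id=1)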
- ckpt_file_pattern = self._get_ckpt_name(checkpoints_path, - tag, - mp_placeholder="*") + ckpt_file_pattern = self._get_ckpt_name(checkpoints_path, tag, mp_placeholder="*") import glob ckpt_files = glob.glob(ckpt_file_pattern) @@ -2755,17 +2543,14 @@ class DeepSpeedEngine(Module): """ if tag is None: - latest_tag = "latest_universal" if self.load_universal_checkpoint( - ) else "latest" + latest_tag = "latest_universal" if self.load_universal_checkpoint() else "latest" latest_path = os.path.join(load_dir, latest_tag) if os.path.isfile(latest_path): with open(latest_path, "r") as fd: tag = fd.read().strip() else: if self.load_universal_checkpoint(): - raise ValueError( - f'Invalid for universal checkpoint: {latest_path} does not exist' - ) + raise ValueError(f'Invalid for universal checkpoint: {latest_path} does not exist') else: logger.warning( f"Unable to find latest file at {latest_path}, if trying to load latest " @@ -2787,10 +2572,7 @@ class DeepSpeedEngine(Module): load_zero_checkpoint = self.zero_optimization() or self.bfloat16_enabled() if load_zero_checkpoint and load_path is not None: - success = self._load_zero_checkpoint( - load_dir, - tag, - load_optimizer_states=load_optimizer_states) + success = self._load_zero_checkpoint(load_dir, tag, load_optimizer_states=load_optimizer_states) if not success: self.optimizer._restore_from_bit16_weights() @@ -2811,16 +2593,12 @@ class DeepSpeedEngine(Module): from deepspeed.runtime.state_dict_factory import SDLoaderFactory ckpt_list = self._get_all_ckpt_names(load_dir, tag) - sd_loader = SDLoaderFactory.get_sd_loader( - ckpt_list, - checkpoint_engine=self.checkpoint_engine) + sd_loader = SDLoaderFactory.get_sd_loader(ckpt_list, checkpoint_engine=self.checkpoint_engine) is_pipe_parallel = isinstance(self.module, PipelineModule) mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() - load_path, checkpoint, _ = sd_loader.load( - self.mp_world_size, mp_rank, is_pipe_parallel=is_pipe_parallel - ) + load_path, checkpoint, _ = sd_loader.load(self.mp_world_size, mp_rank, is_pipe_parallel=is_pipe_parallel) if checkpoint is None: return None, None @@ -2858,38 +2636,29 @@ class DeepSpeedEngine(Module): largest_group_name = groups._get_max_expert_size_name() expp_rank = groups._get_expert_parallel_rank(largest_group_name) optim_load_path = self._get_optimizer_ckpt_name(load_dir, tag, expp_rank) - optim_checkpoint = self.checkpoint_engine.load( - optim_load_path, - map_location=torch.device('cpu')) + optim_checkpoint = self.checkpoint_engine.load(optim_load_path, map_location=torch.device('cpu')) else: optim_checkpoint = checkpoint - has_zero_optimizer_state = self.zero_optimization() or self.bfloat16_enabled( - ) + has_zero_optimizer_state = self.zero_optimization() or self.bfloat16_enabled() if load_optimizer_states and self.optimizer is not None and not has_zero_optimizer_state: if self.fp16_enabled(): - self.optimizer.load_state_dict( - optim_checkpoint['optimizer'], - load_optimizer_states=load_optimizer_states) + self.optimizer.load_state_dict(optim_checkpoint['optimizer'], + load_optimizer_states=load_optimizer_states) else: self.optimizer.load_state_dict(optim_checkpoint['optimizer']) if load_lr_scheduler_states and self.lr_scheduler is not None: self.lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) - if self.random_ltd_enabled( - ) and self.random_ltd_scheduler is not None and 'random_ltd' in checkpoint: + if self.random_ltd_enabled() and self.random_ltd_scheduler is not None and 'random_ltd' in checkpoint: 
self.random_ltd_scheduler.load_state_dict(checkpoint['random_ltd']) if self.training_dataloader is not None and self.curriculum_learning_enabled( ) and 'data_sampler' in checkpoint: - self.training_dataloader.data_sampler.load_state_dict( - checkpoint['data_sampler']) + self.training_dataloader.data_sampler.load_state_dict(checkpoint['data_sampler']) - def get_sparse_tensor_module_names(original_set, - loaded_set, - original_parameters, - loaded_parameters): + def get_sparse_tensor_module_names(original_set, loaded_set, original_parameters, loaded_parameters): result = set() for name in original_set: @@ -2899,8 +2668,7 @@ class DeepSpeedEngine(Module): for name in loaded_set: if name in original_parameters: - result.add( - name) # parameter exists in both configs and it was sparse + result.add(name) # parameter exists in both configs and it was sparse return result @@ -2915,26 +2683,16 @@ class DeepSpeedEngine(Module): self.sparse_tensor_module_names = sparse_tensor_module_names else: self.sparse_tensor_module_names = get_sparse_tensor_module_names( - self.sparse_tensor_module_names, - sparse_tensor_module_names, - dict(self.module.named_parameters()), - checkpoint["module"]) + self.sparse_tensor_module_names, sparse_tensor_module_names, + dict(self.module.named_parameters()), checkpoint["module"]) self.global_steps = checkpoint['global_steps'] - self.global_samples = checkpoint.get( - 'global_samples', - self.global_steps * self.train_batch_size()) + self.global_samples = checkpoint.get('global_samples', self.global_steps * self.train_batch_size()) self.skipped_steps = checkpoint['skipped_steps'] self.loaded_checkpoint_mp_world_size = checkpoint['mp_world_size'] deepspeed_states = [ - 'module', - 'sparse_tensor_module_names', - 'skipped_steps', - 'global_steps', - 'dp_world_size', - 'mp_world_size', - 'data_sampler', - 'random_ltd' + 'module', 'sparse_tensor_module_names', 'skipped_steps', 'global_steps', 'dp_world_size', + 'mp_world_size', 'data_sampler', 'random_ltd' ] client_state = {} @@ -2943,11 +2701,7 @@ class DeepSpeedEngine(Module): if load_optimizer_states: deepspeed_states.append('optimizer') - client_state = { - key: value - for key, - value in checkpoint.items() if not key in deepspeed_states - } + client_state = {key: value for key, value in checkpoint.items() if not key in deepspeed_states} if not load_optimizer_states and not load_module_only: client_state['optimizer'] = optim_checkpoint['optimizer'] @@ -2970,28 +2724,18 @@ class DeepSpeedEngine(Module): if zero_sd_list is None: return False - self.optimizer.load_state_dict( - state_dict_list=zero_sd_list, - load_optimizer_states=load_optimizer_states, - load_from_fp32_weights=self.zero_load_from_fp32_weights(), - checkpoint_folder=checkpoint_folder) + self.optimizer.load_state_dict(state_dict_list=zero_sd_list, + load_optimizer_states=load_optimizer_states, + load_from_fp32_weights=self.zero_load_from_fp32_weights(), + checkpoint_folder=checkpoint_folder) if self.load_universal_checkpoint(): - logger.info( - f'loaded universal zero checkpoints from {checkpoint_folder} for rank {self.global_rank}' - ) + logger.info(f'loaded universal zero checkpoints from {checkpoint_folder} for rank {self.global_rank}') else: - logger.info( - f"loading {len(zero_sd_list)} zero partition checkpoints for rank {self.global_rank}" - ) + logger.info(f"loading {len(zero_sd_list)} zero partition checkpoints for rank {self.global_rank}") return True - def _get_mp_rank_zero_checkpoint_names(self, - load_dir, - tag, - mp_rank, - dp_world_size, - 
bf16_mode): + def _get_mp_rank_zero_checkpoint_names(self, load_dir, tag, mp_rank, dp_world_size, bf16_mode): zero_ckpt_names = [] for dp_rank in range(dp_world_size): ckpt_name = self._get_rank_zero_ckpt_name(checkpoints_path=load_dir, @@ -3005,18 +2749,16 @@ class DeepSpeedEngine(Module): def _get_all_zero_checkpoint_names(self, load_dir, tag, bf16_mode): mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() - zero_ckpt_names = self._get_mp_rank_zero_checkpoint_names( - load_dir=load_dir, - tag=tag, - mp_rank=mp_rank, - dp_world_size=self.loaded_checkpoint_dp_world_size, - bf16_mode=bf16_mode) + zero_ckpt_names = self._get_mp_rank_zero_checkpoint_names(load_dir=load_dir, + tag=tag, + mp_rank=mp_rank, + dp_world_size=self.loaded_checkpoint_dp_world_size, + bf16_mode=bf16_mode) for i, ckpt_name in enumerate(zero_ckpt_names): if not os.path.exists(ckpt_name): # transparently handle the old file pattern for optim_states if "optim_states.pt" in ckpt_name: - ckpt_name_try = ckpt_name.replace("_optim_states.pt", - "optim_states.pt") + ckpt_name_try = ckpt_name.replace("_optim_states.pt", "optim_states.pt") if os.path.exists(ckpt_name_try): zero_ckpt_names[i] = ckpt_name_try continue @@ -3030,8 +2772,7 @@ class DeepSpeedEngine(Module): if ckpt_name is None: _state = {OPTIMIZER_STATE_DICT: None} # Fully load state for current rank - elif self.zero_elastic_checkpoint() or dist.get_rank( - group=self.optimizer.dp_process_group) == i: + elif self.zero_elastic_checkpoint() or dist.get_rank(group=self.optimizer.dp_process_group) == i: _state = self.checkpoint_engine.load( ckpt_name, map_location='cpu', @@ -3041,25 +2782,18 @@ class DeepSpeedEngine(Module): zero_sd_list.append(_state) zero_optimizer_sd = [sd[OPTIMIZER_STATE_DICT] for sd in zero_sd_list] - logger.info( - f"successfully read {len(zero_optimizer_sd)} ZeRO state_dicts for rank {self.global_rank}" - ) + logger.info(f"successfully read {len(zero_optimizer_sd)} ZeRO state_dicts for rank {self.global_rank}") return zero_optimizer_sd def _get_all_zero_checkpoints(self, load_dir, tag): for bf16_mode in [self.bfloat16_enabled(), not self.bfloat16_enabled()]: - zero_ckpt_names = self._get_all_zero_checkpoint_names( - load_dir, - tag, - bf16_mode) + zero_ckpt_names = self._get_all_zero_checkpoint_names(load_dir, tag, bf16_mode) if zero_ckpt_names is not None: # Warn if loading checkpoint of different bit16 type if bf16_mode is not self.bfloat16_enabled(): checkpoint_bit16 = BFLOAT16 if bf16_mode else FP16 engine_bit16 = BFLOAT16 if self.bfloat16_enabled() else FP16 - logger.warn( - f'Loading {checkpoint_bit16} zero checkpoints into {engine_bit16} training engine' - ) + logger.warn(f'Loading {checkpoint_bit16} zero checkpoints into {engine_bit16} training engine') return self._get_all_zero_checkpoint_state_dicts(zero_ckpt_names) return None @@ -3073,10 +2807,9 @@ class DeepSpeedEngine(Module): dist.all_reduce(max_bhash, op=dist.ReduceOp.MAX) dist.all_reduce(min_bhash, op=dist.ReduceOp.MIN) valid = all(min_bhash == bhash) and all(max_bhash == bhash) - msg = ( - f"[rank={dist.get_rank()}] The checkpoint tag name '{tag}' is not consistent across " - "all ranks. Including rank unique information in checkpoint tag could cause issues when " - "restoring with different world sizes.") + msg = (f"[rank={dist.get_rank()}] The checkpoint tag name '{tag}' is not consistent across " + "all ranks. 
Including rank unique information in checkpoint tag could cause issues when " + "restoring with different world sizes.") if self.checkpoint_tag_validation_fail(): assert valid, msg elif not valid: @@ -3208,15 +2941,9 @@ class DeepSpeedEngine(Module): # let save the moe parameters for global_expert_id, expert_state_dict in experts_state_dict.items(): # save the moe parameters - moe_save_path = self._get_expert_ckpt_name( - save_dir, - moe_layer_id, - global_expert_id, - tag, - self.mpu) + moe_save_path = self._get_expert_ckpt_name(save_dir, moe_layer_id, global_expert_id, tag, self.mpu) if self.random_ltd_enabled(): - expert_state_dict = remove_random_ltd_state_dict( - expert_state_dict) + expert_state_dict = remove_random_ltd_state_dict(expert_state_dict) self.checkpoint_engine.save(expert_state_dict, moe_save_path) moe_layer_id += 1 @@ -3234,9 +2961,7 @@ class DeepSpeedEngine(Module): # Save optimizer states. They are different across each exp parallel rank. optimizer_state = { - 'optimizer': - self.optimizer.state_dict() - if self.optimizer and not self.zero_optimization() else None + 'optimizer': self.optimizer.state_dict() if self.optimizer and not self.zero_optimization() else None } # TODO: why use BufferedWriter not the path file_path = self._get_optimizer_ckpt_name(save_dir, tag, expp_rank) @@ -3251,15 +2976,12 @@ class DeepSpeedEngine(Module): 'module': model_state_dict, 'lr_scheduler': - self.lr_scheduler.state_dict() - if self.lr_scheduler is not None else None, + self.lr_scheduler.state_dict() if self.lr_scheduler is not None else None, 'data_sampler': self.training_dataloader.data_sampler.state_dict() if - (self.training_dataloader is not None - and self.curriculum_learning_enabled()) else None, + (self.training_dataloader is not None and self.curriculum_learning_enabled()) else None, 'random_ltd': - self.random_ltd_scheduler.state_dict() - if self.random_ltd_enabled() else None, + self.random_ltd_scheduler.state_dict() if self.random_ltd_enabled() else None, 'sparse_tensor_module_names': self.sparse_tensor_module_names, 'skipped_steps': @@ -3281,8 +3003,7 @@ class DeepSpeedEngine(Module): self._curr_save_path = None def _create_checkpoint_file(self, save_dir, tag, zero_checkpoint): - name_function = (self._get_zero_ckpt_name - if zero_checkpoint else self._get_ckpt_name) + name_function = (self._get_zero_ckpt_name if zero_checkpoint else self._get_ckpt_name) try: checkpoint_name = name_function(save_dir, tag) path = os.path.dirname(checkpoint_name) @@ -3320,17 +3041,12 @@ class DeepSpeedEngine(Module): state = dict(module=module, buffer_names=self._get_buffer_names(), - optimizer=self.optimizer.state_dict() - if self.optimizer and not zero_optimizer_state else None, - param_shapes=self._get_zero_param_shapes() - if self.optimizer and zero_optimizer_state else None, - lr_scheduler=self.lr_scheduler.state_dict() - if self.lr_scheduler is not None else None, + optimizer=self.optimizer.state_dict() if self.optimizer and not zero_optimizer_state else None, + param_shapes=self._get_zero_param_shapes() if self.optimizer and zero_optimizer_state else None, + lr_scheduler=self.lr_scheduler.state_dict() if self.lr_scheduler is not None else None, data_sampler=self.training_dataloader.data_sampler.state_dict() if - (self.training_dataloader is not None - and self.curriculum_learning_enabled()) else None, - random_ltd=self.random_ltd_scheduler.state_dict() - if self.random_ltd_enabled() else None, + (self.training_dataloader is not None and self.curriculum_learning_enabled()) else None, 
+ random_ltd=self.random_ltd_scheduler.state_dict() if self.random_ltd_enabled() else None, sparse_tensor_module_names=self.sparse_tensor_module_names, skipped_steps=self.skipped_steps, global_steps=self.global_steps, @@ -3420,9 +3136,7 @@ class DeepSpeedEngine(Module): def _save_zero_checkpoint(self, save_path, tag): zero_checkpoint_name = self._get_zero_ckpt_name(save_path, tag) - zero_sd = dict(optimizer_state_dict=self.optimizer.state_dict(), - ds_config=self.config, - ds_version=version) + zero_sd = dict(optimizer_state_dict=self.optimizer.state_dict(), ds_config=self.config, ds_version=version) self.checkpoint_engine.save(zero_sd, zero_checkpoint_name) if self.global_rank == 0: @@ -3452,9 +3166,7 @@ class DeepSpeedEngine(Module): # gather one layer at a time to be memory-efficient # must use modifier_rank=0 to release GPU memory after each layer gathered #see_memory_usage("before GatheredParameters", force=True) - with deepspeed.zero.GatheredParameters(list( - module.parameters(recurse=False)), - modifier_rank=0): + with deepspeed.zero.GatheredParameters(list(module.parameters(recurse=False)), modifier_rank=0): if dist.get_rank() == 0: # handle params for name, param in module.named_parameters(recurse=False): @@ -3475,8 +3187,7 @@ class DeepSpeedEngine(Module): # now buffers - not sure if need to take care of potentially shared weights here for name, buf in module.named_buffers(recurse=False): - if (buf is not None - and name not in module._non_persistent_buffers_set): + if (buf is not None and name not in module._non_persistent_buffers_set): state_dict[prefix + name] = buf.detach().cpu() #see_memory_usage("after GatheredParameters", force=True) @@ -3529,8 +3240,7 @@ class DeepSpeedEngine(Module): else: # the model will be bogus if not consolidated so don't confuse the user by saving it logger.info( - f"Did not save the model {path} because `stage3_gather_16bit_weights_on_model_save` is False" - ) + f"Did not save the model {path} because `stage3_gather_16bit_weights_on_model_save` is False") return False else: state_dict = self.module.state_dict() diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py index 4f4b5cfa7..e9f7a38d8 100755 --- a/deepspeed/runtime/fp16/fused_optimizer.py +++ b/deepspeed/runtime/fp16/fused_optimizer.py @@ -23,6 +23,7 @@ class FP16_Optimizer(DeepSpeedOptimizer): For usage example please see, TODO: DeepSpeed V2 Tutorial """ + def __init__(self, init_optimizer, deepspeed=None, @@ -58,20 +59,15 @@ class FP16_Optimizer(DeepSpeedOptimizer): # push this group to list before modify self.fp16_groups.append(param_group['params']) # init fp16 weight buffer, flattened - self.fp16_groups_flat.append( - _flatten_dense_tensors([p.clone().detach() - for p in self.fp16_groups[i]])) + self.fp16_groups_flat.append(_flatten_dense_tensors([p.clone().detach() for p in self.fp16_groups[i]])) # set model fp16 weight to slices of flattened buffer - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) + updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], self.fp16_groups[i]) for p, q in zip(self.fp16_groups[i], updated_params): p.data = q.data # init master weight, flattened - self.fp32_groups_flat.append( - self.fp16_groups_flat[i].clone().float().detach()) + self.fp32_groups_flat.append(self.fp16_groups_flat[i].clone().float().detach()) # modify optimizer of have flat master weight - self.fp32_groups_flat[ - i].requires_grad = True # keep this in case internal optimizer uses it + 
self.fp32_groups_flat[i].requires_grad = True # keep this in case internal optimizer uses it param_group['params'] = [self.fp32_groups_flat[i]] # we may have a way of fusing dynamic scale. Do not support for now @@ -113,16 +109,13 @@ class FP16_Optimizer(DeepSpeedOptimizer): self.mpu = mpu self.overflow = False - self.overflow_checker = CheckOverflow(self.fp16_groups, - mpu=self.mpu, - deepspeed=deepspeed) + self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu, deepspeed=deepspeed) self.initialize_optimizer_states() def initialize_optimizer_states(self): for i, group in enumerate(self.fp16_groups): - self.fp32_groups_flat[i].grad = torch.zeros( - self.fp32_groups_flat[i].size(), - device=self.fp32_groups_flat[i].device) + self.fp32_groups_flat[i].grad = torch.zeros(self.fp32_groups_flat[i].size(), + device=self.fp32_groups_flat[i].device) self.optimizer.step() @@ -156,10 +149,7 @@ class FP16_Optimizer(DeepSpeedOptimizer): for i, group in enumerate(self.fp16_groups): grads_groups_flat.append( _flatten_dense_tensors([ - torch.zeros(p.size(), - dtype=p.dtype, - device=p.device) if p.grad is None else p.grad - for p in group + torch.zeros(p.size(), dtype=p.dtype, device=p.device) if p.grad is None else p.grad for p in group ])) norm_groups.append(get_weight_norm(grads_groups_flat[i], mpu=self.mpu)) @@ -169,17 +159,13 @@ class FP16_Optimizer(DeepSpeedOptimizer): if self.overflow: if self.verbose: - logger.info( - "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss " - "scale: {}, reducing to {}".format(prev_scale, - self.cur_scale)) + logger.info("[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss " + "scale: {}, reducing to {}".format(prev_scale, self.cur_scale)) return self.overflow scaled_grad_norm = get_global_norm(norm_list=norm_groups) - combined_scale = self.unscale_and_clip_grads(grads_groups_flat, - scaled_grad_norm, - apply_scale=False) + combined_scale = self.unscale_and_clip_grads(grads_groups_flat, scaled_grad_norm, apply_scale=False) # Stash unscaled gradient norm self._global_grad_norm = scaled_grad_norm / self.cur_scale @@ -191,8 +177,7 @@ class FP16_Optimizer(DeepSpeedOptimizer): grad_norms=norm_groups) # TODO: we probably don't need this? 
just to be safe for i in range(len(norm_groups)): - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) + updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], self.fp16_groups[i]) for p, q in zip(self.fp16_groups[i], updated_params): p.data = q.data return self.overflow @@ -222,9 +207,7 @@ class FP16_Optimizer(DeepSpeedOptimizer): def override_loss_scale(self, loss_scale): if loss_scale != self.external_loss_scale: - logger.info( - f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}' - ) + logger.info(f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}') self.custom_loss_scaler = True self.external_loss_scale = loss_scale @@ -273,10 +256,8 @@ class FP16_Optimizer(DeepSpeedOptimizer): grads_groups_flat.append( _flatten_dense_tensors([ - torch.zeros(p.size(), - dtype=data_type, - device=p.device) - if p.grad is None else p.grad.to(data_type) for p in group + torch.zeros(p.size(), dtype=data_type, device=p.device) if p.grad is None else p.grad.to(data_type) + for p in group ])) for p in group: @@ -313,8 +294,7 @@ class FP16_Optimizer(DeepSpeedOptimizer): self.start_timers([UPDATE_FP16]) for i in range(len(self.fp16_groups)): - updated_params = _unflatten_dense_tensors(self.fp32_groups_flat[i], - self.fp16_groups[i]) + updated_params = _unflatten_dense_tensors(self.fp32_groups_flat[i], self.fp16_groups[i]) for p, q in zip(self.fp16_groups[i], updated_params): p.data.copy_(q.data) @@ -334,9 +314,7 @@ class FP16_Optimizer(DeepSpeedOptimizer): else: pg = groups._get_data_parallel_group() scaled_norm = all_groups_norm * 1.0 / float(dist.get_world_size(group=pg)) - scaled_norm_tensor = torch.tensor(scaled_norm, - device=self.fp32_groups_flat[0].device, - dtype=torch.float) + scaled_norm_tensor = torch.tensor(scaled_norm, device=self.fp32_groups_flat[0].device, dtype=torch.float) dist.all_reduce(scaled_norm_tensor, group=pg) all_groups_norm = scaled_norm_tensor.item() #print(f"old = {all_groups_norm_old} and new = {all_groups_norm} at rank: {deepspeed.comm.get_rank()}") @@ -376,25 +354,19 @@ class FP16_Optimizer(DeepSpeedOptimizer): if self.dynamic_loss_scale: prev_scale = self.cur_scale if skip: - self.cur_scale = max(self.cur_scale / self.scale_factor, - self.min_loss_scale) + self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_loss_scale) self.last_overflow_iter = self.cur_iter if self.verbose: logger.info(f"\nGrad overflow on iteration {self.cur_iter}") - logger.info( - f"Reducing dynamic loss scale from {prev_scale} to {self.cur_scale}" - ) + logger.info(f"Reducing dynamic loss scale from {prev_scale} to {self.cur_scale}") else: # Ensure self.scale_window updates since last overflow stable_interval = (self.cur_iter - self.last_overflow_iter) - 1 if (stable_interval > 0) and (stable_interval % self.scale_window == 0): self.cur_scale *= self.scale_factor if self.verbose: - logger.info( - f"No Grad overflow for {self.scale_window} iterations") - logger.info( - f"Increasing dynamic loss scale from {prev_scale} to {self.cur_scale}" - ) + logger.info(f"No Grad overflow for {self.scale_window} iterations") + logger.info(f"Increasing dynamic loss scale from {prev_scale} to {self.cur_scale}") else: if skip: logger.info("Grad overflow on iteration: %s", self.cur_iter) diff --git a/deepspeed/runtime/fp16/loss_scaler.py b/deepspeed/runtime/fp16/loss_scaler.py index 376de7d89..de8d31ff3 100755 --- a/deepspeed/runtime/fp16/loss_scaler.py +++ b/deepspeed/runtime/fp16/loss_scaler.py @@ 
-37,6 +37,7 @@ class LossScalerBase: """LossScalarBase Base class for a loss scaler """ + def __init__(self, cur_scale): self.cur_scale = cur_scale self.dynamic = False @@ -67,6 +68,7 @@ class LossScaler(LossScalerBase): Args: scale (float, optional, default=1.0): The loss scale. """ + def __init__(self, scale=1): super(LossScaler, self).__init__(scale) @@ -104,6 +106,7 @@ class DynamicLossScaler(LossScalerBase): scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. """ + def __init__(self, init_scale=2**32, scale_factor=2., @@ -162,8 +165,7 @@ class DynamicLossScaler(LossScalerBase): if self.delayed_shift == 1 or self.cur_hysteresis == 1: if (self.cur_scale == self.min_scale) and self.raise_error_at_min_scale: raise Exception( - "Current loss scale already at minimum - cannot decrease scale anymore. Exiting run." - ) + "Current loss scale already at minimum - cannot decrease scale anymore. Exiting run.") else: next_scale = max(self.cur_scale / self.scale_factor, self.min_scale) if dist.get_rank() == 0: diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index 5eb22fb64..71fce13c8 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -39,14 +39,14 @@ class OnebitAdam(torch.optim.Optimizer): .. _On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ """ + def __init__(self, params, deepspeed=None, lr=1e-3, freeze_step=100000, bias_correction=True, - betas=(0.9, - 0.999), + betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt=False, weight_decay=0., @@ -92,8 +92,7 @@ class OnebitAdam(torch.optim.Optimizer): assert TORCH_MAJOR >= 1 and TORCH_MINOR >= 8, "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" assert dist.is_initialized() == True, "Please initialize the torch distributed backend." 
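A loosely sketched version of the dynamic loss-scale schedule described in the DynamicLossScaler docstring and the fused optimizer's scale update above, not the library implementation: cut the scale by scale_factor on overflow (never below the minimum) and raise it by scale_factor after scale_window consecutive overflow-free iterations.

class TinyDynamicScaler:
    def __init__(self, init_scale=2.0**16, scale_factor=2.0, scale_window=1000, min_scale=1.0):
        self.cur_scale, self.scale_factor = init_scale, scale_factor
        self.scale_window, self.min_scale = scale_window, min_scale
        self.good_steps = 0  # iterations since the last overflow

    def update(self, overflow: bool) -> float:
        if overflow:
            self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_scale)
            self.good_steps = 0
        else:
            self.good_steps += 1
            if self.good_steps % self.scale_window == 0:
                self.cur_scale *= self.scale_factor
        return self.cur_scale

s = TinyDynamicScaler(init_scale=8.0, scale_window=2)
assert s.update(overflow=True) == 4.0                       # scale drops after an overflow
assert s.update(False) == 4.0 and s.update(False) == 8.0    # and recovers after scale_window clean steps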
from deepspeed.runtime.comm.nccl import NcclBackend - self.using_pipeline = hasattr(self.deepspeed, - 'pipeline_enable_backward_allreduce') + self.using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') self.comm_backend_handle = NcclBackend(self.deepspeed.mpu) elif self.comm_backend_name == 'mpi': @@ -164,22 +163,17 @@ class OnebitAdam(torch.optim.Optimizer): # Exponential moving average of squared gradient values state['exp_avg_sq'] = torch.zeros_like(p.data) - if not self.initialize or (self.adam_freeze_key - and 'worker_error' not in state.keys()): + if not self.initialize or (self.adam_freeze_key and 'worker_error' not in state.keys()): state['tensor_size'] = torch.numel(p.data) state['corrected_tensor_size'] = state['tensor_size'] if state['tensor_size'] % (self.size * self.divider) != 0: - state['corrected_tensor_size'] += ((self.size * self.divider) - - (state['tensor_size'] % - (self.size * self.divider))) - state['server_chunk_size'] = state[ - 'corrected_tensor_size'] // self.size + state['corrected_tensor_size'] += ((self.size * self.divider) - (state['tensor_size'] % + (self.size * self.divider))) + state['server_chunk_size'] = state['corrected_tensor_size'] // self.size get_accelerator().empty_cache() - state['worker_error'] = torch.zeros(state['corrected_tensor_size'], - device=p.device) - state['server_error'] = torch.zeros(state['server_chunk_size'], - device=p.device) + state['worker_error'] = torch.zeros(state['corrected_tensor_size'], device=p.device) + state['server_error'] = torch.zeros(state['server_chunk_size'], device=p.device) get_accelerator().empty_cache() self.adam_freeze_key = True if not self.initialize and dist.get_rank() == 0: @@ -211,11 +205,9 @@ class OnebitAdam(torch.optim.Optimizer): if self.size > 1: exp_avg.set_( - self.comm_backend_handle.compressed_allreduce( - exp_avg, - state['worker_error'], - state['server_error'], - self.deepspeed.local_rank)) + self.comm_backend_handle.compressed_allreduce(exp_avg, state['worker_error'], + state['server_error'], + self.deepspeed.local_rank)) # Because 1-bit compression cannot represent exact zero, it is required to # provide a momentum mask for those params that have constant exact zeros in their # momentums, otherwise the compression error would keep accumulating. @@ -225,8 +217,7 @@ class OnebitAdam(torch.optim.Optimizer): # (See example in DeepSpeedExamples/bing_bert/deepspeed_train.py.) if 'exp_avg_mask' in group: if exp_avg.device != group['exp_avg_mask'].device: - group['exp_avg_mask'] = group['exp_avg_mask'].to( - device=exp_avg.device) + group['exp_avg_mask'] = group['exp_avg_mask'].to(device=exp_avg.device) exp_avg.mul_(group['exp_avg_mask']) if self.initialize: @@ -272,8 +263,7 @@ class OnebitAdam(torch.optim.Optimizer): for i, group in enumerate(self.param_groups): if 'exp_avg_mask' in group: state_dict['param_groups'][i]['exp_avg_mask'] = group['exp_avg_mask'] - elif 'exp_avg_mask' not in group and 'exp_avg_mask' in state_dict[ - 'param_groups'][i]: + elif 'exp_avg_mask' not in group and 'exp_avg_mask' in state_dict['param_groups'][i]: state_dict['param_groups'][i].pop('exp_avg_mask') super().load_state_dict(state_dict) if self.state[self.param_groups[0]['params'][0]]['step'] < self.freeze_step: @@ -287,9 +277,7 @@ class OnebitAdam(torch.optim.Optimizer): self.deepspeed.enable_backward_allreduce = True else: if dist.get_rank() == 0: - print( - "Checkpoint loaded and OnebitAdam compression stage starts/continues." 
- ) + print("Checkpoint loaded and OnebitAdam compression stage starts/continues.") if self.adam_freeze_key is False: self.adam_freeze_key = True if self.using_pipeline: diff --git a/deepspeed/runtime/fp16/onebit/lamb.py b/deepspeed/runtime/fp16/onebit/lamb.py index 87c24695e..9267d5223 100644 --- a/deepspeed/runtime/fp16/onebit/lamb.py +++ b/deepspeed/runtime/fp16/onebit/lamb.py @@ -54,14 +54,14 @@ class OnebitLamb(torch.optim.Optimizer): .. _On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ """ + def __init__(self, params, deepspeed=None, lr=1e-3, freeze_step=100000, bias_correction=True, - betas=(0.9, - 0.999), + betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt=False, weight_decay=0., @@ -114,8 +114,7 @@ class OnebitLamb(torch.optim.Optimizer): assert TORCH_MAJOR >= 1 and TORCH_MINOR >= 8, "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" assert dist.is_initialized() == True, "Please initialize the torch distributed backend." from deepspeed.runtime.comm.nccl import NcclBackend - self.using_pipeline = hasattr(self.deepspeed, - 'pipeline_enable_backward_allreduce') + self.using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') self.comm_backend_handle = NcclBackend(self.deepspeed.mpu) elif self.comm_backend_name == 'mpi': @@ -165,24 +164,20 @@ class OnebitLamb(torch.optim.Optimizer): if self.lamb_freeze_key: exp_avg_last_step = [] for group in self.param_groups: - exp_avg_last_step.append( - [self.state[p]['exp_avg'].detach().clone() for p in group['params']]) + exp_avg_last_step.append([self.state[p]['exp_avg'].detach().clone() for p in group['params']]) if 'scaling_coeff' not in self.state[self.param_groups[0]['params'][0]]: # Compute the scaling_coeff for each momentum at the end of warmup stage. # This is used to reduce compression error during compression stage. 
momentum_scales = [] for group in self.param_groups: momentum_scales.append([ - (torch.norm(self.state[p]['exp_avg']) / - np.sqrt(torch.numel(self.state[p]['exp_avg']))).item() + (torch.norm(self.state[p]['exp_avg']) / np.sqrt(torch.numel(self.state[p]['exp_avg']))).item() for p in group['params'] ]) - united_scale = sum([sum(x) for x in momentum_scales]) / sum( - [len(x) for x in momentum_scales]) + united_scale = sum([sum(x) for x in momentum_scales]) / sum([len(x) for x in momentum_scales]) for i, group in enumerate(self.param_groups): for j, p in enumerate(group['params']): - self.state[p][ - 'scaling_coeff'] = united_scale / momentum_scales[i][j] + self.state[p]['scaling_coeff'] = united_scale / momentum_scales[i][j] for group, grads_this_group in zip(self.param_groups, grads_group): if grads_this_group is None: @@ -201,8 +196,7 @@ class OnebitLamb(torch.optim.Optimizer): state = self.state[p] # State initialization - if len(state) == 0 or (len(state) == 1 - and 'scaling_coeff' in state.keys()): + if len(state) == 0 or (len(state) == 1 and 'scaling_coeff' in state.keys()): state['step'] = 0 state['lamb_coeff_freeze'] = 0.0 state['last_factor'] = 1.0 @@ -215,7 +209,8 @@ class OnebitLamb(torch.optim.Optimizer): if not self.initialize: self.lamb_freeze_key = True - exp_avg, exp_avg_sq, exp_avg_sq_fresh = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_sq_fresh'] + exp_avg, exp_avg_sq, exp_avg_sq_fresh = state['exp_avg'], state['exp_avg_sq'], state[ + 'exp_avg_sq_fresh'] beta1, beta2 = group['betas'] max_coeff = group['max_coeff'] min_coeff = group['min_coeff'] @@ -243,8 +238,8 @@ class OnebitLamb(torch.optim.Optimizer): if lamb_coeff < min_coeff: lamb_coeff = min_coeff if lamb_coeff != 1.0: - state['lamb_coeff_freeze'] = self.coeff_beta * state[ - 'lamb_coeff_freeze'] + (1 - self.coeff_beta) * lamb_coeff + state['lamb_coeff_freeze'] = self.coeff_beta * state['lamb_coeff_freeze'] + ( + 1 - self.coeff_beta) * lamb_coeff self.lamb_coeffs.append(lamb_coeff) with torch.no_grad(): p.add_(-group['lr'] * lamb_coeff * update) @@ -266,20 +261,15 @@ class OnebitLamb(torch.optim.Optimizer): tensor_size += torch.numel(p.data) corrected_tensor_size = tensor_size if tensor_size % (self.size * self.divider) != 0: - difference = ((self.size * self.divider) - (tensor_size % - (self.size * self.divider))) + difference = ((self.size * self.divider) - (tensor_size % (self.size * self.divider))) corrected_tensor_size += difference - self.dummy_exp_avg[0] = torch.zeros( - difference, - device=momentum_groups[0].data.device) + self.dummy_exp_avg[0] = torch.zeros(difference, device=momentum_groups[0].data.device) momentum_groups.append(self.dummy_exp_avg[0]) self.corrected_tensor_sizes.append(corrected_tensor_size) self.server_chunk_sizes.append(corrected_tensor_size // self.size) - self.exp_avg_flat.append( - _flatten_dense_tensors([p.detach().clone() for p in momentum_groups])) - updated_params = _unflatten_dense_tensors(self.exp_avg_flat[0], - momentum_groups) + self.exp_avg_flat.append(_flatten_dense_tensors([p.detach().clone() for p in momentum_groups])) + updated_params = _unflatten_dense_tensors(self.exp_avg_flat[0], momentum_groups) for p, q in zip(momentum_groups, updated_params): p.data = q.data @@ -287,11 +277,8 @@ class OnebitLamb(torch.optim.Optimizer): get_accelerator().empty_cache() for i in range(len(self.exp_avg_flat)): self.worker_errors.append( - torch.zeros(self.corrected_tensor_sizes[i], - device=self.exp_avg_flat[i].device)) - self.server_errors.append( - 
torch.zeros(self.server_chunk_sizes[i], - device=self.exp_avg_flat[i].device)) + torch.zeros(self.corrected_tensor_sizes[i], device=self.exp_avg_flat[i].device)) + self.server_errors.append(torch.zeros(self.server_chunk_sizes[i], device=self.exp_avg_flat[i].device)) get_accelerator().empty_cache() if self.lamb_freeze_key: @@ -300,31 +287,23 @@ class OnebitLamb(torch.optim.Optimizer): if not self.initialize: get_accelerator().empty_cache() self.worker_errors.append( - torch.zeros(self.corrected_tensor_sizes[i], - device=self.exp_avg_flat[i].device)) + torch.zeros(self.corrected_tensor_sizes[i], device=self.exp_avg_flat[i].device)) self.server_errors.append( - torch.zeros(self.server_chunk_sizes[i], - device=self.exp_avg_flat[i].device)) + torch.zeros(self.server_chunk_sizes[i], device=self.exp_avg_flat[i].device)) get_accelerator().empty_cache() if dist.get_rank() == 0: print("Cupy Buffers Initialized Successfully.") - self.comm_backend_handle.compressed_allreduce( - self.exp_avg_flat[i], - self.worker_errors[0], - self.server_errors[0], - self.deepspeed.local_rank) + self.comm_backend_handle.compressed_allreduce(self.exp_avg_flat[i], self.worker_errors[0], + self.server_errors[0], self.deepspeed.local_rank) if dist.get_rank() == 0: print('Pop out errors', flush=True) del self.worker_errors[:] del self.server_errors[:] else: - self.comm_backend_handle.compressed_allreduce( - self.exp_avg_flat[i], - self.worker_errors[i], - self.server_errors[i], - self.deepspeed.local_rank) + self.comm_backend_handle.compressed_allreduce(self.exp_avg_flat[i], self.worker_errors[i], + self.server_errors[i], self.deepspeed.local_rank) if self.lamb_freeze_key and self.initialize: for i, group in enumerate(self.param_groups): @@ -332,7 +311,8 @@ class OnebitLamb(torch.optim.Optimizer): for j, p in enumerate(group['params']): state = self.state[p] - exp_avg, exp_avg_sq, exp_avg_sq_fresh = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_sq_fresh'] + exp_avg, exp_avg_sq, exp_avg_sq_fresh = state['exp_avg'], state['exp_avg_sq'], state[ + 'exp_avg_sq_fresh'] beta1, beta2 = group['betas'] exp_avg.div_(self.state[p]['scaling_coeff']) # Because 1-bit compression cannot represent exact zero, it is required to @@ -345,15 +325,11 @@ class OnebitLamb(torch.optim.Optimizer): # to add this exp_avg_mask for BERT pre-training.) 
if 'exp_avg_mask' in group: if exp_avg.device != group['exp_avg_mask'].device: - group['exp_avg_mask'] = group['exp_avg_mask'].to( - device=exp_avg.device) + group['exp_avg_mask'] = group['exp_avg_mask'].to(device=exp_avg.device) exp_avg.mul_(group['exp_avg_mask']) - grad_reconstruct = ((exp_avg - exp_avg_last_step[i][j] * beta1) / - (1 - beta1)) - exp_avg_sq_fresh.mul_(beta2).addcmul_(1 - beta2, - grad_reconstruct, - grad_reconstruct) + grad_reconstruct = ((exp_avg - exp_avg_last_step[i][j] * beta1) / (1 - beta1)) + exp_avg_sq_fresh.mul_(beta2).addcmul_(1 - beta2, grad_reconstruct, grad_reconstruct) denom = exp_avg_sq.sqrt() + group['eps'] update_prelim = exp_avg / denom @@ -367,9 +343,7 @@ class OnebitLamb(torch.optim.Optimizer): denom_real = exp_avg_sq_fresh.sqrt() + group['eps'] factor = (denom / denom_real).max().item() if group['weight_decay'] > 0.0: - update_ratio = min(1.0, - (update_prelim.pow(2).sum().sqrt() / - update_norm).item()) + update_ratio = min(1.0, (update_prelim.pow(2).sum().sqrt() / update_norm).item()) factor = factor * update_ratio + (1.0 - update_ratio) if factor > self.factor_max: factor = self.factor_max @@ -416,8 +390,7 @@ class OnebitLamb(torch.optim.Optimizer): for i, group in enumerate(self.param_groups): if 'exp_avg_mask' in group: state_dict['param_groups'][i]['exp_avg_mask'] = group['exp_avg_mask'] - elif 'exp_avg_mask' not in group and 'exp_avg_mask' in state_dict[ - 'param_groups'][i]: + elif 'exp_avg_mask' not in group and 'exp_avg_mask' in state_dict['param_groups'][i]: state_dict['param_groups'][i].pop('exp_avg_mask') super().load_state_dict(state_dict) # need to reset the fused momentum since loading states will break the linking @@ -442,9 +415,7 @@ class OnebitLamb(torch.optim.Optimizer): self.state[p].pop('scaling_coeff') else: if dist.get_rank() == 0: - print( - "Checkpoint loaded and OnebitLamb compression stage starts/continues." - ) + print("Checkpoint loaded and OnebitLamb compression stage starts/continues.") if self.lamb_freeze_key is False: self.lamb_freeze_key = True if self.using_pipeline: diff --git a/deepspeed/runtime/fp16/onebit/zoadam.py b/deepspeed/runtime/fp16/onebit/zoadam.py index f86ae86f3..322fa3e9b 100644 --- a/deepspeed/runtime/fp16/onebit/zoadam.py +++ b/deepspeed/runtime/fp16/onebit/zoadam.py @@ -49,13 +49,13 @@ class ZeroOneAdam(torch.optim.Optimizer): .. _On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ """ + def __init__(self, params, deepspeed=None, lr=1e-3, bias_correction=True, - betas=(0.9, - 0.999), + betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt=False, weight_decay=0., @@ -105,8 +105,7 @@ class ZeroOneAdam(torch.optim.Optimizer): assert TORCH_MAJOR >= 1 and TORCH_MINOR >= 8, "Please use torch 1.8 or greater to enable NCCL backend in 0/1 Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" assert dist.is_initialized() == True, "Please initialize the torch distributed backend." 
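A short note on the grad_reconstruct line in the OnebitLamb hunk above: it simply inverts the exponential moving average of the momentum. Since exp_avg_new = beta1 * exp_avg_old + (1 - beta1) * grad, the gradient is recovered as (exp_avg_new - beta1 * exp_avg_old) / (1 - beta1) and then feeds the fresh second-moment estimate. A tiny self-contained check, illustrative only:

import torch

# Verify the momentum inversion used by grad_reconstruct above.
beta1 = 0.9
grad = torch.randn(4)
exp_avg_old = torch.randn(4)

exp_avg_new = beta1 * exp_avg_old + (1 - beta1) * grad
grad_reconstruct = (exp_avg_new - beta1 * exp_avg_old) / (1 - beta1)

assert torch.allclose(grad_reconstruct, grad, atol=1e-6)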
from deepspeed.runtime.comm.nccl import NcclBackend - self.using_pipeline = hasattr(self.deepspeed, - 'pipeline_enable_backward_allreduce') + self.using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') self.comm_backend_handle = NcclBackend(self.deepspeed.mpu) elif self.comm_backend_name == 'mpi': @@ -181,16 +180,12 @@ class ZeroOneAdam(torch.optim.Optimizer): state['corrected_tensor_size'] = state['tensor_size'] if state['tensor_size'] % (self.size * self.divider) != 0: - state['corrected_tensor_size'] += ((self.size * self.divider) - - (state['tensor_size'] % - (self.size * self.divider))) - state['server_chunk_size'] = state[ - 'corrected_tensor_size'] // self.size + state['corrected_tensor_size'] += ((self.size * self.divider) - (state['tensor_size'] % + (self.size * self.divider))) + state['server_chunk_size'] = state['corrected_tensor_size'] // self.size get_accelerator().empty_cache() - state['worker_error'] = torch.zeros(state['corrected_tensor_size'], - device=p.device) - state['server_error'] = torch.zeros(state['server_chunk_size'], - device=p.device) + state['worker_error'] = torch.zeros(state['corrected_tensor_size'], device=p.device) + state['server_error'] = torch.zeros(state['server_chunk_size'], device=p.device) # Accumulation of momentum, i.e., the u variable in the 0/1 Adam paper state['momentum_accumulator'] = torch.zeros_like(p.data) get_accelerator().empty_cache() @@ -213,16 +208,10 @@ class ZeroOneAdam(torch.optim.Optimizer): if self.size > 1: with torch.no_grad(): grad_onebit = self.comm_backend_handle.compressed_allreduce( - grad, - state['worker_error'], - state['server_error'], - self.deepspeed.local_rank) + grad, state['worker_error'], state['server_error'], self.deepspeed.local_rank) if 'exp_avg_mask' in group: - if grad_onebit.device != group[ - 'exp_avg_mask'].device: - group['exp_avg_mask'] = group[ - 'exp_avg_mask'].to( - device=grad_onebit.device) + if grad_onebit.device != group['exp_avg_mask'].device: + group['exp_avg_mask'] = group['exp_avg_mask'].to(device=grad_onebit.device) grad_onebit.mul_(group['exp_avg_mask']) exp_avg.mul_(beta1).add_(1 - beta1, grad_onebit) else: @@ -233,15 +222,12 @@ class ZeroOneAdam(torch.optim.Optimizer): if not self.initialize: if self.size > 1: comm_buffer.set_( - self.comm_backend_handle.compressed_allreduce( - comm_buffer, - state['worker_error'], - state['server_error'], - self.deepspeed.local_rank)) + self.comm_backend_handle.compressed_allreduce(comm_buffer, state['worker_error'], + state['server_error'], + self.deepspeed.local_rank)) if 'exp_avg_mask' in group: if comm_buffer.device != group['exp_avg_mask'].device: - group['exp_avg_mask'] = group['exp_avg_mask'].to( - device=comm_buffer.device) + group['exp_avg_mask'] = group['exp_avg_mask'].to(device=comm_buffer.device) comm_buffer.mul_(group['exp_avg_mask']) if self.initialize: @@ -252,22 +238,18 @@ class ZeroOneAdam(torch.optim.Optimizer): p.data.add_(-group['lr'] * update) if self.freeze_key is True: comm_buffer.add_(-group['lr'] * update) - if state['step'] % state[ - 'local_step_interval'] == 0 and self.freeze_key: + if state['step'] % state['local_step_interval'] == 0 and self.freeze_key: with torch.no_grad(): p.data.add_(-1 * comm_buffer) comm_buffer.mul_(exp_avg_sq.sqrt() + group['eps']) if self.size > 1: comm_buffer.copy_( - self.comm_backend_handle.compressed_allreduce( - comm_buffer, - state['worker_error'], - state['server_error'], - self.deepspeed.local_rank)) + self.comm_backend_handle.compressed_allreduce(comm_buffer, 
state['worker_error'], + state['server_error'], + self.deepspeed.local_rank)) if 'exp_avg_mask' in group: if comm_buffer.device != group['exp_avg_mask'].device: - group['exp_avg_mask'] = group['exp_avg_mask'].to( - device=comm_buffer.device) + group['exp_avg_mask'] = group['exp_avg_mask'].to(device=comm_buffer.device) comm_buffer.mul_(group['exp_avg_mask']) exp_avg.zero_().add_(comm_buffer / state['lrs'], alpha=-1) p.data.add_(comm_buffer / (exp_avg_sq.sqrt() + group['eps'])) @@ -298,9 +280,8 @@ class ZeroOneAdam(torch.optim.Optimizer): state['local_step_counter'] += 1 if state['local_step_counter'] == self.local_step_scaler: state['local_step_counter'] = 0 - state['local_step_interval'] = min( - self.local_step_clipper, - state['local_step_interval'] * 2) + state['local_step_interval'] = min(self.local_step_clipper, + state['local_step_interval'] * 2) if not self.initialize: print('Pop out errors', flush=True) @@ -343,14 +324,13 @@ class ZeroOneAdam(torch.optim.Optimizer): for i, group in enumerate(self.param_groups): if 'exp_avg_mask' in group: state_dict['param_groups'][i]['exp_avg_mask'] = group['exp_avg_mask'] - elif 'exp_avg_mask' not in group and 'exp_avg_mask' in state_dict[ - 'param_groups'][i]: + elif 'exp_avg_mask' not in group and 'exp_avg_mask' in state_dict['param_groups'][i]: state_dict['param_groups'][i].pop('exp_avg_mask') super().load_state_dict(state_dict) if self.state[self.param_groups[0]['params'][0]]['step'] < self.var_freeze_step: self.var_freeze_key = False - if (self.state[self.param_groups[0]['params'][0]]['step'] + 1 - ) % self.state[self.param_groups[0]['params'][0]]['var_interval'] == 0: + if (self.state[self.param_groups[0]['params'][0]]['step'] + + 1) % self.state[self.param_groups[0]['params'][0]]['var_interval'] == 0: if self.using_pipeline: self.deepspeed.pipeline_enable_backward_allreduce = True else: diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py index e0249f15a..c5f7eb420 100755 --- a/deepspeed/runtime/fp16/unfused_optimizer.py +++ b/deepspeed/runtime/fp16/unfused_optimizer.py @@ -24,6 +24,7 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer): For usage example please see, TODO: DeepSpeed V2 Tutorial """ + def __init__(self, init_optimizer, deepspeed=None, @@ -105,9 +106,7 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer): self.mpu = mpu self.overflow = False - self.overflow_checker = CheckOverflow(self.fp16_groups, - mpu=self.mpu, - deepspeed=deepspeed) + self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu, deepspeed=deepspeed) self.initialize_optimizer_states() @@ -137,45 +136,33 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer): expert_norm_groups = [] for i, group in enumerate(self.fp16_groups): grads = [ - torch.zeros(p.size(), - dtype=p.dtype, - device=p.device) if p.grad is None else p.grad for p in group + torch.zeros(p.size(), dtype=p.dtype, device=p.device) if p.grad is None else p.grad for p in group ] grads_groups.append(grads) grads_groups_flat.append(_flatten_dense_tensors(grads)) grads_for_norm, expert_grads_for_norm = split_params_grads_into_shared_and_expert_params(group) norm_group_value = 0.0 if len(grads_for_norm) > 0: - norm_group_value = get_weight_norm( - _flatten_dense_tensors(grads_for_norm), - mpu=self.mpu) + norm_group_value = get_weight_norm(_flatten_dense_tensors(grads_for_norm), mpu=self.mpu) norm_groups.append(norm_group_value) expert_norm_group_value = 0.0 if len(expert_grads_for_norm) > 0: - expert_norm_group_value = get_weight_norm( - 
_flatten_dense_tensors(expert_grads_for_norm), - mpu=self.mpu) + expert_norm_group_value = get_weight_norm(_flatten_dense_tensors(expert_grads_for_norm), mpu=self.mpu) expert_norm_groups.append(expert_norm_group_value) - self.overflow = self.overflow_checker.check_using_norm(norm_groups + - expert_norm_groups) + self.overflow = self.overflow_checker.check_using_norm(norm_groups + expert_norm_groups) prev_scale = self.cur_scale self._update_scale(self.overflow) if self.overflow: if self.verbose: - logger.info( - "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss " - "scale: {}, reducing to {}".format(prev_scale, - self.cur_scale)) + logger.info("[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss " + "scale: {}, reducing to {}".format(prev_scale, self.cur_scale)) return self.overflow self._global_grad_norm = get_global_norm(norm_list=norm_groups) - combined_scale = self.unscale_and_clip_grads(self._global_grad_norm, - apply_scale=False) - self.optimizer.step(grads=grads_groups, - output_params=self.fp16_groups, - scale=combined_scale) + combined_scale = self.unscale_and_clip_grads(self._global_grad_norm, apply_scale=False) + self.optimizer.step(grads=grads_groups, output_params=self.fp16_groups, scale=combined_scale) for fp32_group, fp16_group in zip(self.fp32_groups, self.fp16_groups): for idx, (fp32_param, fp16_param) in enumerate(zip(fp32_group, fp16_group)): @@ -199,9 +186,7 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer): def override_loss_scale(self, loss_scale): if loss_scale != self.external_loss_scale: - logger.info( - f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}' - ) + logger.info(f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}') self.custom_loss_scaler = True self.external_loss_scale = loss_scale @@ -219,10 +204,8 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer): self._update_scale(self.overflow) if self.overflow: if self.verbose: - logger.info( - "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss " - "scale: {}, reducing to {}".format(prev_scale, - self.cur_scale)) + logger.info("[deepspeed] fp16 dynamic loss scale overflow! Skipping step. 
Attempted loss " + "scale: {}, reducing to {}".format(prev_scale, self.cur_scale)) return self.overflow norm_groups = [] @@ -236,9 +219,7 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer): # copying gradients to fp32 to wor k with fp32 parameters for fp32_param, fp16_param in zip(self.fp32_groups[i], self.fp16_groups[i]): if fp16_param.grad is None: - fp32_param.grad = torch.zeros(fp16_param.size(), - dtype=fp32_param.dtype, - device=fp32_param.device) + fp32_param.grad = torch.zeros(fp16_param.size(), dtype=fp32_param.dtype, device=fp32_param.device) else: fp32_param.grad = fp16_param.grad.to(fp32_param.dtype) @@ -294,25 +275,19 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer): if self.dynamic_loss_scale: prev_scale = self.cur_scale if skip: - self.cur_scale = max(self.cur_scale / self.scale_factor, - self.min_loss_scale) + self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_loss_scale) self.last_overflow_iter = self.cur_iter if self.verbose: logger.info("Grad overflow on iteration: %s", self.cur_iter) - logger.info( - f"Reducing dynamic loss scale from {prev_scale} to {self.cur_scale}" - ) + logger.info(f"Reducing dynamic loss scale from {prev_scale} to {self.cur_scale}") else: # Ensure self.scale_window updates since last overflow stable_interval = (self.cur_iter - self.last_overflow_iter) - 1 if (stable_interval > 0) and (stable_interval % self.scale_window == 0): self.cur_scale *= self.scale_factor if self.verbose: - logger.info( - f"No Grad overflow for {self.scale_window} iterations") - logger.info( - f"Increasing dynamic loss scale from {prev_scale} to {self.cur_scale}" - ) + logger.info(f"No Grad overflow for {self.scale_window} iterations") + logger.info(f"Increasing dynamic loss scale from {prev_scale} to {self.cur_scale}") else: if skip: logger.info("Grad overflow on iteration %s", self.cur_iter) diff --git a/deepspeed/runtime/lr_schedules.py b/deepspeed/runtime/lr_schedules.py index faf5e6fee..816c0226d 100755 --- a/deepspeed/runtime/lr_schedules.py +++ b/deepspeed/runtime/lr_schedules.py @@ -53,28 +53,15 @@ TOTAL_NUM_STEPS = 'total_num_steps' def add_tuning_arguments(parser): - group = parser.add_argument_group('Convergence Tuning', - 'Convergence tuning configurations') + group = parser.add_argument_group('Convergence Tuning', 'Convergence tuning configurations') # LR scheduler - group.add_argument('--lr_schedule', - type=str, - default=None, - help='LR schedule for training.') + group.add_argument('--lr_schedule', type=str, default=None, help='LR schedule for training.') # Learning rate range test - group.add_argument("--lr_range_test_min_lr", - type=float, - default=0.001, - help='Starting lr value.') - group.add_argument("--lr_range_test_step_rate", - type=float, - default=1.0, - help='scaling rate for LR range test.') - group.add_argument("--lr_range_test_step_size", - type=int, - default=1000, - help='training steps per LR change.') + group.add_argument("--lr_range_test_min_lr", type=float, default=0.001, help='Starting lr value.') + group.add_argument("--lr_range_test_step_rate", type=float, default=1.0, help='scaling rate for LR range test.') + group.add_argument("--lr_range_test_step_size", type=int, default=1000, help='training steps per LR change.') group.add_argument("--lr_range_test_staircase", type=bool, default=False, @@ -89,66 +76,34 @@ def add_tuning_arguments(parser): type=int, default=-1, help='first stair count for 1Cycle schedule.') - group.add_argument( - "--cycle_second_step_size", - type=int, - default=-1, - help='size of second step 
of 1Cycle schedule (default first_step_size).') + group.add_argument("--cycle_second_step_size", + type=int, + default=-1, + help='size of second step of 1Cycle schedule (default first_step_size).') group.add_argument("--cycle_second_stair_count", type=int, default=-1, help='second stair count for 1Cycle schedule.') - group.add_argument( - "--decay_step_size", - type=int, - default=1000, - help='size of intervals for applying post cycle decay (training steps).') - - # 1Cycle LR - group.add_argument("--cycle_min_lr", - type=float, - default=0.01, - help='1Cycle LR lower bound.') - group.add_argument("--cycle_max_lr", - type=float, - default=0.1, - help='1Cycle LR upper bound.') - group.add_argument("--decay_lr_rate", - type=float, - default=0.0, - help='post cycle LR decay rate.') - - # 1Cycle Momentum - group.add_argument('--cycle_momentum', - default=False, - action='store_true', - help='Enable 1Cycle momentum schedule.') - group.add_argument("--cycle_min_mom", - type=float, - default=0.8, - help='1Cycle momentum lower bound.') - group.add_argument("--cycle_max_mom", - type=float, - default=0.9, - help='1Cycle momentum upper bound.') - group.add_argument("--decay_mom_rate", - type=float, - default=0.0, - help='post cycle momentum decay rate.') - - # Warmup LR - group.add_argument('--warmup_min_lr', - type=float, - default=0, - help='WarmupLR minimum/initial LR value') - group.add_argument('--warmup_max_lr', - type=float, - default=0.001, - help='WarmupLR maximum LR value.') - group.add_argument('--warmup_num_steps', + group.add_argument("--decay_step_size", type=int, default=1000, - help='WarmupLR step count for LR warmup.') + help='size of intervals for applying post cycle decay (training steps).') + + # 1Cycle LR + group.add_argument("--cycle_min_lr", type=float, default=0.01, help='1Cycle LR lower bound.') + group.add_argument("--cycle_max_lr", type=float, default=0.1, help='1Cycle LR upper bound.') + group.add_argument("--decay_lr_rate", type=float, default=0.0, help='post cycle LR decay rate.') + + # 1Cycle Momentum + group.add_argument('--cycle_momentum', default=False, action='store_true', help='Enable 1Cycle momentum schedule.') + group.add_argument("--cycle_min_mom", type=float, default=0.8, help='1Cycle momentum lower bound.') + group.add_argument("--cycle_max_mom", type=float, default=0.9, help='1Cycle momentum upper bound.') + group.add_argument("--decay_mom_rate", type=float, default=0.0, help='post cycle momentum decay rate.') + + # Warmup LR + group.add_argument('--warmup_min_lr', type=float, default=0, help='WarmupLR minimum/initial LR value') + group.add_argument('--warmup_max_lr', type=float, default=0.001, help='WarmupLR maximum LR value.') + group.add_argument('--warmup_num_steps', type=int, default=1000, help='WarmupLR step count for LR warmup.') group.add_argument('--warmup_type', type=str, default=WARMUP_LOG_RATE, @@ -168,16 +123,13 @@ def override_lr_range_test_params(args, params): if hasattr(args, LR_RANGE_TEST_MIN_LR) and args.lr_range_test_min_lr is not None: params[LR_RANGE_TEST_MIN_LR] = args.lr_range_test_min_lr - if hasattr(args, - LR_RANGE_TEST_STEP_RATE) and args.lr_range_test_step_rate is not None: + if hasattr(args, LR_RANGE_TEST_STEP_RATE) and args.lr_range_test_step_rate is not None: params[LR_RANGE_TEST_STEP_RATE] = args.lr_range_test_step_rate - if hasattr(args, - LR_RANGE_TEST_STEP_SIZE) and args.lr_range_test_step_size is not None: + if hasattr(args, LR_RANGE_TEST_STEP_SIZE) and args.lr_range_test_step_size is not None: 
params[LR_RANGE_TEST_STEP_SIZE] = args.lr_range_test_step_size - if hasattr(args, - LR_RANGE_TEST_STAIRCASE) and args.lr_range_test_staircase is not None: + if hasattr(args, LR_RANGE_TEST_STAIRCASE) and args.lr_range_test_staircase is not None: params[LR_RANGE_TEST_STAIRCASE] = args.lr_range_test_staircase @@ -185,15 +137,13 @@ def override_1cycle_params(args, params): if hasattr(args, CYCLE_FIRST_STEP_SIZE) and args.cycle_first_step_size is not None: params[CYCLE_FIRST_STEP_SIZE] = args.cycle_first_step_size - if hasattr(args, - CYCLE_FIRST_STAIR_COUNT) and args.cycle_first_stair_count is not None: + if hasattr(args, CYCLE_FIRST_STAIR_COUNT) and args.cycle_first_stair_count is not None: params[CYCLE_FIRST_STAIR_COUNT] = args.cycle_first_stair_count if hasattr(args, CYCLE_SECOND_STEP_SIZE) and args.cycle_second_step_size is not None: params[CYCLE_SECOND_STEP_SIZE] = args.cycle_second_step_size - if hasattr(args, - CYCLE_SECOND_STAIR_COUNT) and args.cycle_second_stair_count is not None: + if hasattr(args, CYCLE_SECOND_STAIR_COUNT) and args.cycle_second_stair_count is not None: params[CYCLE_SECOND_STAIR_COUNT] = args.cycle_second_stair_count if hasattr(args, DECAY_STEP_SIZE) and args.decay_step_size is not None: @@ -301,8 +251,7 @@ def get_torch_optimizer(optimizer): if hasattr(optimizer, 'optimizer') and isinstance(optimizer.optimizer, Optimizer): return optimizer.optimizer - raise TypeError('{} is not a subclass of torch.optim.Optimizer'.format( - type(optimizer).__name__)) + raise TypeError('{} is not a subclass of torch.optim.Optimizer'.format(type(optimizer).__name__)) class LRRangeTest(object): @@ -343,6 +292,7 @@ class LRRangeTest(object): _A disciplined approach to neural network hyper-parameters: Part 1 -- learning rate, batch size, momentum, and weight decay: https://arxiv.org/abs/1803.09820 """ + def __init__(self, optimizer: Optimizer, lr_range_test_min_lr: float = 1e-3, @@ -353,13 +303,10 @@ class LRRangeTest(object): self.optimizer = get_torch_optimizer(optimizer) - if isinstance(lr_range_test_min_lr, - list) or isinstance(lr_range_test_min_lr, - tuple): + if isinstance(lr_range_test_min_lr, list) or isinstance(lr_range_test_min_lr, tuple): if len(lr_range_test_min_lr) != len(self.optimizer.param_groups): - raise ValueError("expected {} lr_range_test_min_lr, got {}".format( - len(self.optimizer.param_groups), - len(lr_range_test_min_lr))) + raise ValueError("expected {} lr_range_test_min_lr, got {}".format(len(self.optimizer.param_groups), + len(lr_range_test_min_lr))) self.min_lr = list(lr_range_test_min_lr) else: self.min_lr = [lr_range_test_min_lr] * len(self.optimizer.param_groups) @@ -384,9 +331,7 @@ class LRRangeTest(object): def get_lr(self): lr_increase = self._get_increase() - return [ - lr_range_test_min_lr * lr_increase for lr_range_test_min_lr in self.min_lr - ] + return [lr_range_test_min_lr * lr_increase for lr_range_test_min_lr in self.min_lr] def get_last_lr(self): """ Return last computed learning rate by current scheduler. @@ -480,6 +425,7 @@ class OneCycle(object): .. 
_A disciplined approach to neural network hyper-parameters: Part 1 -- learning rate, batch size, momentum, and weight decay: https://arxiv.org/abs/1803.09820 """ + def __init__(self, optimizer, cycle_min_lr, @@ -499,26 +445,16 @@ class OneCycle(object): self.optimizer = get_torch_optimizer(optimizer) # Initialize cycle shape - self._initialize_cycle(cycle_first_step_size, - cycle_second_step_size, - cycle_first_stair_count, - cycle_second_stair_count, - decay_step_size) + self._initialize_cycle(cycle_first_step_size, cycle_second_step_size, cycle_first_stair_count, + cycle_second_stair_count, decay_step_size) # Initialize cycle lr - self._initialize_lr(self.optimizer, - cycle_min_lr, - cycle_max_lr, - decay_lr_rate, - last_batch_iteration) + self._initialize_lr(self.optimizer, cycle_min_lr, cycle_max_lr, decay_lr_rate, last_batch_iteration) # Initialize cyclic momentum self.cycle_momentum = cycle_momentum if cycle_momentum: - self._initialize_momentum(self.optimizer, - cycle_min_mom, - cycle_max_mom, - decay_mom_rate, + self._initialize_momentum(self.optimizer, cycle_min_mom, cycle_max_mom, decay_mom_rate, last_batch_iteration) # Initialize batch iteration tracker @@ -526,16 +462,11 @@ class OneCycle(object): # Configure cycle shape - def _initialize_cycle(self, - cycle_first_step_size, - cycle_second_step_size, - cycle_first_stair_count, - cycle_second_stair_count, - decay_step_size): + def _initialize_cycle(self, cycle_first_step_size, cycle_second_step_size, cycle_first_stair_count, + cycle_second_stair_count, decay_step_size): cycle_first_step_size = float(cycle_first_step_size) cycle_second_step_size = float( - cycle_second_step_size - ) if cycle_second_step_size is not None else cycle_first_step_size + cycle_second_step_size) if cycle_second_step_size is not None else cycle_first_step_size self.total_size = cycle_first_step_size + cycle_second_step_size self.step_ratio = cycle_first_step_size / self.total_size @@ -551,12 +482,7 @@ class OneCycle(object): self.skip_mom_decay = False # Configure lr schedule - def _initialize_lr(self, - optimizer, - cycle_min_lr, - cycle_max_lr, - decay_lr_rate, - last_batch_iteration): + def _initialize_lr(self, optimizer, cycle_min_lr, cycle_max_lr, decay_lr_rate, last_batch_iteration): self.min_lrs = [cycle_min_lr] * len(optimizer.param_groups) if last_batch_iteration == -1: for lr, group in zip(self.min_lrs, optimizer.param_groups): @@ -569,12 +495,7 @@ class OneCycle(object): self.skip_lr_decay = True # Configure momentum schedule - def _initialize_momentum(self, - optimizer, - cycle_min_mom, - cycle_max_mom, - decay_mom_rate, - last_batch_iteration): + def _initialize_momentum(self, optimizer, cycle_min_mom, cycle_max_mom, decay_mom_rate, last_batch_iteration): if 'betas' not in optimizer.defaults: optimizer_name = type(optimizer).__name__ logger.warn( @@ -722,6 +643,7 @@ class WarmupLR(object): >>> scheduler.step() """ + def __init__(self, optimizer: Optimizer, warmup_min_lr: float = 0.0, @@ -738,9 +660,8 @@ class WarmupLR(object): self.warmup_num_steps = max(2, warmup_num_steps) # Currently only support linear and log function if warmup_type not in {WARMUP_LOG_RATE, WARMUP_LINEAR_RATE}: - logger.warning( - f"Using unknown warmup_type: {warmup_type}. The increasing function " - f"is set to default (log)") + logger.warning(f"Using unknown warmup_type: {warmup_type}. 
The increasing function " + f"is set to default (log)") warmup_type = WARMUP_LOG_RATE self.warmup_type = warmup_type self.inverse_log_warm_up = 1.0 / math.log(self.warmup_num_steps) @@ -748,15 +669,10 @@ class WarmupLR(object): def get_lr(self): if self.last_batch_iteration < 0: - logger.warning( - "Attempting to get learning rate from scheduler before it has started") + logger.warning("Attempting to get learning rate from scheduler before it has started") return [0.0] gamma = self._get_gamma() - return [ - min_lr + (delta_lr * gamma) for min_lr, - delta_lr in zip(self.min_lrs, - self.delta_lrs) - ] + return [min_lr + (delta_lr * gamma) for min_lr, delta_lr in zip(self.min_lrs, self.delta_lrs)] def get_last_lr(self): """ Return last computed learning rate by current scheduler. @@ -789,10 +705,8 @@ class WarmupLR(object): def _format_param(self, optimizer, param_value, param_name): if isinstance(param_value, list) or isinstance(param_value, tuple): if len(param_value) != len(optimizer.param_groups): - raise ValueError("expected {} value for {}, got {}".format( - len(optimizer.param_groups), - param_name, - FileNotFoundError(param_value))) + raise ValueError("expected {} value for {}, got {}".format(len(optimizer.param_groups), param_name, + FileNotFoundError(param_value))) return list(param_value) return [param_value] * len(optimizer.param_groups) @@ -819,6 +733,7 @@ class WarmupDecayLR(WarmupLR): >>> scheduler.step() """ + def __init__(self, optimizer: Optimizer, total_num_steps: int, @@ -829,17 +744,11 @@ class WarmupDecayLR(WarmupLR): last_batch_iteration: int = -1): self.total_num_steps = total_num_steps - super(WarmupDecayLR, - self).__init__(optimizer, - warmup_min_lr, - warmup_max_lr, - warmup_num_steps, - warmup_type, - last_batch_iteration) + super(WarmupDecayLR, self).__init__(optimizer, warmup_min_lr, warmup_max_lr, warmup_num_steps, warmup_type, + last_batch_iteration) if self.total_num_steps < self.warmup_num_steps: logger.warning('total_num_steps {} is less than warmup_num_steps {}'.format( - total_num_steps, - warmup_num_steps)) + total_num_steps, warmup_num_steps)) def _get_gamma(self): if self.last_batch_iteration < self.warmup_num_steps: @@ -850,5 +759,4 @@ class WarmupDecayLR(WarmupLR): return max( 0.0, float(self.total_num_steps - self.last_batch_iteration) / - float(max(1.0, - self.total_num_steps - self.warmup_num_steps))) + float(max(1.0, self.total_num_steps - self.warmup_num_steps))) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index f4aa871d0..f7b882862 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -41,18 +41,8 @@ class PipelineEngine(DeepSpeedEngine): is provided. 
""" ID_TO_DTYPE = [ - torch.float32, - torch.float64, - torch.complex64, - torch.complex128, - torch.float16, - torch.bfloat16, - torch.uint8, - torch.int8, - torch.int16, - torch.int32, - torch.int64, - torch.bool + torch.float32, torch.float64, torch.complex64, torch.complex128, torch.float16, torch.bfloat16, torch.uint8, + torch.int8, torch.int16, torch.int32, torch.int64, torch.bool ] DTYPE_TO_ID = {dtype: id_ for id_, dtype in enumerate(ID_TO_DTYPE)} @@ -134,8 +124,7 @@ class PipelineEngine(DeepSpeedEngine): if self.global_rank != min(d['ranks']): tied_params += sum(p.numel() for p in d['module'].parameters()) unique_params -= tied_params - params_tensor = torch.LongTensor(data=[num_params, - unique_params]).to(self.device) + params_tensor = torch.LongTensor(data=[num_params, unique_params]).to(self.device) dist.all_reduce(params_tensor, group=self.grid.get_model_parallel_group()) params_tensor = params_tensor.tolist() total_params = params_tensor[0] @@ -156,10 +145,10 @@ class PipelineEngine(DeepSpeedEngine): # Pipeline buffers self.num_pipe_buffers = 0 self.pipe_buffers = { - 'inputs' : [], # batch input and received activations - 'labels' : [], # labels from batch input - 'outputs' : [], # activations - 'output_tensors' : [], # tensor object to preserve backward graph + 'inputs': [], # batch input and received activations + 'labels': [], # labels from batch input + 'outputs': [], # activations + 'output_tensors': [], # tensor object to preserve backward graph } self.pipe_recv_buf = None self.grad_layer = None @@ -178,8 +167,7 @@ class PipelineEngine(DeepSpeedEngine): self.dp_group_loss = torch.tensor(0.0, requires_grad=False).to(self.device) if self._config.pipeline['activation_checkpoint_interval'] > 0: - self.module.activation_checkpoint_interval = self._config.pipeline[ - 'activation_checkpoint_interval'] + self.module.activation_checkpoint_interval = self._config.pipeline['activation_checkpoint_interval'] self.module.checkpoint_parallel_write_pipeline = self._config.checkpoint_parallel_write_pipeline @@ -220,11 +208,10 @@ class PipelineEngine(DeepSpeedEngine): self.has_attention_mask = value def _build_data_iter(self, dataset): - sampler = torch.utils.data.distributed.DistributedSampler( - dataset, - num_replicas=self.dp_world_size, - rank=self.mpu.get_data_parallel_rank(), - shuffle=False) + sampler = torch.utils.data.distributed.DistributedSampler(dataset, + num_replicas=self.dp_world_size, + rank=self.mpu.get_data_parallel_rank(), + shuffle=False) # Build a loader and make it repeating. pipe_dataloader = self.deepspeed_io(dataset, data_sampler=sampler) pipe_dataloader = RepeatingLoader(pipe_dataloader) @@ -317,8 +304,7 @@ class PipelineEngine(DeepSpeedEngine): The arithmetic mean of the losses computed this batch. """ if not torch._C.is_grad_enabled(): - raise RuntimeError( - f'train_batch() requires gradients enabled. Use eval_batch() instead.') + raise RuntimeError(f'train_batch() requires gradients enabled. 
Use eval_batch() instead.') # Curriculum learning could change activation shape if self.curriculum_enabled_legacy(): @@ -360,28 +346,17 @@ class PipelineEngine(DeepSpeedEngine): # Monitoring if self.global_rank == 0 and self.monitor.enabled: - self.summary_events = [(f'Train/Samples/train_loss', - self.agg_train_loss.mean().item(), + self.summary_events = [(f'Train/Samples/train_loss', self.agg_train_loss.mean().item(), self.global_samples)] self.monitor.write_events(self.summary_events) - if self.wall_clock_breakdown( - ) and self.global_steps % self.steps_per_print() == 0: - self.timers.log([ - 'pipe_send_output', - 'pipe_send_grad', - 'pipe_recv_input', - 'pipe_recv_grad' - ]) + if self.wall_clock_breakdown() and self.global_steps % self.steps_per_print() == 0: + self.timers.log(['pipe_send_output', 'pipe_send_grad', 'pipe_recv_input', 'pipe_recv_grad']) # TODO: should return precisely what loss returned and allow others to be queried? return self.agg_train_loss - def eval_batch(self, - data_iter, - return_logits=False, - compute_loss=True, - reduce_output='avg'): + def eval_batch(self, data_iter, return_logits=False, compute_loss=True, reduce_output='avg'): """Evaluate the pipeline on a batch of data from ``data_iter``. The engine will evaluate ``self.train_batch_size()`` total samples collectively across all workers. @@ -448,9 +423,7 @@ class PipelineEngine(DeepSpeedEngine): eval_output = self._bcast_pipe_scalar(eval_output) if self.global_rank == 0 and self.monitor.enabled: - self.summary_events = [(f'Train/Samples/eval_loss', - eval_output.mean().item(), - self.global_samples)] + self.summary_events = [(f'Train/Samples/eval_loss', eval_output.mean().item(), self.global_samples)] self.monitor.write_events(self.summary_events) # Restore the training iterator @@ -510,8 +483,7 @@ class PipelineEngine(DeepSpeedEngine): reduced /= self.dp_world_size else: for idx in range(len(reduced)): - dist.all_reduce(reduced[idx], - group=self.mpu.get_data_parallel_group()) + dist.all_reduce(reduced[idx], group=self.mpu.get_data_parallel_group()) reduced[idx] /= self.dp_world_size return reduced @@ -529,9 +501,7 @@ class PipelineEngine(DeepSpeedEngine): else: result = torch.Tensor([0.]).type(dtype).to(self.device) - dist.broadcast(tensor=result, - src=src_rank, - group=self.mpu.get_pipe_parallel_group()) + dist.broadcast(tensor=result, src=src_rank, group=self.mpu.get_pipe_parallel_group()) return result @@ -550,18 +520,14 @@ class PipelineEngine(DeepSpeedEngine): assert self.global_rank in self.grid.pp_group losses = torch.Tensor([self.dp_group_loss, agg_loss]).to(self.device) - dist.broadcast(tensor=losses, - src=self.global_rank, - group=self.mpu.get_pipe_parallel_group()) + dist.broadcast(tensor=losses, src=self.global_rank, group=self.mpu.get_pipe_parallel_group()) else: # Get loss from last stage src_rank = self.grid.stage_to_global(self.num_stages - 1) assert src_rank in self.grid.pp_group losses = torch.Tensor([0., 0.]).to(self.device) - dist.broadcast(tensor=losses, - src=src_rank, - group=self.grid.get_pipe_parallel_group()) + dist.broadcast(tensor=losses, src=src_rank, group=self.grid.get_pipe_parallel_group()) self.dp_group_loss = losses[0].clone().detach() agg_loss = losses[1].clone().detach() @@ -638,10 +604,9 @@ class PipelineEngine(DeepSpeedEngine): # collect the partitioned input from the previous stage if self.is_pipe_partitioned and not self.is_first_stage(): - part_input = PartitionedTensor.from_meta( - meta=inputs[0], - local_part=inputs[1], - 
group=self.grid.get_slice_parallel_group()) + part_input = PartitionedTensor.from_meta(meta=inputs[0], + local_part=inputs[1], + group=self.grid.get_slice_parallel_group()) inputs = (part_input.full(), *inputs[2:]) inputs[0].requires_grad = True @@ -662,18 +627,14 @@ class PipelineEngine(DeepSpeedEngine): if isinstance(outputs, tuple): first_output = outputs[0] # TODO: Improve pipe partitioning to pass multiple tensors that require grads - assert all([ - torch.is_tensor(elt) and elt.requires_grad is False - for elt in outputs[1:] - ]) + assert all([torch.is_tensor(elt) and elt.requires_grad is False for elt in outputs[1:]]) outputs_tail = outputs[1:] elif torch.is_tensor(outputs): first_output = outputs outputs_tail = [] else: raise ValueError("expecting a tensor or a tuple of tensors") - part = PartitionedTensor(tensor=first_output, - group=self.grid.get_slice_parallel_group()) + part = PartitionedTensor(tensor=first_output, group=self.grid.get_slice_parallel_group()) # Clear the large output data, but save the computation graph first_output.data = torch.zeros(1) self.pipe_buffers['output_tensors'][buffer_id] = first_output @@ -732,10 +693,9 @@ class PipelineEngine(DeepSpeedEngine): # careful to also restore the computational graph of the tensors we partitioned. if self.is_pipe_partitioned: if self.is_grad_partitioned: - part_output = PartitionedTensor.from_meta( - meta=outputs[0], - local_part=outputs[1], - group=self.grid.get_slice_parallel_group()) + part_output = PartitionedTensor.from_meta(meta=outputs[0], + local_part=outputs[1], + group=self.grid.get_slice_parallel_group()) self.pipe_buffers['output_tensors'][buffer_id].data = part_output.full() outputs = (self.pipe_buffers['output_tensors'][buffer_id], *outputs[2:]) else: @@ -746,10 +706,9 @@ class PipelineEngine(DeepSpeedEngine): grad_tensors = self.grad_layer if self.is_grad_partitioned: #print(f'RANK={self.global_rank} BEFORE-BWD restoring grad={self.grad_layer[0].size()} {self.grad_layer[1].size()}') - part_grad = PartitionedTensor.from_meta( - meta=self.grad_layer[0], - local_part=self.grad_layer[1], - group=self.grid.get_slice_parallel_group()) + part_grad = PartitionedTensor.from_meta(meta=self.grad_layer[0], + local_part=self.grad_layer[1], + group=self.grid.get_slice_parallel_group()) grad_tensors = (part_grad.full(), *grad_tensors[2:]) part_grad = None #print(f'RANK={self.global_rank} BEFORE-BWD restored grad={self.grad_layer[0].size()} {self.grad_layer[1].size()}') @@ -865,8 +824,7 @@ class PipelineEngine(DeepSpeedEngine): assert isinstance(tensor, torch.Tensor) send_shape = torch.LongTensor(data=tensor.size()).to(self.device) send_ndims = torch.LongTensor(data=[len(tensor.size())]).to(self.device) - send_dtype = torch.LongTensor(data=[self.DTYPE_TO_ID[tensor.dtype]]).to( - self.device) + send_dtype = torch.LongTensor(data=[self.DTYPE_TO_ID[tensor.dtype]]).to(self.device) p2p.send(send_dtype, recv_stage) p2p.send(send_ndims, recv_stage) p2p.send(send_shape, recv_stage) @@ -990,17 +948,14 @@ class PipelineEngine(DeepSpeedEngine): if isinstance(inputs, tuple): first_input = inputs[0] assert all([torch.is_tensor(elt) for elt in inputs[1:]]) - inputs_grad_tail = [ - elt.grad for elt in inputs[1:] if elt.grad is not None - ] + inputs_grad_tail = [elt.grad for elt in inputs[1:] if elt.grad is not None] elif torch.is_tensor(inputs): first_input = inputs inputs_grad_tail = [] else: raise ValueError("expecting a tensor or a tuple of tensors") assert torch.is_tensor(first_input) - part = PartitionedTensor(tensor=first_input.grad, 
- group=self.grid.get_slice_parallel_group()) + part = PartitionedTensor(tensor=first_input.grad, group=self.grid.get_slice_parallel_group()) inputs = (part.to_meta(), part.data(), *inputs_grad_tail) @@ -1060,9 +1015,7 @@ class PipelineEngine(DeepSpeedEngine): # XXX hardcode meta type if self.is_pipe_partitioned and idx == 0 and buffer.dtype != torch.long: if self.meta_buffer is None: - self.meta_buffer = torch.zeros(buffer.size(), - dtype=torch.long, - device=self.device) + self.meta_buffer = torch.zeros(buffer.size(), dtype=torch.long, device=self.device) buffer = self.meta_buffer p2p.recv(buffer, self.prev_stage) @@ -1091,10 +1044,9 @@ class PipelineEngine(DeepSpeedEngine): # XXX these shapes are hardcoded for Megatron # Restore partitioned output if it was partitioned and we are sending full gradients if self.is_pipe_partitioned and not self.is_grad_partitioned: - part_output = PartitionedTensor.from_meta( - meta=outputs[0], - local_part=outputs[1], - group=self.grid.get_slice_parallel_group()) + part_output = PartitionedTensor.from_meta(meta=outputs[0], + local_part=outputs[1], + group=self.grid.get_slice_parallel_group()) outputs[0].data = part_output.full() outputs = (outputs[0], *outputs[2:]) # save for backward @@ -1104,9 +1056,7 @@ class PipelineEngine(DeepSpeedEngine): if self.grad_layer is None: if isinstance(outputs, torch.Tensor): s = list(outputs.size()) - self.grad_layer = self._allocate_buffer(s, - dtype=outputs.dtype, - num_buffers=1)[0] + self.grad_layer = self._allocate_buffer(s, dtype=outputs.dtype, num_buffers=1)[0] else: # XXX This is a HACK # When we exchange activations/gradients, the two pipe stages @@ -1123,17 +1073,12 @@ class PipelineEngine(DeepSpeedEngine): # branches on is_grad_partitioned so we don't filter out the # metadata tensor. 
if self.is_grad_partitioned: - sizes_and_dtypes = [ - (list(t.size()), - t.dtype) for t in outputs[:2] - ] + [(list(t.size()), - t.dtype) for t in outputs[2:] if t.is_floating_point()] + sizes_and_dtypes = [(list(t.size()), t.dtype) + for t in outputs[:2]] + [(list(t.size()), t.dtype) + for t in outputs[2:] if t.is_floating_point()] else: - sizes_and_dtypes = [(list(t.size()), - t.dtype) for t in outputs - if t.is_floating_point()] - self.grad_layer = self._allocate_buffers(sizes_and_dtypes, - num_buffers=1)[0] + sizes_and_dtypes = [(list(t.size()), t.dtype) for t in outputs if t.is_floating_point()] + self.grad_layer = self._allocate_buffers(sizes_and_dtypes, num_buffers=1)[0] if isinstance(self.grad_layer, torch.Tensor): p2p.recv(self.grad_layer, self.next_stage) @@ -1142,9 +1087,7 @@ class PipelineEngine(DeepSpeedEngine): for idx, buffer in enumerate(self.grad_layer): # XXX GPT-2 hack if self.is_grad_partitioned and idx == 0 and buffer.dtype != torch.long: - buffer.data = torch.zeros(buffer.size(), - dtype=torch.long, - device=self.device) + buffer.data = torch.zeros(buffer.size(), dtype=torch.long, device=self.device) p2p.recv(buffer, self.next_stage) if self.wall_clock_breakdown(): @@ -1163,13 +1106,10 @@ class PipelineEngine(DeepSpeedEngine): self.mem_status('AFTER STEP') if self.global_rank == 0 and self.monitor.enabled: - self.summary_events = [(f'Train/Samples/lr', - self.get_lr()[0], - self.global_samples)] + self.summary_events = [(f'Train/Samples/lr', self.get_lr()[0], self.global_samples)] if self.fp16_enabled() and hasattr(self.optimizer, 'cur_scale'): - self.summary_events.append((f'Train/Samples/loss_scale', - self.optimizer.cur_scale, - self.global_samples)) + self.summary_events.append( + (f'Train/Samples/loss_scale', self.optimizer.cur_scale, self.global_samples)) self.monitor.write_events(self.summary_events) if self.wall_clock_breakdown(): @@ -1177,22 +1117,11 @@ class PipelineEngine(DeepSpeedEngine): self.timers('step').stop() if self.global_steps % self.steps_per_print() == 0: self.timers.log([ - 'batch_input', - 'forward_microstep', - 'backward_microstep', - 'backward_inner_microstep', - 'backward_allreduce_microstep', - 'backward_tied_allreduce_microstep', - 'step_microstep' + 'batch_input', 'forward_microstep', 'backward_microstep', 'backward_inner_microstep', + 'backward_allreduce_microstep', 'backward_tied_allreduce_microstep', 'step_microstep' ]) if self.global_steps % self.steps_per_print() == 0: - self.timers.log([ - 'forward', - 'backward', - 'backward_inner', - 'backward_allreduce', - 'step' - ]) + self.timers.log(['forward', 'backward', 'backward_inner', 'backward_allreduce', 'step']) def _zero_grads(self, inputs): if isinstance(inputs, torch.Tensor): @@ -1236,10 +1165,7 @@ class PipelineEngine(DeepSpeedEngine): for count in range(num_buffers): buffer = [] for shape, dtype in shapes_and_dtypes: - buffer.append( - self._allocate_zeros(shape, - dtype=dtype, - requires_grad=requires_grad)) + buffer.append(self._allocate_zeros(shape, dtype=dtype, requires_grad=requires_grad)) buffers.append(buffer) return buffers @@ -1298,11 +1224,9 @@ class PipelineEngine(DeepSpeedEngine): max_cached /= 1024**3 print( - f'RANK={rank} STAGE={self.stage_id} STEP={self.global_steps} MEMSTATS', - msg, + f'RANK={rank} STAGE={self.stage_id} STEP={self.global_steps} MEMSTATS', msg, f'current alloc={new_alloced:0.4f}GB (delta={delta_alloced:0.4f}GB max={max_alloced:0.4f}GB) ' - f'current cache={new_cached:0.4f}GB (delta={delta_cached:0.4f}GB max={max_cached:0.4f}GB)' - ) + f'current 
cache={new_cached:0.4f}GB (delta={delta_cached:0.4f}GB max={max_cached:0.4f}GB)') def module_state_dict(self): """Override hack to save a pipe model and return the directory path of the save. @@ -1318,8 +1242,7 @@ class PipelineEngine(DeepSpeedEngine): assert self._curr_ckpt_path is not None, \ "PipelineEngine expects module_state_dict() to be called from save_checkpoint()" - self.module.save_state_dict(self._curr_ckpt_path, - checkpoint_engine=self.checkpoint_engine) + self.module.save_state_dict(self._curr_ckpt_path, checkpoint_engine=self.checkpoint_engine) return None def load_module_state_dict(self, state_dict, strict=True, custom_load_fn=None): @@ -1367,9 +1290,7 @@ class PipelineEngine(DeepSpeedEngine): # For each instruction in the step for cmd in step_cmds: if type(cmd) not in self._INSTRUCTION_MAP: - raise RuntimeError( - f'{self.__class__.__name__} does not understand instruction {repr(cmd)}' - ) + raise RuntimeError(f'{self.__class__.__name__} does not understand instruction {repr(cmd)}') # Equivalent to: self._exec_forward_pass(buffer_id=0) self._exec_instr = MethodType(self._INSTRUCTION_MAP[type(cmd)], self) diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py index 234bb55b1..4574b95ae 100644 --- a/deepspeed/runtime/pipe/module.py +++ b/deepspeed/runtime/pipe/module.py @@ -45,6 +45,7 @@ class LayerSpec: LayerSpec(torch.nn.Linear, self.hidden_hidden, self.out_dim)] ] """ + def __init__(self, typename, *module_args, **module_kwargs): self.typename = typename self.module_args = module_args @@ -59,9 +60,7 @@ class LayerSpec: self.global_rank = -1 def __repr__(self): - return ds_utils.call_to_str(self.typename.__name__, - self.module_args, - self.module_kwargs) + return ds_utils.call_to_str(self.typename.__name__, self.module_args, self.module_kwargs) def build(self, log=False): """Build the stored specification.""" @@ -72,13 +71,8 @@ class LayerSpec: class TiedLayerSpec(LayerSpec): - def __init__(self, - key, - typename, - *module_args, - forward_fn=None, - tied_weight_attr='weight', - **module_kwargs): + + def __init__(self, key, typename, *module_args, forward_fn=None, tied_weight_attr='weight', **module_kwargs): super().__init__(typename, *module_args, **module_kwargs) self.key = key self.forward_fn = forward_fn @@ -120,6 +114,7 @@ class PipelineModule(nn.Module): activation_checkpoint_func (callable, optional): The function to use for activation checkpointing. Defaults to ``deepspeed.checkpointing.checkpoint``. checkpointable_layers(list, optional): Checkpointable layers may not be checkpointed. Defaults to None which does not additional filtering. 
""" + def __init__(self, layers, num_stages=None, @@ -154,9 +149,7 @@ class PipelineModule(nn.Module): seed_str = self.seed_fn.__name__ except AttributeError: seed_str = None - print( - f'SEED_LAYERS={self.seed_layers} BASE_SEED={self.base_seed} SEED_FN={seed_str}' - ) + print(f'SEED_LAYERS={self.seed_layers} BASE_SEED={self.base_seed} SEED_FN={seed_str}') # Setup world info self.world_group = dist.new_group(ranks=range(dist.get_world_size())) @@ -173,15 +166,13 @@ class PipelineModule(nn.Module): if topology is None: if self.world_size % self.num_stages != 0: raise RuntimeError( - f'num_stages ({self.num_stages}) must divide distributed world size ({self.world_size})' - ) + f'num_stages ({self.num_stages}) must divide distributed world size ({self.world_size})') dp = self.world_size // num_stages topology = PipeDataParallelTopology(num_pp=num_stages, num_dp=dp) self._topo = topology # Construct communicators for pipeline topology - self._grid = PipelineParallelGrid(process_group=self.world_group, - topology=self._topo) + self._grid = PipelineParallelGrid(process_group=self.world_group, topology=self._topo) self.stage_id = self._topo.get_coord(self.global_rank).pipe @@ -245,9 +236,7 @@ class PipelineModule(nn.Module): self.forward_funcs.append(self.tied_modules[layer.key]) else: # User specified fn with args (module, input) - self.forward_funcs.append( - partial(layer.forward_fn, - self.tied_modules[layer.key])) + self.forward_funcs.append(partial(layer.forward_fn, self.tied_modules[layer.key])) # LayerSpec objects contain an nn.Module that should be allocated now. elif isinstance(layer, LayerSpec): @@ -304,8 +293,7 @@ class PipelineModule(nn.Module): idxs.append(idx) if len(idxs) == 0: - raise RuntimeError( - f"Partitioning '{layername}' found no valid layers to partition.") + raise RuntimeError(f"Partitioning '{layername}' found no valid layers to partition.") return idxs def forward(self, forward_input): @@ -327,8 +315,7 @@ class PipelineModule(nn.Module): for idx, layer in enumerate(self.forward_funcs[start:end]): self.curr_layer = idx + self._local_start if self.seed_layers: - new_seed = (self.base_seed * - local_micro_offset) + self.curr_layer + new_seed = (self.base_seed * local_micro_offset) + self.curr_layer if self.seed_fn: self.seed_fn(new_seed) else: @@ -346,8 +333,7 @@ class PipelineModule(nn.Module): num_layers = len(self.forward_funcs) x = forward_input for start_idx in range(0, num_layers, self.activation_checkpoint_interval): - end_idx = min(start_idx + self.activation_checkpoint_interval, - num_layers) + end_idx = min(start_idx + self.activation_checkpoint_interval, num_layers) funcs = self.forward_funcs[start_idx:end_idx] # Since we either pass tensors or tuples of tensors without unpacking, we @@ -356,10 +342,7 @@ class PipelineModule(nn.Module): x = (x, ) if self._is_checkpointable(funcs): - x = self.activation_checkpoint_func( - exec_range_func(start_idx, - end_idx), - *x) + x = self.activation_checkpoint_func(exec_range_func(start_idx, end_idx), *x) else: x = exec_range_func(start_idx, end_idx)(*x) return x @@ -376,19 +359,16 @@ class PipelineModule(nn.Module): # Each stage gets a simple uniform number of layers. 
         if method == 'uniform':
             num_layers = len(self._layer_specs)
-            self.parts = ds_utils.partition_uniform(num_items=num_layers,
-                                                    num_parts=num_stages)
+            self.parts = ds_utils.partition_uniform(num_items=num_layers, num_parts=num_stages)
         elif method == 'parameters':
             param_counts = self._count_layer_params()
-            self.parts = ds_utils.partition_balanced(weights=param_counts,
-                                                     num_parts=num_stages)
+            self.parts = ds_utils.partition_balanced(weights=param_counts, num_parts=num_stages)
         elif method.startswith('type:'):
             layertype = method.split(':')[1]
             binary_weights = [0] * len(self._layer_specs)
             for idx in self._find_layer_type(layertype):
                 binary_weights[idx] = 1
-            self.parts = ds_utils.partition_balanced(weights=binary_weights,
-                                                     num_parts=num_stages)
+            self.parts = ds_utils.partition_balanced(weights=binary_weights, num_parts=num_stages)
         elif method == 'profile':
             raise NotImplementedError(f'Partitioning method {method} not implemented.')
         else:
@@ -436,8 +416,7 @@ class PipelineModule(nn.Module):
     def _synchronize_tied_weights(self):
         for key, comm in self.tied_comms.items():
             dist.broadcast(
-                getattr(comm['module'],
-                        comm['weight_attr']),
+                getattr(comm['module'], comm['weight_attr']),
                 src=min(comm['ranks']),
                 group=comm['group'],
             )
@@ -467,14 +446,9 @@ class PipelineModule(nn.Module):
                     tied_ranks = []
                     for s in sorted(tied_stages):
                         if self._grid.get_slice_parallel_world_size() > 1:
-                            tied_ranks.append(
-                                self._grid.stage_to_global(stage_id=s,
-                                                           data=dp,
-                                                           model=mp))
+                            tied_ranks.append(self._grid.stage_to_global(stage_id=s, data=dp, model=mp))
                         else:
-                            tied_ranks.append(
-                                self._grid.stage_to_global(stage_id=s,
-                                                           data=dp))
+                            tied_ranks.append(self._grid.stage_to_global(stage_id=s, data=dp))
                     group = dist.new_group(ranks=tied_ranks)
 
                     # Record this tied module if we own a local copy of it.
@@ -599,10 +573,7 @@ class PipelineModule(nn.Module):
             # It is expected that the garbage collector will reclaim the cloned tensor storage to avoid memory bloat.
             # See https://pytorch.org/docs/stable/notes/serialization.html#preserve-storage-sharing
             orig_state_dict = layer.state_dict()
-            final_state_dict = type(orig_state_dict)(
-                {k: v.clone()
-                 for k,
-                 v in orig_state_dict.items()})
+            final_state_dict = type(orig_state_dict)({k: v.clone() for k, v in orig_state_dict.items()})
             checkpoint_engine.save(final_state_dict, model_ckpt_path)
 
     def load_state_dir(self, load_dir, checkpoint_engine, strict=True):
@@ -616,10 +587,9 @@ class PipelineModule(nn.Module):
             mp_rank = self._grid.get_slice_parallel_rank()
             mp_world_size = self._grid.get_slice_parallel_world_size()
 
-            sd_loader = SDLoaderFactory.get_sd_loader(
-                model_ckpt_list,
-                version=2.0,
-                checkpoint_engine=checkpoint_engine)
+            sd_loader = SDLoaderFactory.get_sd_loader(model_ckpt_list,
+                                                      version=2.0,
+                                                      checkpoint_engine=checkpoint_engine)
             load_path, checkpoint, _ = sd_loader.load(mp_world_size, mp_rank, module_key=None, is_pipe_parallel=True)
 
             layer.load_state_dict(checkpoint)
@@ -636,8 +606,7 @@ class PipelineModule(nn.Module):
         # Some layers like torch.nn.Embedding will not receive grads if checkpointed, which breaks things.
         # I presume it's related to the discrete inputs that cannot require_grad? Need to revisit.
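
The clone() in save_state_dict above exists because torch.save() serializes a tensor's whole backing storage, not just the view being saved; cloning first keeps per-layer checkpoints small when tensors are views into large flattened buffers. A minimal illustration (file names are arbitrary):

import os
import torch

big = torch.zeros(1_000_000)
view = big.narrow(0, 0, 10)  # a 10-element view into a 1M-element storage

torch.save({'w': view}, 'view.pt')           # writes the full 1M-element storage
torch.save({'w': view.clone()}, 'clone.pt')  # writes only the 10 elements

print(os.path.getsize('view.pt'), os.path.getsize('clone.pt'))
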
if self.__class__.__name__ in ('GPTModelPipe', 'GPT2ModelPipe'): - return all('ParallelTransformerLayerPipe' in f.__class__.__name__ - for f in funcs) + return all('ParallelTransformerLayerPipe' in f.__class__.__name__ for f in funcs) if self.checkpointable_layers is not None: return all(f.__class__.__name__ in self.checkpointable_layers for f in funcs) diff --git a/deepspeed/runtime/pipe/p2p.py b/deepspeed/runtime/pipe/p2p.py index 93c61114c..e7c675fc6 100644 --- a/deepspeed/runtime/pipe/p2p.py +++ b/deepspeed/runtime/pipe/p2p.py @@ -111,12 +111,10 @@ def send_obj(msg: typing.Any, dest: int): # serialize the message msg = pickle.dumps(msg) # construct a tensor to send - msg = torch.ByteTensor(torch.ByteStorage.from_buffer(msg)).to( - get_accelerator().device_name()) + msg = torch.ByteTensor(torch.ByteStorage.from_buffer(msg)).to(get_accelerator().device_name()) # Send meta and message - length_tensor = torch.tensor([len(msg)], - dtype=torch.long).to(get_accelerator().device_name()) + length_tensor = torch.tensor([len(msg)], dtype=torch.long).to(get_accelerator().device_name()) dist.send(length_tensor, dst=dest) dist.send(msg, dst=dest) @@ -135,8 +133,7 @@ def recv_obj(sender: int) -> typing.Any: dist.recv(length, src=sender) # Receive and deserialize - msg = torch.empty(length.item(), - dtype=torch.uint8).to(get_accelerator().device_name()) + msg = torch.empty(length.item(), dtype=torch.uint8).to(get_accelerator().device_name()) dist.recv(msg, src=sender) msg = pickle.loads(msg.cpu().numpy().tobytes()) diff --git a/deepspeed/runtime/pipe/schedule.py b/deepspeed/runtime/pipe/schedule.py index a2de45ad5..1d5d2c401 100644 --- a/deepspeed/runtime/pipe/schedule.py +++ b/deepspeed/runtime/pipe/schedule.py @@ -42,6 +42,7 @@ class PipeSchedule(ABC): stages (int): The number of pipeline stages. stage_id (int): The pipe stage that will execute the generated schedule. """ + def __init__(self, micro_batches, stages, stage_id): super().__init__() self.micro_batches = micro_batches @@ -131,6 +132,7 @@ class PipeSchedule(ABC): class InferenceSchedule(PipeSchedule): """A schedule for inferencing batches using pipeline parallelism. """ + def steps(self): """""" prev_micro_batch_id = -1 @@ -188,6 +190,7 @@ class TrainSchedule(PipeSchedule): convergence follows that of a data parallel approach with the same batch size. """ + def steps(self): """""" prev_micro_batch_id = -1 @@ -206,18 +209,14 @@ class TrainSchedule(PipeSchedule): # Exchange activations if is_forward: - if self._valid_micro_batch(prev_micro_batch_id) and self._valid_stage( - self.prev_stage): + if self._valid_micro_batch(prev_micro_batch_id) and self._valid_stage(self.prev_stage): cmds.append(SendGrad(prev_buffer)) - if self._valid_micro_batch(micro_batch_id) and self._valid_stage( - self.prev_stage): + if self._valid_micro_batch(micro_batch_id) and self._valid_stage(self.prev_stage): cmds.append(RecvActivation(curr_buffer)) else: - if self._valid_micro_batch(micro_batch_id) and self._valid_stage( - self.next_stage): + if self._valid_micro_batch(micro_batch_id) and self._valid_stage(self.next_stage): cmds.append(RecvGrad(curr_buffer)) - if self._valid_micro_batch(prev_micro_batch_id) and self._valid_stage( - self.next_stage): + if self._valid_micro_batch(prev_micro_batch_id) and self._valid_stage(self.next_stage): cmds.append(SendActivation(prev_buffer)) # First/last stage loads @@ -300,6 +299,7 @@ class DataParallelSchedule(PipeSchedule): """An example schedule that trains using traditional data parallelism with gradient accumulation. 
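
The send_obj/recv_obj helpers above move arbitrary picklable objects between pipeline ranks by first sending a length tensor and then the pickled payload. A bare torch.distributed sketch of the same pattern, assuming an initialized process group and one CUDA device per rank:

import pickle
import torch
import torch.distributed as dist

def send_obj(obj, dst):
    payload = torch.tensor(list(pickle.dumps(obj)), dtype=torch.uint8).cuda()
    length = torch.tensor([payload.numel()], dtype=torch.long).cuda()
    dist.send(length, dst=dst)   # receiver learns how many bytes follow
    dist.send(payload, dst=dst)

def recv_obj(src):
    length = torch.zeros(1, dtype=torch.long).cuda()
    dist.recv(length, src=src)
    payload = torch.empty(length.item(), dtype=torch.uint8).cuda()
    dist.recv(payload, src=src)
    return pickle.loads(payload.cpu().numpy().tobytes())
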
""" + def steps(self): """""" for step_id in range(self.micro_batches): @@ -330,6 +330,7 @@ class PipeInstruction: Args: kwargs (optional): keyword arguments to store as members """ + def __init__(self, **kwargs): self.name = self.__class__.__name__ self.kwargs = kwargs @@ -374,6 +375,7 @@ class BufferOpInstruction(PipeInstruction): Args: buffer_id (int): the index of the pipeline buffer() to modify. """ + def __init__(self, buffer_id, **kwargs): super().__init__(buffer_id=buffer_id, **kwargs) diff --git a/deepspeed/runtime/pipe/topology.py b/deepspeed/runtime/pipe/topology.py index 6c0cd9644..d0d77c377 100644 --- a/deepspeed/runtime/pipe/topology.py +++ b/deepspeed/runtime/pipe/topology.py @@ -19,6 +19,7 @@ class ProcessTopology: Some methods return ProcessCoord namedtuples. """ + def __init__(self, axes, dims): """Create a mapping of n-dimensional tensor coordinates to linear indices. @@ -62,12 +63,7 @@ class ProcessTopology: """Return a list of the axis names in the ordering of the topology. """ return self.axes - def get_rank_repr(self, - rank, - omit_axes=['data', - 'pipe'], - inner_sep='_', - outer_sep='-'): + def get_rank_repr(self, rank, omit_axes=['data', 'pipe'], inner_sep='_', outer_sep='-'): """Return a string representation of a rank. This method is primarily used for checkpointing model data. @@ -181,6 +177,7 @@ class ProcessTopology: Returns: The list of ranks whose coordinates match filter_kwargs. """ + def _filter_helper(x): for key, val in filter_kwargs.items(): if getattr(x, key) != val: @@ -236,12 +233,14 @@ class PipeDataParallelTopology(ProcessTopology): reductions to use high-bandwidth intra-node links and lower-volume pipeline communications to use low-bandwidth inter-node links. """ + def __init__(self, num_pp, num_dp): super().__init__(axes=['pipe', 'data'], dims=[num_pp, num_dp]) class PipeModelDataParallelTopology(ProcessTopology): """ A topology for hybrid pipeline, model, and data parallelism. """ + def __init__(self, num_pp, num_mp, num_dp): super().__init__(axes=['pipe', 'data', 'model'], dims=[num_pp, num_dp, num_mp]) @@ -268,6 +267,7 @@ class PipelineParallelGrid: data_parallel_id = 0, or similarly [9,5] represents wrapped around stages [4,0] for data_parallel_id = 1. """ + def __init__(self, topology=None, process_group=None): # TODO use process_group if provided self.global_rank = dist.get_rank() diff --git a/deepspeed/runtime/progressive_layer_drop.py b/deepspeed/runtime/progressive_layer_drop.py index 65e1a5630..1d40ff6b2 100755 --- a/deepspeed/runtime/progressive_layer_drop.py +++ b/deepspeed/runtime/progressive_layer_drop.py @@ -13,6 +13,7 @@ class ProgressiveLayerDrop(object): The lower the theta value, the faster the training speed. Default value: 0.5. gamma (float): a hyper-parameter that controls how fast the drop ratio increases. Default value: 0.001. """ + def __init__(self, theta=0.5, gamma=0.001): super().__init__() @@ -29,6 +30,7 @@ class ProgressiveLayerDrop(object): return self.current_theta def update_state(self, global_step): + def _prob(x, gamma, p): return (1. 
- p) * np.exp(-gamma * x) + p diff --git a/deepspeed/runtime/quantize.py b/deepspeed/runtime/quantize.py index 81a7bd569..b2299ca50 100755 --- a/deepspeed/runtime/quantize.py +++ b/deepspeed/runtime/quantize.py @@ -9,6 +9,7 @@ TWO_D_PARAMS = 6 class Quantizer(object): + def __init__(self, q_groups=1, q_mixed_fp16=False, @@ -39,17 +40,12 @@ class Quantizer(object): result = False for index in range(self.layer_num): if self.q_start_bits[index] != self.q_target_bits: - next_step = self.qsteps + ( - TWO_D_PARAMS * (self.layer_num if self.layer_num != 0 else 1)) + next_step = self.qsteps + (TWO_D_PARAMS * (self.layer_num if self.layer_num != 0 else 1)) if next_step >= self.q_period[index]: result = True return result - def quantize(self, - parameter_group, - overflow, - eigenvalue_enabled, - block_eigenvalue={}): + def quantize(self, parameter_group, overflow, eigenvalue_enabled, block_eigenvalue={}): if overflow and not eigenvalue_enabled: return @@ -65,7 +61,8 @@ class Quantizer(object): if block_eigenvalue is None: eigenvalue, layer_id = None, 0 else: - eigenvalue, layer_id = block_eigenvalue[param_id] if param_id in block_eigenvalue else (None, 0) + eigenvalue, layer_id = block_eigenvalue[param_id] if param_id in block_eigenvalue else (None, + 0) if eigenvalue is not None: factor = 1 + math.floor(eigenvalue * 4) p.data = self.compute_quantization(p.data, layer_id, factor) @@ -91,15 +88,11 @@ class Quantizer(object): if self.q_type == 'symmetric': scale = 2 * torch.max(torch.abs(g_min), torch.abs(g_max)) / q_range zero_point = 0. - input_flat = (input_flat / scale + p).round().clamp( - -(q_range >> 1), - (q_range >> 1) - 1) * scale + input_flat = (input_flat / scale + p).round().clamp(-(q_range >> 1), (q_range >> 1) - 1) * scale elif self.q_type == 'asymmetric': scale = (g_max - g_min) / q_range zero_point = (g_min / scale).round() * scale - input_flat = ((input_flat - zero_point) / scale + p).round().clamp( - 0, - (q_range - 1)) * scale + zero_point + input_flat = ((input_flat - zero_point) / scale + p).round().clamp(0, (q_range - 1)) * scale + zero_point output = input_flat.reshape(inputs.shape).contiguous() return output @@ -126,8 +119,7 @@ class Quantizer(object): def mixed_fp16_quantize(self, input, input_q, index): if self.q_mixed_fp16 and self.q_start_bits[index] >= (self.q_target_bits - 1): - input_q = input * self.quantize_real_ratio + ( - 1 - self.quantize_real_ratio) * input_q + input_q = input * self.quantize_real_ratio + (1 - self.quantize_real_ratio) * input_q return input_q return input_q @@ -152,15 +144,12 @@ class Quantizer(object): if self.use_quantizer_kernel: if input.start_bits <= 2: - raise ValueError( - 'Quantization bit is too low, please do it without quantization kernel!' 
- ) - input_q = ds_quantizer( - input.data.clone(), - self.q_groups, - input.start_bits, - asym=False if self.q_type == 'symmetric' else True, - sr=False if self.q_rounding == 'nearest_neighbor' else True) + raise ValueError('Quantization bit is too low, please do it without quantization kernel!') + input_q = ds_quantizer(input.data.clone(), + self.q_groups, + input.start_bits, + asym=False if self.q_type == 'symmetric' else True, + sr=False if self.q_rounding == 'nearest_neighbor' else True) else: if input.start_bits >= 3: input_flat = self.quantize_highbit(input.data, input.start_bits) diff --git a/deepspeed/runtime/sparse_tensor.py b/deepspeed/runtime/sparse_tensor.py index 49dedbe14..0a1bdb876 100644 --- a/deepspeed/runtime/sparse_tensor.py +++ b/deepspeed/runtime/sparse_tensor.py @@ -10,6 +10,7 @@ import torch class SparseTensor(object): """ Compressed Sparse Tensor """ + def __init__(self, dense_tensor=None): self.orig_dense_tensor = dense_tensor self.is_sparse = dense_tensor.is_sparse @@ -29,9 +30,7 @@ class SparseTensor(object): self.dense_size = None def to_coo_tensor(self): - return torch.sparse_coo_tensor(self.indices.unsqueeze(0), - self.values, - self.dense_size) + return torch.sparse_coo_tensor(self.indices.unsqueeze(0), self.values, self.dense_size) @staticmethod def type(): @@ -40,10 +39,7 @@ class SparseTensor(object): def to_dense(self): it = self.indices.unsqueeze(1) full_indices = torch.cat([it for _ in range(self.dense_size[1])], dim=1) - return self.values.new_zeros(self.dense_size).scatter_add_( - 0, - full_indices, - self.values) + return self.values.new_zeros(self.dense_size).scatter_add_(0, full_indices, self.values) def sparse_size(self): index_size = list(self.indices.size()) diff --git a/deepspeed/runtime/state_dict_factory.py b/deepspeed/runtime/state_dict_factory.py index 1f5c97f27..9cd47ae2d 100755 --- a/deepspeed/runtime/state_dict_factory.py +++ b/deepspeed/runtime/state_dict_factory.py @@ -18,6 +18,7 @@ AUTO_MODULE_KEY = 'auto' class SDLoaderFactory: + @staticmethod def get_sd_loader_json(json_file, checkpoint_engine): if isinstance(json_file, str): @@ -33,10 +34,7 @@ class SDLoaderFactory: mp_size = data.get('mp_size', 0) if sd_type.lower() in ['bloom', 'ds_model']: return data - return SDLoaderFactory.get_sd_loader(ckpt_list, - checkpoint_engine, - sd_type, - version) + return SDLoaderFactory.get_sd_loader(ckpt_list, checkpoint_engine, sd_type, version) @staticmethod def get_sd_loader(ckpt_list, checkpoint_engine, sd_type='Megatron', version=None): @@ -47,12 +45,12 @@ class SDLoaderFactory: class SDLoaderBase(ABC): + def __init__(self, ckpt_list, version, checkpoint_engine): self.module_key = None self.ckpt_list = ckpt_list self.version = version - self.checkpoint_engine = TorchCheckpointEngine( - ) if checkpoint_engine is None else checkpoint_engine + self.checkpoint_engine = TorchCheckpointEngine() if checkpoint_engine is None else checkpoint_engine self.check_ckpt_list() def load(self, @@ -99,9 +97,9 @@ class SDLoaderBase(ABC): loc: storage) if quantize: - quantizer = WeightQuantization(mlp_extra_grouping=mlp_extra_grouping, - mp_size=mp_world_size) - sd_module, all_scales = quantizer.sd_quantize_megatron(self.get_module(sd), quantize_bits, quantize_groups) + quantizer = WeightQuantization(mlp_extra_grouping=mlp_extra_grouping, mp_size=mp_world_size) + sd_module, all_scales = quantizer.sd_quantize_megatron(self.get_module(sd), quantize_bits, + quantize_groups) self.set_module(sd, sd_module) else: all_scales = None @@ -118,17 +116,10 @@ class 
SDLoaderBase(ABC): assert num_ckpt % mp_world_size == 0, 'Invalid checkpoints and world size for sd merge' num_to_merge = num_ckpt // mp_world_size - ckpt_list = [ - self.ckpt_list[i] for i in range(num_to_merge * mp_rank, - num_to_merge * (mp_rank + 1)) - ] + ckpt_list = [self.ckpt_list[i] for i in range(num_to_merge * mp_rank, num_to_merge * (mp_rank + 1))] logger.info(f"mp_rank: {mp_rank}, ckpt_list: {ckpt_list}") - sd_list = [ - self.checkpoint_engine.load(ckpt, - map_location=lambda storage, - loc: storage) for ckpt in ckpt_list - ] + sd_list = [self.checkpoint_engine.load(ckpt, map_location=lambda storage, loc: storage) for ckpt in ckpt_list] return sd_list def get_split_state_dict(self, mp_world_size, mp_rank): @@ -139,18 +130,15 @@ class SDLoaderBase(ABC): ckpt_index = mp_rank // num_to_split ckpt_offset = mp_rank % num_to_split - logger.info( - f"mp_rank: {mp_rank}, ckpt_list: {self.ckpt_list[ckpt_index]}, offset: {ckpt_offset}" - ) + logger.info(f"mp_rank: {mp_rank}, ckpt_list: {self.ckpt_list[ckpt_index]}, offset: {ckpt_offset}") - sd = self.checkpoint_engine.load(self.ckpt_list[ckpt_index], - map_location=lambda storage, - loc: storage) + sd = self.checkpoint_engine.load(self.ckpt_list[ckpt_index], map_location=lambda storage, loc: storage) return sd, num_to_split, ckpt_offset def _choose_module_key(self, sd): - assert not ('module' in sd and 'model' in sd), "checkpoint has both 'model' and 'module' keys, not sure how to proceed" + assert not ('module' in sd + and 'model' in sd), "checkpoint has both 'model' and 'module' keys, not sure how to proceed" assert 'module' in sd or 'model' in sd, "checkpoint contains neither 'model' or 'module' keys, not sure how to proceed" if 'module' in sd: return 'module' @@ -178,32 +166,19 @@ class SDLoaderBase(ABC): #logger.info(f'checkpoint file list: {self.ckpt_list}') assert len(self.ckpt_list) > 0 - sd = self.checkpoint_engine.load(self.ckpt_list[0], - map_location=lambda storage, - loc: storage) + sd = self.checkpoint_engine.load(self.ckpt_list[0], map_location=lambda storage, loc: storage) # check checkpoint count is same with saved mp_world_size if 'mp_world_size' in sd.keys(): - assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + assert len(self.ckpt_list) == sd[ + 'mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" @abstractmethod - def merge_state_dict(self, - mp_world_size, - mp_rank, - quantize, - quantize_bits, - groups, - mlp_extra_grouping): + def merge_state_dict(self, mp_world_size, mp_rank, quantize, quantize_bits, groups, mlp_extra_grouping): pass @abstractmethod - def split_state_dict(self, - mp_world_size, - mp_rank, - quantize, - quantize_bits, - groups, - mlp_extra_grouping): + def split_state_dict(self, mp_world_size, mp_rank, quantize, quantize_bits, groups, mlp_extra_grouping): pass @abstractmethod @@ -212,6 +187,7 @@ class SDLoaderBase(ABC): class MegatronSDLoader(SDLoaderBase): + def __init__(self, ckpt_list, version, checkpoint_engine): super().__init__(ckpt_list, version, checkpoint_engine) """ @@ -340,40 +316,27 @@ class MegatronSDLoader(SDLoaderBase): ckpt_ver = self.get_checkpoint_version(ds_sd) logger.info(f"checkpoint version: {ckpt_ver}") if quantize: - quantizer = WeightQuantization(mlp_extra_grouping=mlp_extra_grouping, - mp_size=mp_world_size) + quantizer = WeightQuantization(mlp_extra_grouping=mlp_extra_grouping, mp_size=mp_world_size) 
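
For context on the merge logic that follows: Megatron shards column-parallel weights (e.g. attention.query_key_value, mlp.dense_h_to_4h) along the output dimension and row-parallel weights (e.g. attention.dense, mlp.dense_4h_to_h) along the input dimension, so merging concatenates shards along dim 0 and dim 1 respectively. A small illustration with hypothetical shapes for mp_world_size = 2:

import torch

hidden = 1024
# Two column-parallel shards of mlp.dense_h_to_4h.weight: each (2 * hidden, hidden)
col_shards = [torch.randn(2 * hidden, hidden) for _ in range(2)]
merged_col = torch.cat(col_shards, dim=0)   # -> (4 * hidden, hidden)

# Two row-parallel shards of mlp.dense_4h_to_h.weight: each (hidden, 2 * hidden)
row_shards = [torch.randn(hidden, 2 * hidden) for _ in range(2)]
merged_row = torch.cat(row_shards, dim=1)   # -> (hidden, 4 * hidden)

The shard-count arithmetic in get_merge_state_dicts above works the same way: with 8 saved shards and mp_world_size = 2, each rank merges num_to_merge = 8 // 2 = 4 consecutive files (rank 0 takes ckpt_list[0:4], rank 1 takes ckpt_list[4:8]).
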
for key in keys: value_list = [sd[key] for sd in client_sd_list] if "attention.dense.weight" in key or "mlp.dense_4h_to_h.weight" in key: if quantize: - value_list = quantizer.Quantize(value_list, - quantize_bits, - groups, - key=key, - merge_dim=1) + value_list = quantizer.Quantize(value_list, quantize_bits, groups, key=key, merge_dim=1) new_client_sd[key] = torch.cat(value_list, axis=1) elif "attention.query_key_value" in key: if quantize and "attention.query_key_value.weight" in key: - value_list = quantizer.Quantize(value_list, - quantize_bits, - groups, - key=key) + value_list = quantizer.Quantize(value_list, quantize_bits, groups, key=key) new_client_sd[key] = torch.cat(value_list, axis=0) else: if quantize: new_client_sd[key] = torch.cat(value_list, axis=0) else: - new_client_sd[key] = self.merge_query_key_value( - value_list, - ckpt_ver) + new_client_sd[key] = self.merge_query_key_value(value_list, ckpt_ver) elif "mlp.dense_h_to_4h.weight" in key or "word_embeddings.weight" in key or "mlp.dense_h_to_4h.bias" in key: if quantize and "mlp.dense_h_to_4h.weight" in key: - value_list = quantizer.Quantize(value_list, - quantize_bits, - groups, - key=key) + value_list = quantizer.Quantize(value_list, quantize_bits, groups, key=key) new_client_sd[key] = torch.cat(value_list, axis=0) else: new_client_sd[key] = value_list[0] @@ -402,8 +365,7 @@ class MegatronSDLoader(SDLoaderBase): logger.info(f"checkpoint version: {ckpt_ver}") if quantize: - quantizer = WeightQuantization(mlp_extra_grouping=mlp_extra_grouping, - mp_size=mp_world_size) + quantizer = WeightQuantization(mlp_extra_grouping=mlp_extra_grouping, mp_size=mp_world_size) for key in client_sd.keys(): value = client_sd[key] @@ -419,11 +381,7 @@ class MegatronSDLoader(SDLoaderBase): if quantize and "attention.query_key_value.weight" in key: q_vals = quantizer.Quantize([value], quantize_bits, groups, key) value = q_vals[0] - new_client_sd[key] = self.split_query_key_value( - value, - num_to_split, - ckpt_offset, - ckpt_ver) + new_client_sd[key] = self.split_query_key_value(value, num_to_split, ckpt_offset, ckpt_ver) elif "mlp.dense_h_to_4h.weight" in key or "word_embeddings.weight" in key or "mlp.dense_h_to_4h.bias" in key or "final_linear.weight" in key: assert value.shape[0] % num_to_split == 0 split_size = value.shape[0] // num_to_split @@ -443,16 +401,11 @@ class MegatronSDLoader(SDLoaderBase): def sanity_check(self, ckpt_file_name): keys_to_check = [ - "attention.dense.weight", - "mlp.dense_4h_to_h.weight", - "attention.query_key_value", - "mlp.dense_h_to_4h.weight", - "mlp.dense_h_to_4h.bias" + "attention.dense.weight", "mlp.dense_4h_to_h.weight", "attention.query_key_value", + "mlp.dense_h_to_4h.weight", "mlp.dense_h_to_4h.bias" ] - sd = self.checkpoint_engine.load(ckpt_file_name, - map_location=lambda storage, - loc: storage) + sd = self.checkpoint_engine.load(ckpt_file_name, map_location=lambda storage, loc: storage) # partial_key is a sub-string of one key in the sd def check_key_exist(partial_key, sd): @@ -465,10 +418,9 @@ class MegatronSDLoader(SDLoaderBase): return found for key in keys_to_check: - assert check_key_exist(key, self.get_module(sd)), f'key: {key} is not found in the checkpoint {ckpt_file_name}' + assert check_key_exist(key, + self.get_module(sd)), f'key: {key} is not found in the checkpoint {ckpt_file_name}' def get_checkpoint_version(self, state_dict): # Use 0 if version info doesn't exist - return self.version if self.version is not None else state_dict.get( - 'checkpoint_version', - 0) + return self.version 
if self.version is not None else state_dict.get('checkpoint_version', 0) diff --git a/deepspeed/runtime/swap_tensor/aio_config.py b/deepspeed/runtime/swap_tensor/aio_config.py index 6a7014c18..a9b5eed27 100644 --- a/deepspeed/runtime/swap_tensor/aio_config.py +++ b/deepspeed/runtime/swap_tensor/aio_config.py @@ -19,26 +19,11 @@ def get_aio_config(param_dict): if AIO in param_dict.keys() and param_dict[AIO] is not None: aio_dict = param_dict[AIO] return { - AIO_BLOCK_SIZE: - get_scalar_param(aio_dict, - AIO_BLOCK_SIZE, - AIO_BLOCK_SIZE_DEFAULT), - AIO_QUEUE_DEPTH: - get_scalar_param(aio_dict, - AIO_QUEUE_DEPTH, - AIO_QUEUE_DEPTH_DEFAULT), - AIO_THREAD_COUNT: - get_scalar_param(aio_dict, - AIO_THREAD_COUNT, - AIO_THREAD_COUNT_DEFAULT), - AIO_SINGLE_SUBMIT: - get_scalar_param(aio_dict, - AIO_SINGLE_SUBMIT, - AIO_SINGLE_SUBMIT_DEFAULT), - AIO_OVERLAP_EVENTS: - get_scalar_param(aio_dict, - AIO_OVERLAP_EVENTS, - AIO_OVERLAP_EVENTS_DEFAULT) + AIO_BLOCK_SIZE: get_scalar_param(aio_dict, AIO_BLOCK_SIZE, AIO_BLOCK_SIZE_DEFAULT), + AIO_QUEUE_DEPTH: get_scalar_param(aio_dict, AIO_QUEUE_DEPTH, AIO_QUEUE_DEPTH_DEFAULT), + AIO_THREAD_COUNT: get_scalar_param(aio_dict, AIO_THREAD_COUNT, AIO_THREAD_COUNT_DEFAULT), + AIO_SINGLE_SUBMIT: get_scalar_param(aio_dict, AIO_SINGLE_SUBMIT, AIO_SINGLE_SUBMIT_DEFAULT), + AIO_OVERLAP_EVENTS: get_scalar_param(aio_dict, AIO_OVERLAP_EVENTS, AIO_OVERLAP_EVENTS_DEFAULT) } return AIO_DEFAULT_DICT diff --git a/deepspeed/runtime/swap_tensor/async_swapper.py b/deepspeed/runtime/swap_tensor/async_swapper.py index 45614abd7..49db7fd5d 100644 --- a/deepspeed/runtime/swap_tensor/async_swapper.py +++ b/deepspeed/runtime/swap_tensor/async_swapper.py @@ -15,6 +15,7 @@ ASYNC_SWAPPER_WAIT_TIMER = 'async_swap_gradient_wait' class AsyncTensorSwapper(object): + def __init__(self, aio_handle, numel_alignment, timers): self.free_buffer_index = [] self.swapping_buffer_index = [] @@ -70,9 +71,7 @@ class AsyncTensorSwapper(object): if dist.get_rank() == 0: element_size = torch.tensor([], dtype=self.dtype).element_size() swapped_GB = (self.num_elements_swapped * element_size) / (1024**3) - logger.debug( - f'{message} num_elems = {self.num_elements_swapped}, {swapped_GB:5.2f} GB' - ) + logger.debug(f'{message} num_elems = {self.num_elements_swapped}, {swapped_GB:5.2f} GB') def _swap_out_tensor(self, tensor, swap_path): assert len(self.all_buffers) > 0 diff --git a/deepspeed/runtime/swap_tensor/optimizer_utils.py b/deepspeed/runtime/swap_tensor/optimizer_utils.py index 70b806c3a..f05c130f4 100644 --- a/deepspeed/runtime/swap_tensor/optimizer_utils.py +++ b/deepspeed/runtime/swap_tensor/optimizer_utils.py @@ -17,6 +17,7 @@ from deepspeed.runtime.swap_tensor.utils import SwapBufferManager, SwapBufferPoo class FlattenedTensorSwapInfo(object): + def __init__(self, path, length, offset): self.path = path self.offset = offset @@ -24,6 +25,7 @@ class FlattenedTensorSwapInfo(object): class OptimizerStateSwapInfo(object): + def __init__(self, parameter, numel, base_folder): self.tensors = [] self.param_id = id(parameter) @@ -66,13 +68,8 @@ class OptimizerStateSwapInfo(object): gradient_paths = [] for offset, length in zip(offsets, lengths): if not offset in self.swapped_gradients.keys(): - path = os.path.join( - self.swap_folder, - f'{self.param_id}_gradient_{offset}_{length}.tensor.swp') - self.swapped_gradients[offset] = FlattenedTensorSwapInfo( - path, - length, - offset) + path = os.path.join(self.swap_folder, f'{self.param_id}_gradient_{offset}_{length}.tensor.swp') + self.swapped_gradients[offset] = 
FlattenedTensorSwapInfo(path, length, offset) gradient_paths.append(self.swapped_gradients[offset].path) @@ -86,11 +83,7 @@ class OptimizerStateSwapInfo(object): def get_swap_gradient_buffers(self, swap_buffer): assert self.numel() <= swap_buffer.numel() - return [ - swap_buffer.narrow(0, - grad.offset, - grad.length) for grad in self.swapped_gradients.values() - ] + return [swap_buffer.narrow(0, grad.offset, grad.length) for grad in self.swapped_gradients.values()] def get_swap_gradient_paths(self): return [grad.path for grad in self.swapped_gradients.values()] @@ -116,24 +109,15 @@ SWAP_OUT_GRADIENT_TIMER = 'swap_out_gradient' class OptimizerSwapper(object): - def __init__(self, - swap_config, - aio_config, - base_folder, - optimizer, - largest_numel, - device, - dtype, - timers): + + def __init__(self, swap_config, aio_config, base_folder, optimizer, largest_numel, device, dtype, timers): self.swap_config = swap_config self.aio_config = aio_config # NVMe swap management self.swap_params_info = {} self.swap_element_size = torch.tensor([], dtype=dtype).element_size() - self.swap_folder = os.path.join(base_folder, - 'optimizer', - f'rank{dist.get_rank()}') + self.swap_folder = os.path.join(base_folder, 'optimizer', f'rank{dist.get_rank()}') os.makedirs(self.swap_folder, exist_ok=True) self.optimizer = optimizer @@ -191,11 +175,7 @@ class OptimizerSwapper(object): self.timer_names.add(SWAP_OUT_GRADIENT_TIMER) self.timer_names.update(gradient_swapper.get_timer_names()) - def _swap_out_gradients(self, - parameter, - gradient_offsets, - gradient_tensors, - gradient_swapper): + def _swap_out_gradients(self, parameter, gradient_offsets, gradient_tensors, gradient_swapper): if not id(parameter) in self.swap_params_info.keys(): return @@ -205,10 +185,8 @@ class OptimizerSwapper(object): swappable_offsets = [] swappable_lengths = [] - aligned_gradients, aligned_offsets = self._adjust_for_misaligned_lengths( - tensors=gradient_tensors, - offsets=gradient_offsets - ) + aligned_gradients, aligned_offsets = self._adjust_for_misaligned_lengths(tensors=gradient_tensors, + offsets=gradient_offsets) self._start_timer(SWAP_OUT_GRADIENT_TIMER) for tensor, offset in zip(aligned_gradients, aligned_offsets): @@ -222,38 +200,26 @@ class OptimizerSwapper(object): if len(swappable_tensors) > 0: if not gradient_swapper.has_buffers(): - pinned_buffers = self.swap_buffer_manager.allocate_all( - num_elems=self.largest_numel, - dtype=self.dtype) + pinned_buffers = self.swap_buffer_manager.allocate_all(num_elems=self.largest_numel, dtype=self.dtype) gradient_swapper.add_buffers(pinned_buffers) - swappable_paths = swap_info.get_or_create_gradient_paths( - swappable_offsets, - swappable_lengths) + swappable_paths = swap_info.get_or_create_gradient_paths(swappable_offsets, swappable_lengths) - gradient_swapper.swap_out_tensors(tensor_list=swappable_tensors, - path_list=swappable_paths) + gradient_swapper.swap_out_tensors(tensor_list=swappable_tensors, path_list=swappable_paths) self._stop_timer(SWAP_OUT_GRADIENT_TIMER) self.timer_names.add(SWAP_OUT_GRADIENT_TIMER) - def _initialize_from_swapped_fp16_params(self, - aio_handle, - fp16_partitions_info, - fp16_num_elems, - fp16_pinned_buffers, - fp32_parameters): + def _initialize_from_swapped_fp16_params(self, aio_handle, fp16_partitions_info, fp16_num_elems, + fp16_pinned_buffers, fp32_parameters): assert len(fp32_parameters) == len(fp16_partitions_info) assert len(fp32_parameters) == len(fp16_num_elems) assert all([buffer.is_pinned() for buffer in fp16_pinned_buffers]) - 
fp32_swap_paths = self._get_swap_paths(parameters=fp32_parameters, - num_elems=fp16_num_elems) + fp32_swap_paths = self._get_swap_paths(parameters=fp32_parameters, num_elems=fp16_num_elems) - fp32_pinned_buffers = self.swap_buffer_manager.allocate_all( - num_elems=self.largest_numel, - dtype=self.dtype) + fp32_pinned_buffers = self.swap_buffer_manager.allocate_all(num_elems=self.largest_numel, dtype=self.dtype) fp16_buffer_numel = [buf.numel() for buf in fp16_pinned_buffers] assert all([numel >= self.largest_numel for numel in fp16_buffer_numel]), \ @@ -264,11 +230,10 @@ class OptimizerSwapper(object): curr_index = 0 while curr_index < len(fp32_parameters): - fp16_pinned_tensors = self._swap_in_fp16_params( - aio_handle=aio_handle, - fp16_num_elems=fp16_num_elems[curr_index:], - fp16_partitions_info=fp16_partitions_info[curr_index:], - fp16_swap_buffers=fp16_swap_buffers) + fp16_pinned_tensors = self._swap_in_fp16_params(aio_handle=aio_handle, + fp16_num_elems=fp16_num_elems[curr_index:], + fp16_partitions_info=fp16_partitions_info[curr_index:], + fp16_swap_buffers=fp16_swap_buffers) if dist.get_rank() == 0 and SWAPPER_DEBUG_MODE: for i, tensor in enumerate(fp16_pinned_tensors): @@ -277,11 +242,10 @@ class OptimizerSwapper(object): f'swap_in_fp16_param: fp32_id = {id(fp32_parameters[true_index])} index = {true_index} orig_num_elem = {fp16_num_elems[true_index]}, swap_num_elem = {fp16_pinned_tensors[i].numel()}' ) - swap_out_count = self._swap_out_fp16_params( - aio_handle=aio_handle, - fp32_swap_paths=fp32_swap_paths[curr_index:], - fp32_swap_buffers=fp32_swap_buffers, - fp16_pinned_tensors=fp16_pinned_tensors) + swap_out_count = self._swap_out_fp16_params(aio_handle=aio_handle, + fp32_swap_paths=fp32_swap_paths[curr_index:], + fp32_swap_buffers=fp32_swap_buffers, + fp16_pinned_tensors=fp16_pinned_tensors) assert swap_out_count == len(fp16_pinned_tensors), \ f"{swap_out_count} does not match {len(fp16_pinned_tensors)}" @@ -291,11 +255,7 @@ class OptimizerSwapper(object): self.swap_buffer_manager.free(fp32_pinned_buffers) - def _swap_in_fp16_params(self, - aio_handle, - fp16_num_elems, - fp16_partitions_info, - fp16_swap_buffers): + def _swap_in_fp16_params(self, aio_handle, fp16_num_elems, fp16_partitions_info, fp16_swap_buffers): assert len(fp16_num_elems) > 0 swapped_fp16_tensors = [] @@ -330,11 +290,7 @@ class OptimizerSwapper(object): return swapped_fp16_tensors - def _swap_out_fp16_params(self, - aio_handle, - fp32_swap_paths, - fp32_swap_buffers, - fp16_pinned_tensors): + def _swap_out_fp16_params(self, aio_handle, fp32_swap_paths, fp32_swap_buffers, fp16_pinned_tensors): assert len(fp16_pinned_tensors) <= len(fp32_swap_paths) swap_out_count = 0 @@ -343,11 +299,8 @@ class OptimizerSwapper(object): fp32_swap_buffers.swap_out(aio_handle) fp32_swap_buffers.reset() - pinned_tensor, _ = fp32_swap_buffers.insert_tensor( - fp16_tensor, - fp32_swap_paths[i], - self._io_aligned_numel(fp16_tensor.numel()) - ) + pinned_tensor, _ = fp32_swap_buffers.insert_tensor(fp16_tensor, fp32_swap_paths[i], + self._io_aligned_numel(fp16_tensor.numel())) assert pinned_tensor is not None swap_out_count += 1 @@ -359,15 +312,12 @@ class OptimizerSwapper(object): def _initialize_parameters(self, parameters, src_tensors, aio_handle): assert len(parameters) == len(src_tensors) - swap_paths = self._get_swap_paths(parameters=parameters, - num_elems=[src.numel() for src in src_tensors]) + swap_paths = self._get_swap_paths(parameters=parameters, num_elems=[src.numel() for src in src_tensors]) SWAP_INIT_TIMER = 
"swap_init_write" self._start_timer(SWAP_INIT_TIMER) - pinned_buffers = self.swap_buffer_manager.allocate_all( - num_elems=self.largest_numel, - dtype=self.dtype) + pinned_buffers = self.swap_buffer_manager.allocate_all(num_elems=self.largest_numel, dtype=self.dtype) assert pinned_buffers is not None self._swap_out_unpinned_tensors(aio_handle=aio_handle, @@ -397,11 +347,7 @@ class OptimizerSwapper(object): swap_paths = [info.swap_paths[0] for info in swap_info_list] return swap_paths - def _swap_out_unpinned_tensors(self, - aio_handle, - unpinned_tensors, - dest_paths, - pinned_buffers): + def _swap_out_unpinned_tensors(self, aio_handle, unpinned_tensors, dest_paths, pinned_buffers): swap_buffer_count = len(pinned_buffers) unpinned_tensor_count = len(unpinned_tensors) @@ -441,8 +387,7 @@ class OptimizerSwapper(object): continue # Split into two by making remainder a tensor - aligned_length = (orig_tensor.numel() // - self.numel_alignment) * self.numel_alignment + aligned_length = (orig_tensor.numel() // self.numel_alignment) * self.numel_alignment new_tensors.append(orig_tensor.narrow(0, 0, aligned_length)) new_offsets.append(orig_offset) @@ -489,10 +434,9 @@ class OptimizerSwapper(object): param_id = id(parameter) assert not param_id in self.swap_params_info - self.swap_params_info[param_id] = OptimizerStateSwapInfo( - parameter=parameter, - numel=numel, - base_folder=self.swap_folder) + self.swap_params_info[param_id] = OptimizerStateSwapInfo(parameter=parameter, + numel=numel, + base_folder=self.swap_folder) swap_info = self.swap_params_info[param_id] self._update_param_state_info(swap_info, parameter) diff --git a/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py index 515853529..0e6d48f3a 100644 --- a/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py +++ b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py @@ -25,30 +25,14 @@ SWAP_IN_GRADIENT_TIMER = 'swap_in_gradient' class PartitionedOptimizerSwapper(OptimizerSwapper): - def __init__(self, - swap_config, - aio_config, - base_folder, - optimizer, - largest_numel, - device, - dtype, - timers): - super(PartitionedOptimizerSwapper, - self).__init__(swap_config, - aio_config, - base_folder, - optimizer, - largest_numel, - device, - dtype, - timers) + + def __init__(self, swap_config, aio_config, base_folder, optimizer, largest_numel, device, dtype, timers): + super(PartitionedOptimizerSwapper, self).__init__(swap_config, aio_config, base_folder, optimizer, + largest_numel, device, dtype, timers) aio_op = AsyncIOBuilder().load() - self.aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], - aio_config[AIO_QUEUE_DEPTH], - aio_config[AIO_SINGLE_SUBMIT], - aio_config[AIO_OVERLAP_EVENTS], + self.aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], aio_config[AIO_QUEUE_DEPTH], + aio_config[AIO_SINGLE_SUBMIT], aio_config[AIO_OVERLAP_EVENTS], aio_config[AIO_THREAD_COUNT]) # Overlap swapping out @@ -56,33 +40,21 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): numel_alignment=self.numel_alignment, timers=self.timers) - self.print_exclude_list += [ - 'aio_handle', - 'gradient_swapper', - 'print_exclude_list' - ] + self.print_exclude_list += ['aio_handle', 'gradient_swapper', 'print_exclude_list'] if dist.get_rank() == 0: - print_object(obj=self, - name='PartitionedOptimizerSwapper', - exclude_list=self.print_exclude_list) + print_object(obj=self, name='PartitionedOptimizerSwapper', exclude_list=self.print_exclude_list) def 
initialize_parameters(self, parameters, src_tensors): - self._initialize_parameters(parameters=parameters, - src_tensors=src_tensors, - aio_handle=self.aio_handle) + self._initialize_parameters(parameters=parameters, src_tensors=src_tensors, aio_handle=self.aio_handle) - def initialize_from_swapped_fp16_params(self, - fp16_partitions_info, - fp16_num_elems, - fp16_pinned_buffers, + def initialize_from_swapped_fp16_params(self, fp16_partitions_info, fp16_num_elems, fp16_pinned_buffers, fp32_parameters): - self._initialize_from_swapped_fp16_params( - aio_handle=self.aio_handle, - fp16_partitions_info=fp16_partitions_info, - fp16_num_elems=fp16_num_elems, - fp16_pinned_buffers=fp16_pinned_buffers, - fp32_parameters=fp32_parameters) + self._initialize_from_swapped_fp16_params(aio_handle=self.aio_handle, + fp16_partitions_info=fp16_partitions_info, + fp16_num_elems=fp16_num_elems, + fp16_pinned_buffers=fp16_pinned_buffers, + fp32_parameters=fp32_parameters) def flush_gradients(self): self._flush_gradient_swapper(self.gradient_swapper) @@ -94,8 +66,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): self._flush_gradient_swapper(self.gradient_swapper) - required_buffer_count = len( - swap_info.tensors) + (1 if swap_info.has_gradients() else 0) + required_buffer_count = len(swap_info.tensors) + (1 if swap_info.has_gradients() else 0) aligned_numel = self._io_aligned_numel(swap_info.numel()) pinned_buffers = self.swap_buffer_manager.allocate(num_elems=aligned_numel, count=required_buffer_count, @@ -111,9 +82,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): self.timer_names.add(SWAP_IN_PARAM_TIMER) self._start_timer(SWAP_IN_GRADIENT_TIMER) - self._swap_in_gradients(aio_handle=self.aio_handle, - parameter=parameter, - dest_buffer=pinned_buffers[-1]) + self._swap_in_gradients(aio_handle=self.aio_handle, parameter=parameter, dest_buffer=pinned_buffers[-1]) self._stop_timer(SWAP_IN_GRADIENT_TIMER) self.timer_names.add(SWAP_IN_GRADIENT_TIMER) @@ -125,10 +94,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): self._start_timer(SWAP_OUT_PARAM_TIMER) pinned_tensors, pinned_paths, unpinned_tensors, unpinned_paths = self._separate_pinned_tensors(swap_info) - swap_bytes = sum([ - self._io_aligned_numel(t.numel()) * t.element_size() - for t in swap_info.tensors - ]) + swap_bytes = sum([self._io_aligned_numel(t.numel()) * t.element_size() for t in swap_info.tensors]) WRITE_TIMER = 'swap_submit_write' self._start_timer(WRITE_TIMER) @@ -139,9 +105,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): t.data = torch.Tensor() if len(unpinned_tensors) > 0: - pinned_buffers = self.swap_buffer_manager.allocate_all( - num_elems=self.largest_numel, - dtype=self.dtype) + pinned_buffers = self.swap_buffer_manager.allocate_all(num_elems=self.largest_numel, dtype=self.dtype) self._swap_out_unpinned_tensors(aio_handle=self.aio_handle, unpinned_tensors=unpinned_tensors, dest_paths=unpinned_paths, @@ -176,8 +140,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): assert len(swap_info.tensors) <= len(dest_buffers) - swap_lengths = [self._io_aligned_numel(swap_info.numel())] * len( - swap_info.tensors) + swap_lengths = [self._io_aligned_numel(swap_info.numel())] * len(swap_info.tensors) swap_buffers = get_sized_buffers(dest_buffers, swap_lengths) READ_TIMER = 'swap_submit_read_param' @@ -187,8 +150,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): swap_in_tensors(aio_handle, swap_buffers, swap_info.swap_paths) self._stop_timer(READ_TIMER) - swap_bytes = sum( - [buffer.numel() * 
buffer.element_size() for buffer in swap_buffers]) + swap_bytes = sum([buffer.numel() * buffer.element_size() for buffer in swap_buffers]) self._start_timer(WAIT_TIMER) aio_handle.wait() @@ -223,11 +185,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): def _swap_in_pinned_gradients(self, aio_handle, parameter, gradient_tensor): swap_info = self.swap_params_info[id(parameter)] param_gradients = swap_info.swapped_gradients.values() - swap_buffers = [ - gradient_tensor.narrow(0, - grad.offset, - grad.length) for grad in param_gradients - ] + swap_buffers = [gradient_tensor.narrow(0, grad.offset, grad.length) for grad in param_gradients] swap_paths = [grad.path for grad in param_gradients] SWAP_READ_GRADIENTS = 'swap_submit_read_gradient' SWAP_WAIT_GRADIENTS = 'swap_submit_wait_gradient' @@ -256,5 +214,4 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): self._swap_in_pinned_gradients(aio_handle, parameter, parameter.grad) if swap_info.unswapped_gradients: - self._retrieve_unswapped_grad_partitions(swap_info=swap_info, - dest_buffer=parameter.grad) + self._retrieve_unswapped_grad_partitions(swap_info=swap_info, dest_buffer=parameter.grad) diff --git a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py index e9e134e67..da19a8f7d 100644 --- a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py +++ b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py @@ -33,6 +33,7 @@ class PartitionedParamStatus(Enum): class AsyncPartitionedParameterSwapper(object): + def __init__(self, ds_config, model_dtype): aio_op = AsyncIOBuilder().load(verbose=False) @@ -74,9 +75,7 @@ class AsyncPartitionedParameterSwapper(object): if dist.get_rank() == 0: exclude_list = ['aio_read_handle', 'aio_write_handle', 'buffers'] - print_object(obj=self, - name='AsyncPartitionedParameterSwapper', - exclude_list=exclude_list) + print_object(obj=self, name='AsyncPartitionedParameterSwapper', exclude_list=exclude_list) def available_swap_in_buffers(self): return len(self.available_buffer_ids) @@ -84,9 +83,7 @@ class AsyncPartitionedParameterSwapper(object): def _configure_aio(self, ds_config): self.swap_config = ds_config.zero_config.offload_param torch_dtype_string = str(self.dtype).split(".")[1] - self.swap_folder = os.path.join(self.swap_config.nvme_path, - 'zero_stage_3', - f'{torch_dtype_string}params', + self.swap_folder = os.path.join(self.swap_config.nvme_path, 'zero_stage_3', f'{torch_dtype_string}params', f'rank{dist.get_rank()}') shutil.rmtree(self.swap_folder, ignore_errors=True) os.makedirs(self.swap_folder, exist_ok=True) @@ -101,8 +98,7 @@ class AsyncPartitionedParameterSwapper(object): self.numel_alignment = self.aligned_bytes // self.swap_element_size self.elements_per_buffer = self.swap_config.buffer_size - self.aligned_elements_per_buffer = self._io_aligned_numel( - self.elements_per_buffer) + self.aligned_elements_per_buffer = self._io_aligned_numel(self.elements_per_buffer) self.param_buffer_count = self.swap_config.buffer_count self.available_buffer_ids = [i for i in range(self.param_buffer_count)] @@ -112,17 +108,13 @@ class AsyncPartitionedParameterSwapper(object): dtype=self.dtype, requires_grad=False)) - self.aio_read_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], - self.aio_config[AIO_QUEUE_DEPTH], - self.aio_config[AIO_SINGLE_SUBMIT], - self.aio_config[AIO_OVERLAP_EVENTS], + self.aio_read_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH], + 
self.aio_config[AIO_SINGLE_SUBMIT], self.aio_config[AIO_OVERLAP_EVENTS], self.aio_config[AIO_THREAD_COUNT]) - self.aio_write_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], - self.aio_config[AIO_QUEUE_DEPTH], + self.aio_write_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH], self.aio_config[AIO_SINGLE_SUBMIT], - self.aio_config[AIO_OVERLAP_EVENTS], - self.aio_config[AIO_THREAD_COUNT]) + self.aio_config[AIO_OVERLAP_EVENTS], self.aio_config[AIO_THREAD_COUNT]) self.swap_out_params = [] @@ -147,8 +139,7 @@ class AsyncPartitionedParameterSwapper(object): param_path = self.id_to_path[param_id] else: assert not must_exist, f"Path for param id {param_id} does not exist" - param_path = os.path.join(self.swap_folder, - f'{param_id}_param.tensor.swp') + param_path = os.path.join(self.swap_folder, f'{param_id}_param.tensor.swp') self.id_to_path[param_id] = param_path paths.append(param_path) @@ -177,18 +168,16 @@ class AsyncPartitionedParameterSwapper(object): for param in params: param_id = param.ds_id assert param_id in self.param_id_to_numel.keys(), f" Number of elements in param {param_id} is unknown" - assert param_id not in self.param_id_to_buffer_id.keys(), f"param {param_id} already assigned swap buffer id {self.param_id_to_buffer_id[param_id]}" - assert param_id not in self.param_id_to_swap_buffer.keys(), f"param {param_id} has already been assigned a swap buffer" + assert param_id not in self.param_id_to_buffer_id.keys( + ), f"param {param_id} already assigned swap buffer id {self.param_id_to_buffer_id[param_id]}" + assert param_id not in self.param_id_to_swap_buffer.keys( + ), f"param {param_id} has already been assigned a swap buffer" buffer_id = self.available_buffer_ids.pop() - print_rank_0( - f"param {param.ds_id} is assigned swap in buffer id {buffer_id} ") + print_rank_0(f"param {param.ds_id} is assigned swap in buffer id {buffer_id} ") self.param_id_to_buffer_id[param_id] = buffer_id aligned_swap_numel = self._io_aligned_numel(self.param_id_to_numel[param_id]) - swap_buffer = self.buffers.narrow( - 0, - int(buffer_id * self.aligned_elements_per_buffer), - aligned_swap_numel) + swap_buffer = self.buffers.narrow(0, int(buffer_id * self.aligned_elements_per_buffer), aligned_swap_numel) self.param_id_to_swap_buffer[param_id] = swap_buffer compute_buffer = swap_buffer.narrow(0, 0, self.param_id_to_numel[param_id]) @@ -217,9 +206,7 @@ class AsyncPartitionedParameterSwapper(object): for param, swap_in_buffer in zip(self.inflight_params, self.inflight_swap_in_buffers): param_id = param.ds_id - compute_buffer = swap_in_buffer.narrow(0, - 0, - self.param_id_to_numel[param_id]) + compute_buffer = swap_in_buffer.narrow(0, 0, self.param_id_to_numel[param_id]) param.ds_tensor.data = compute_buffer.data param.ds_tensor.status = PartitionedParamStatus.AVAILABLE @@ -289,7 +276,8 @@ class AsyncPartitionedParameterSwapper(object): #assigns an in memory buffer and swaps in from nvme def swap_in(self, params, async_op=True, swap_in_buffers=None): - assert all([param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE for param in params]), "Some params are already available or in flight" + assert all([param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE + for param in params]), "Some params are already available or in flight" swap_in_paths = self._get_swap_paths(params) if swap_in_buffers is None: @@ -305,7 +293,9 @@ class AsyncPartitionedParameterSwapper(object): f'Num available params: count = {len(self.available_params)}, ids = 
{self.available_params}, numel = {self.available_numel}', force=True) - assert len(swap_in_paths) <= len(self.available_buffer_ids), f"Not enough buffers {len(self.available_buffer_ids)} for swapping {len(swap_in_paths)}" + assert len(swap_in_paths) <= len( + self.available_buffer_ids + ), f"Not enough buffers {len(self.available_buffer_ids)} for swapping {len(swap_in_paths)}" compute_buffers, swap_in_buffers = self._allocate_and_return_buffers_for_swap_in(params) inflight_numel = sum([t.numel() for t in compute_buffers]) else: @@ -322,8 +312,7 @@ class AsyncPartitionedParameterSwapper(object): def swap_into_buffer(self, param, dest_buffer): assert param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE, f"param {param.ds_id} is already available or inflight" - require_swap_buffer = not (dest_buffer.is_pinned() - and self._is_io_aligned(dest_buffer.numel())) + require_swap_buffer = not (dest_buffer.is_pinned() and self._is_io_aligned(dest_buffer.numel())) if require_swap_buffer: assert len(self.available_buffer_ids) > 0, f"No buffer available to swap param {param.ds_id}." @@ -348,17 +337,15 @@ class AsyncPartitionedParameterSwapper(object): def get_buffer(self, param, numel): param_id = param.ds_id - assert self.available_swap_in_buffers() > 0, f"No swap buffers to allocate for fp16 param {param_id} of numel = {numel}" + assert self.available_swap_in_buffers( + ) > 0, f"No swap buffers to allocate for fp16 param {param_id} of numel = {numel}" assert numel < self.elements_per_buffer, f"More elements {numel} than buffer size {self.elements_per_buffer}" self.param_id_to_numel[param_id] = numel buffer_id = self.available_buffer_ids.pop() self.param_id_to_buffer_id[param_id] = buffer_id aligned_swap_numel = self._io_aligned_numel(self.param_id_to_numel[param_id]) - swap_buffer = self.buffers.narrow( - 0, - int(buffer_id * self.aligned_elements_per_buffer), - aligned_swap_numel) + swap_buffer = self.buffers.narrow(0, int(buffer_id * self.aligned_elements_per_buffer), aligned_swap_numel) self.param_id_to_swap_buffer[param_id] = swap_buffer compute_buffer = swap_buffer.narrow(0, 0, self.param_id_to_numel[param_id]) @@ -369,8 +356,7 @@ class AsyncPartitionedParameterSwapper(object): buffers = [] for id in self.available_buffer_ids: buffers.append( - self.buffers.narrow(0, - int(id * self.aligned_elements_per_buffer), + self.buffers.narrow(0, int(id * self.aligned_elements_per_buffer), int(self.aligned_elements_per_buffer))) self.reserved_buffer_ids.append(id) @@ -390,12 +376,9 @@ class AsyncPartitionedParameterSwapper(object): return (numel % self.numel_alignment) == 0 def reserve_partitioned_swap_space(self, partition_num_elems): - aligned_numel = sum( - [self._io_aligned_numel(numel) for numel in partition_num_elems]) + aligned_numel = sum([self._io_aligned_numel(numel) for numel in partition_num_elems]) self.partitioned_swap_buffer = get_accelerator().pin_memory( - torch.zeros(aligned_numel, - device='cpu', - dtype=self.dtype)) + torch.zeros(aligned_numel, device='cpu', dtype=self.dtype)) self.partitioned_swap_pool = SwapBufferPool([self.partitioned_swap_buffer]) def swap_out_partitioned_params(self, dst_fp16_params, src_fp32_params): @@ -408,11 +391,8 @@ class AsyncPartitionedParameterSwapper(object): self.synchronize_writes() self.partitioned_swap_pool.reset() for i, fp32_tensor in enumerate(src_fp32_params): - swap_tensor, _ = self.partitioned_swap_pool.insert_tensor( - fp32_tensor, - fp16_swap_paths[i], - self._io_aligned_numel(fp32_tensor.numel()) - ) + swap_tensor, _ = 
self.partitioned_swap_pool.insert_tensor(fp32_tensor, fp16_swap_paths[i], + self._io_aligned_numel(fp32_tensor.numel())) assert swap_tensor is not None dst_fp16_params[i].ds_tensor.status = PartitionedParamStatus.AVAILABLE diff --git a/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py index 4e101528c..ce38113f4 100644 --- a/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py +++ b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py @@ -16,13 +16,8 @@ from deepspeed.runtime.swap_tensor.optimizer_utils import OptimizerSwapper class OptimizerSwapOp(object): - def __init__(self, - aio_handle, - read_op, - param_info, - allocated_buffers, - state_buffers, - num_ops): + + def __init__(self, aio_handle, read_op, param_info, allocated_buffers, state_buffers, num_ops): self.aio_handle = aio_handle self.read_op = read_op self.param_info = param_info @@ -53,36 +48,18 @@ ASYNC_SWAP_OUT_STATE_TIMER = 'async_swap_out_state' class PipelinedOptimizerSwapper(OptimizerSwapper): - def __init__(self, - swap_config, - aio_config, - base_folder, - optimizer, - largest_numel, - device, - dtype, - timers): - super(PipelinedOptimizerSwapper, - self).__init__(swap_config, - aio_config, - base_folder, - optimizer, - largest_numel, - device, - dtype, - timers) + + def __init__(self, swap_config, aio_config, base_folder, optimizer, largest_numel, device, dtype, timers): + super(PipelinedOptimizerSwapper, self).__init__(swap_config, aio_config, base_folder, optimizer, largest_numel, + device, dtype, timers) aio_op = AsyncIOBuilder().load() - self.write_aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], - aio_config[AIO_QUEUE_DEPTH], - aio_config[AIO_SINGLE_SUBMIT], - aio_config[AIO_OVERLAP_EVENTS], + self.write_aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], aio_config[AIO_QUEUE_DEPTH], + aio_config[AIO_SINGLE_SUBMIT], aio_config[AIO_OVERLAP_EVENTS], aio_config[AIO_THREAD_COUNT]) - self.read_aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], - aio_config[AIO_QUEUE_DEPTH], - aio_config[AIO_SINGLE_SUBMIT], - aio_config[AIO_OVERLAP_EVENTS], + self.read_aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], aio_config[AIO_QUEUE_DEPTH], + aio_config[AIO_SINGLE_SUBMIT], aio_config[AIO_OVERLAP_EVENTS], aio_config[AIO_THREAD_COUNT]) # Overlap gradient swap out @@ -93,42 +70,25 @@ class PipelinedOptimizerSwapper(OptimizerSwapper): self.async_swap_in = swap_config.pipeline_read self.async_swap_out = swap_config.pipeline_write - self.swap_ops = { - SYNC_SWAP_IN: None, - ASYNC_SWAP_IN: None, - SYNC_SWAP_OUT: None, - ASYNC_SWAP_OUT: None - } + self.swap_ops = {SYNC_SWAP_IN: None, ASYNC_SWAP_IN: None, SYNC_SWAP_OUT: None, ASYNC_SWAP_OUT: None} self.print_exclude_list += [ - 'gradient_swapper', - 'read_aio_handle', - 'write_aio_handle', - 'swap_ops', - 'print_exclude_list' + 'gradient_swapper', 'read_aio_handle', 'write_aio_handle', 'swap_ops', 'print_exclude_list' ] if dist.get_rank() == 0: - print_object(obj=self, - name='PipelinedOptimizerSwapper', - exclude_list=self.print_exclude_list) + print_object(obj=self, name='PipelinedOptimizerSwapper', exclude_list=self.print_exclude_list) def initialize_parameters(self, parameters, src_tensors): - self._initialize_parameters(parameters=parameters, - src_tensors=src_tensors, - aio_handle=self.write_aio_handle) + self._initialize_parameters(parameters=parameters, src_tensors=src_tensors, aio_handle=self.write_aio_handle) - def initialize_from_swapped_fp16_params(self, 
- fp16_partitions_info, - fp16_num_elems, - fp16_pinned_buffers, + def initialize_from_swapped_fp16_params(self, fp16_partitions_info, fp16_num_elems, fp16_pinned_buffers, fp32_parameters): - self._initialize_from_swapped_fp16_params( - aio_handle=self.write_aio_handle, - fp16_partitions_info=fp16_partitions_info, - fp16_num_elems=fp16_num_elems, - fp16_pinned_buffers=fp16_pinned_buffers, - fp32_parameters=fp32_parameters) + self._initialize_from_swapped_fp16_params(aio_handle=self.write_aio_handle, + fp16_partitions_info=fp16_partitions_info, + fp16_num_elems=fp16_num_elems, + fp16_pinned_buffers=fp16_pinned_buffers, + fp32_parameters=fp32_parameters) def flush_gradients(self): self._flush_gradient_swapper(self.gradient_swapper) @@ -146,18 +106,16 @@ class PipelinedOptimizerSwapper(OptimizerSwapper): self.swap_ops[SYNC_SWAP_IN] = self.swap_ops[ASYNC_SWAP_IN] self.swap_ops[ASYNC_SWAP_IN] = None else: - self.swap_ops[SYNC_SWAP_IN] = self._swap_in_optimizer_state( - aio_handle=self.read_aio_handle, - parameter=parameter) + self.swap_ops[SYNC_SWAP_IN] = self._swap_in_optimizer_state(aio_handle=self.read_aio_handle, + parameter=parameter) if self.swap_ops[SYNC_SWAP_IN]: self.swap_ops[SYNC_SWAP_IN].wait() if self.async_swap_in and async_parameter is not None: assert self.swap_ops[ASYNC_SWAP_IN] is None - self.swap_ops[ASYNC_SWAP_IN] = self._swap_in_optimizer_state( - aio_handle=self.read_aio_handle, - parameter=async_parameter) + self.swap_ops[ASYNC_SWAP_IN] = self._swap_in_optimizer_state(aio_handle=self.read_aio_handle, + parameter=async_parameter) self._stop_timer(SWAP_IN_STATE_TIMER) self.timer_names.add(SWAP_IN_STATE_TIMER) @@ -209,10 +167,9 @@ class PipelinedOptimizerSwapper(OptimizerSwapper): unpinned_tensors = param_info.get_unpinned_state_tensors() if len(unpinned_tensors) > 0: - new_alloc_buffers = self.swap_buffer_manager.allocate( - num_elems=self._io_aligned_numel(param_info.numel()), - count=len(unpinned_tensors), - dtype=param_info.dtype()) + new_alloc_buffers = self.swap_buffer_manager.allocate(num_elems=self._io_aligned_numel(param_info.numel()), + count=len(unpinned_tensors), + dtype=param_info.dtype()) assert new_alloc_buffers is not None allocated_buffers += new_alloc_buffers @@ -241,13 +198,11 @@ class PipelinedOptimizerSwapper(OptimizerSwapper): if param_info is None: return None - required_buffer_count = len( - param_info.tensors) + (1 if param_info.has_gradients() else 0) + required_buffer_count = len(param_info.tensors) + (1 if param_info.has_gradients() else 0) aligned_numel = self._io_aligned_numel(param_info.numel()) - allocated_buffers = self.swap_buffer_manager.allocate( - num_elems=aligned_numel, - count=required_buffer_count, - dtype=parameter.dtype) + allocated_buffers = self.swap_buffer_manager.allocate(num_elems=aligned_numel, + count=required_buffer_count, + dtype=parameter.dtype) assert allocated_buffers is not None, \ f"PipelinedOptimizerSwapper ran out of swap buffers, try increasing 'buffer_count'" @@ -266,8 +221,7 @@ class PipelinedOptimizerSwapper(OptimizerSwapper): swap_in_tensors(aio_handle, swap_buffers, swap_paths) if param_info.unswapped_gradients: - self._retrieve_unswapped_grad_partitions(swap_info=param_info, - dest_buffer=parameter.grad) + self._retrieve_unswapped_grad_partitions(swap_info=param_info, dest_buffer=parameter.grad) swap_in_op = OptimizerSwapOp(aio_handle=aio_handle, param_info=param_info, diff --git a/deepspeed/runtime/swap_tensor/utils.py b/deepspeed/runtime/swap_tensor/utils.py index 1d236bd7f..786609014 100644 --- 
a/deepspeed/runtime/swap_tensor/utils.py +++ b/deepspeed/runtime/swap_tensor/utils.py @@ -34,6 +34,7 @@ def print_object(obj, name, exclude_list=[]): class SwapBuffer(object): + def __init__(self, buffer): self.buffer = buffer self.reset() @@ -92,6 +93,7 @@ class SwapBuffer(object): class SwapBufferPool(object): + def __init__(self, buffers): assert all([buf.is_pinned() for buf in buffers]) self.buffers = [SwapBuffer(buf) for buf in buffers] @@ -175,20 +177,17 @@ class SwapBufferPool(object): class SwapBufferManager(object): + def __init__(self, num_elems, count, dtype): self.num_elems = num_elems self.count = count self.dtype = dtype self.all_buffers = [ - get_accelerator().pin_memory( - torch.zeros(num_elems, - device='cpu', - dtype=dtype)) for _ in range(count) + get_accelerator().pin_memory(torch.zeros(num_elems, device='cpu', dtype=dtype)) for _ in range(count) ] self.free_buffer_index = [i for i in range(count)] self.used_buffer_index = {} - self.gigabytes = (self.all_buffers[0].element_size() * num_elems * count) / (1024 - **3) + self.gigabytes = (self.all_buffers[0].element_size() * num_elems * count) / (1024**3) if dist.get_rank() == 0: exclude_list = ['all_buffers'] @@ -211,9 +210,7 @@ class SwapBufferManager(object): return buffers def allocate_all(self, num_elems, dtype): - return self.allocate(num_elems=num_elems, - count=len(self.free_buffer_index), - dtype=dtype) + return self.allocate(num_elems=num_elems, count=len(self.free_buffer_index), dtype=dtype) def free(self, buffers): buffer_ids = [] diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index 30dad84b1..24b1ff8a4 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -37,6 +37,7 @@ class DummyOptim(): Dummy optimizer presents model parameters as a param group, this is primarily used to allow ZeRO-3 without an optimizer """ + def __init__(self, params): self.param_groups = [] self.param_groups.append({'params': params}) @@ -169,11 +170,8 @@ def move_to_device(item, device, criterion_func): class CheckOverflow(object): '''Checks for overflow in gradient across parallel process''' - def __init__(self, - param_groups=None, - mpu=None, - zero_reduce_scatter=False, - deepspeed=None): + + def __init__(self, param_groups=None, mpu=None, zero_reduce_scatter=False, deepspeed=None): self.mpu = mpu self.params = [] if param_groups else None self.zero_reduce_scatter = zero_reduce_scatter @@ -196,13 +194,9 @@ class CheckOverflow(object): # an overflow due to expert weights, we detect it # Only need to check groups.get_largest_expert_parallel_group() - dist.all_reduce(overflow_gpu, - op=dist.ReduceOp.MAX, - group=groups._get_max_expert_parallel_group()) + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=groups._get_max_expert_parallel_group()) if self.mpu is not None: - dist.all_reduce(overflow_gpu, - op=dist.ReduceOp.MAX, - group=self.mpu.get_model_parallel_group()) + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.mpu.get_model_parallel_group()) elif reduce_overflow: dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX) dist.barrier() @@ -247,31 +241,18 @@ class CheckOverflow(object): if has_moe_params: # All reduce this across expert_parallel_group, so that if an expert # overflows, we detect it here - dist.all_reduce(overflow_gpu, - op=dist.ReduceOp.MAX, - group=groups._get_max_expert_parallel_group()) + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=groups._get_max_expert_parallel_group()) if self.zero_reduce_scatter: - dist.all_reduce(overflow_gpu, - 
op=dist.ReduceOp.MAX, - group=dist.get_world_group()) + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=dist.get_world_group()) elif self.mpu is not None: if self.deepspeed is not None: - using_pipeline = hasattr(self.deepspeed, - 'pipeline_enable_backward_allreduce') - if (using_pipeline - and self.deepspeed.pipeline_enable_backward_allreduce is False - ) or (not using_pipeline - and self.deepspeed.enable_backward_allreduce is False): - dist.all_reduce(overflow_gpu, - op=dist.ReduceOp.MAX, - group=self.mpu.get_data_parallel_group()) - dist.all_reduce(overflow_gpu, - op=dist.ReduceOp.MAX, - group=self.mpu.get_model_parallel_group()) + using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') + if (using_pipeline and self.deepspeed.pipeline_enable_backward_allreduce is False) or ( + not using_pipeline and self.deepspeed.enable_backward_allreduce is False): + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.mpu.get_data_parallel_group()) + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.mpu.get_model_parallel_group()) elif self.deepspeed is not None and self.deepspeed.enable_backward_allreduce is False: - dist.all_reduce(overflow_gpu, - op=dist.ReduceOp.MAX, - group=dist.get_world_group()) + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=dist.get_world_group()) overflow = overflow_gpu[0].item() return bool(overflow) @@ -308,9 +289,7 @@ def _handle_overflow(cpu_sum, x, i): if not math.isfinite(float(v)): t_i = v_i break - logger.info( - f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}" - ) + logger.info(f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}") def get_global_norm(norm_list): @@ -353,16 +332,13 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None): total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) # Take max across all GPUs. if mpu is not None: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.MAX, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item() else: total_norm = 0 for p in parameters: if mpu is not None: - if (mpu.get_model_parallel_rank() - == 0) or is_model_parallel_parameter(p): + if (mpu.get_model_parallel_rank() == 0) or is_model_parallel_parameter(p): param_norm = p.grad.data.norm(norm_type) total_norm += param_norm.item()**norm_type else: @@ -372,9 +348,7 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None): # Sum across all model parallel GPUs. total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) if mpu is not None: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item()**(1. / norm_type) # Need to average total_norm across different GPUs due to the presence of moe params @@ -419,9 +393,7 @@ def get_grad_norm(parameters, norm_type=2, mpu=None): total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) # Take max across all GPUs. if mpu is not None: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.MAX, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item() else: total_norm = 0. 
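Note: the `deepspeed/runtime/utils.py` hunks above and below only reflow a recurring pattern: compute a local gradient norm, then combine it across ranks with `dist.all_reduce` (MAX for the inf-norm, SUM for p-norms) before taking the p-th root. The standalone sketch below illustrates that pattern under stated assumptions: `torch.distributed` is already initialized, plain `torch.distributed` stands in for `deepspeed.comm`, and the name `global_grad_norm` is illustrative rather than a DeepSpeed API.

# Illustrative sketch only; not DeepSpeed's implementation.
import torch
import torch.distributed as dist

def global_grad_norm(parameters, norm_type=2.0, group=None):
    # Collect gradients that exist on this rank.
    grads = [p.grad for p in parameters if p.grad is not None]
    if norm_type == float('inf'):
        # Inf-norm: local max, then MAX across ranks.
        local = max((g.detach().abs().max().item() for g in grads), default=0.0)
        total = torch.tensor([local])  # move to the accelerator if using the NCCL backend
        dist.all_reduce(total, op=dist.ReduceOp.MAX, group=group)
        return total.item()
    # p-norm: local sum of |g|^p, SUM across ranks, then the p-th root.
    local = sum(g.detach().float().norm(norm_type).item() ** norm_type for g in grads)
    total = torch.tensor([local])  # move to the accelerator if using the NCCL backend
    dist.all_reduce(total, op=dist.ReduceOp.SUM, group=group)
    return total.item() ** (1.0 / norm_type)

Passing a model-parallel process group as `group` mirrors the `mpu.get_model_parallel_group()` reductions reformatted in the surrounding hunks.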
@@ -442,13 +414,10 @@ def get_grad_norm(parameters, norm_type=2, mpu=None): # Sum across all model parallel GPUs. total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) if mpu is not None: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item()**(1. / norm_type) - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: + if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: total_norm = -1 return total_norm @@ -488,9 +457,7 @@ def get_grad_zeros(parameters, mpu=None): # Sum across all model parallel GPUs. total_zeros_cuda = get_accelerator().FloatTensor([float(total_zeros)]) if mpu is not None: - dist.all_reduce(total_zeros_cuda, - op=dist.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_zeros_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) total_zeros = total_zeros_cuda[0].item() return total_zeros @@ -522,9 +489,7 @@ def get_weight_norm(parameters, norm_type=2, mpu=None): total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) # Take max across all GPUs. if mpu is not None: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.MAX, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item() else: total_norm = 0. @@ -545,13 +510,10 @@ def get_weight_norm(parameters, norm_type=2, mpu=None): # Sum across all model parallel GPUs. total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) if mpu is not None: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item()**(1. / norm_type) - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: + if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: total_norm = -1 return total_norm @@ -603,11 +565,7 @@ def _lprobe(weights, num_parts, bottleneck): step += chunksize # Find the end index of partition p - parts[p] = bisect_left(weights, - bsum, - lo=step - chunksize, - hi=min(step, - num_items)) + parts[p] = bisect_left(weights, bsum, lo=step - chunksize, hi=min(step, num_items)) # Nothing more to partition, return early if parts[p] == num_items: # See if the current partition is overweight. @@ -655,6 +613,7 @@ def partition_balanced(weights, num_parts, eps=1e-3): class PartitionedTensor: + def __init__(self, tensor, group, partition_meta=None): super().__init__() @@ -696,10 +655,7 @@ class PartitionedTensor: partition = partition_uniform(num_items=tensor.numel(), num_parts=self.num_parts) start = partition[self.rank] length = partition[self.rank + 1] - start - tensor_part = tensor.detach().contiguous().view(-1).narrow( - 0, - start=start, - length=length).clone() + tensor_part = tensor.detach().contiguous().view(-1).narrow(0, start=start, length=length).clone() return tensor_part, partition @@ -709,9 +665,7 @@ class PartitionedTensor: # Allocate the full tensor as a flat buffer. 
full_numel = prod(self.full_size()) - flat_tensor = torch.zeros([full_numel], - dtype=self.local_data.dtype, - device=device) + flat_tensor = torch.zeros([full_numel], dtype=self.local_data.dtype, device=device) # Prepare all-gather buffer partition_tensors = [] @@ -723,9 +677,7 @@ class PartitionedTensor: partition_tensors.append(buf) # Collect the full tensor - dist.all_gather(partition_tensors, - partition_tensors[self.rank], - group=self.group) + dist.all_gather(partition_tensors, partition_tensors[self.rank], group=self.group) for i in range(len(partition_tensors)): partition_tensors[i].data = torch.zeros(1) @@ -798,12 +750,9 @@ def memory_status(msg, print_rank=-1, reset_max=False): max_cached /= 1024**3 print( - f'RANK={rank} MEMSTATS', - msg, - f'device={get_accelerator().current_device_name()} ' + f'RANK={rank} MEMSTATS', msg, f'device={get_accelerator().current_device_name()} ' f'current alloc={new_alloced:0.4f}GB (delta={delta_alloced:0.4f}GB max={max_alloced:0.4f}GB) ' - f'current cache={new_cached:0.4f}GB (delta={delta_cached:0.4f}GB max={max_cached:0.4f}GB)' - ) + f'current cache={new_cached:0.4f}GB (delta={delta_cached:0.4f}GB max={max_cached:0.4f}GB)') def get_ma_status(): @@ -827,16 +776,14 @@ def see_memory_usage(message, force=False): # Print message except when distributed but not rank 0 logger.info(message) - logger.info( - f"MA {round(get_accelerator().memory_allocated() / (1024 * 1024 * 1024),2 )} GB \ + logger.info(f"MA {round(get_accelerator().memory_allocated() / (1024 * 1024 * 1024),2 )} GB \ Max_MA {round(get_accelerator().max_memory_allocated() / (1024 * 1024 * 1024),2)} GB \ CA {round(torch_memory_reserved() / (1024 * 1024 * 1024),2)} GB \ Max_CA {round(torch_max_memory_reserved() / (1024 * 1024 * 1024))} GB ") vm_stats = psutil.virtual_memory() used_GB = round(((vm_stats.total - vm_stats.available) / (1024**3)), 2) - logger.info( - f'CPU Virtual Memory: used = {used_GB} GB, percent = {vm_stats.percent}%') + logger.info(f'CPU Virtual Memory: used = {used_GB} GB, percent = {vm_stats.percent}%') # get the peak memory to report correct data, so reset the counter for the next call get_accelerator().reset_peak_memory_stats() @@ -915,32 +862,22 @@ def get_global_norm_of_tensors(input_tensors, norm_type=2, mpu=None): total_norm = max(t.data.abs().max() for t in input_tensors) total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) if mpu is not None: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.MAX, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item() else: - total_norm = sum( - [t.data.float().norm(norm_type).item()**norm_type for t in input_tensors]) + total_norm = sum([t.data.float().norm(norm_type).item()**norm_type for t in input_tensors]) total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) if mpu is not None: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item()**(1. 
/ norm_type) - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: + if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: total_norm = -1 return total_norm -def clip_tensors_by_global_norm(input_tensors, - max_norm=1.0, - global_norm=None, - mpu=None, - eps=1e-6): +def clip_tensors_by_global_norm(input_tensors, max_norm=1.0, global_norm=None, mpu=None, eps=1e-6): """Clip list of tensors by global norm. Args: input_tensors: List of tensors to be clipped @@ -968,9 +905,7 @@ def align_dense_tensors(tensor_list, alignment): if remaining: elements_to_add = alignment - remaining - pad_tensor = torch.zeros(elements_to_add, - device=tensor_list[0].device, - dtype=tensor_list[0].dtype) + pad_tensor = torch.zeros(elements_to_add, device=tensor_list[0].device, dtype=tensor_list[0].dtype) padded_tensor_list = tensor_list + [pad_tensor] else: padded_tensor_list = tensor_list @@ -978,19 +913,13 @@ def align_dense_tensors(tensor_list, alignment): return padded_tensor_list -def all_gather_dp_groups(partitioned_param_groups, - dp_process_group, - start_alignment_factor, - allgather_bucket_size): +def all_gather_dp_groups(partitioned_param_groups, dp_process_group, start_alignment_factor, allgather_bucket_size): for group_id, partitioned_params in enumerate(partitioned_param_groups): # Sequential AllGather Best of both worlds partition_id = dist.get_rank(group=dp_process_group[group_id]) dp_world_size = dist.get_world_size(group=dp_process_group[group_id]) - num_shards = max( - 1, - partitioned_params[partition_id].numel() * dp_world_size // - allgather_bucket_size) + num_shards = max(1, partitioned_params[partition_id].numel() * dp_world_size // allgather_bucket_size) shard_size = partitioned_params[partition_id].numel() // num_shards @@ -1004,16 +933,11 @@ def all_gather_dp_groups(partitioned_param_groups, for shard_id in range(num_shards): if shard_id == (num_shards - 1): - num_elements = partitioned_params[partition_id].numel( - ) - shard_id * shard_size + num_elements = partitioned_params[partition_id].numel() - shard_id * shard_size shard_list = [] for dp_id in range(dp_world_size): - curr_shard = partitioned_params[dp_id].narrow(0, - shard_id * shard_size, - num_elements).detach() + curr_shard = partitioned_params[dp_id].narrow(0, shard_id * shard_size, num_elements).detach() shard_list.append(curr_shard) - dist.all_gather(shard_list, - shard_list[partition_id], - dp_process_group[group_id]) + dist.all_gather(shard_list, shard_list[partition_id], dp_process_group[group_id]) diff --git a/deepspeed/runtime/weight_quantizer.py b/deepspeed/runtime/weight_quantizer.py index f88a9310b..998be3046 100644 --- a/deepspeed/runtime/weight_quantizer.py +++ b/deepspeed/runtime/weight_quantizer.py @@ -6,6 +6,7 @@ from deepspeed.accelerator import get_accelerator class WeightQuantization(object): + def __init__(self, mlp_extra_grouping=True, mp_size=1): self.dense_scales = [] self.qkv_scales = [] @@ -20,8 +21,7 @@ class WeightQuantization(object): data_scale = [float(1 << quantize_bits) / (2 * mx + 1e-5) for mx in max_d] data_int = [(g * s) for g, s in zip(data_groups, data_scale)] data_int = [ - di.round().clamp(-(1 << (quantize_bits - 1)), - (((1 << (quantize_bits - 1)) - 1))) for di in data_int + di.round().clamp(-(1 << (quantize_bits - 1)), (((1 << (quantize_bits - 1)) - 1))) for di in data_int ] data_int = torch.cat(data_int).reshape(data.shape) data_int = data_int.to(torch.int8) @@ -37,8 +37,7 @@ class WeightQuantization(object): 
(self.mp_size * data.shape[1]) / data.shape[0] == 3) def Quantize(self, value_list, quantize_bits, groups, key, merge_dim=0): - if self.mlp_extra_grouping and self.is_mlp(value_list[0], - merge_count=len(value_list)): + if self.mlp_extra_grouping and self.is_mlp(value_list[0], merge_count=len(value_list)): groups *= 2 q_scale = [] index = 0 @@ -47,11 +46,8 @@ class WeightQuantization(object): q_scale.append(data_scale) value_list[index] = data_int index += 1 - q_scale = ( - 1 / - torch.cat(q_scale, - dim=merge_dim).to( - get_accelerator().current_device_name()).view(-1).unsqueeze(0)) + q_scale = (1 / + torch.cat(q_scale, dim=merge_dim).to(get_accelerator().current_device_name()).view(-1).unsqueeze(0)) if "mlp.dense_4h_to_h.weight" in key: self.mlp4hh_scales.append(q_scale) elif "mlp.dense_h_to_4h.weight" in key: @@ -65,10 +61,7 @@ class WeightQuantization(object): def merge_layer_scales(self, layer_scales): max_dim = max([s.shape[-1] for s in layer_scales]) layer_scales = [ - torch.cat((s, - torch.zeros((1, - max_dim - s.shape[-1]), - device=get_accelerator().current_device_name())), + torch.cat((s, torch.zeros((1, max_dim - s.shape[-1]), device=get_accelerator().current_device_name())), dim=-1) if s.shape[-1] < max_dim else s for s in layer_scales ] return torch.cat(layer_scales).unsqueeze(0) @@ -77,11 +70,7 @@ class WeightQuantization(object): all_scales = [] for dense_scale, qkv_scale, m4hh_scale, mh4h_scale in \ zip(self.dense_scales, self.qkv_scales, self.mlp4hh_scales, self.mlph4h_scales): - all_scales.append( - self.merge_layer_scales([qkv_scale, - dense_scale, - mh4h_scale, - m4hh_scale])) + all_scales.append(self.merge_layer_scales([qkv_scale, dense_scale, mh4h_scale, m4hh_scale])) return torch.cat(all_scales) def merge_scales_split(self, split_count): @@ -95,13 +84,8 @@ class WeightQuantization(object): for s in range(split_count): all_scales[s].append( torch.cat([ - torch.cat((qkv_scale[s], - torch.zeros_like(qkv_scale[s])), - dim=1), - torch.cat((dense_scale[s], - torch.zeros_like(dense_scale[s])), - dim=1), - mh4h_scale[s], + torch.cat((qkv_scale[s], torch.zeros_like(qkv_scale[s])), dim=1), + torch.cat((dense_scale[s], torch.zeros_like(dense_scale[s])), dim=1), mh4h_scale[s], m4hh_scale[s] ]).unsqueeze(0)) for scales_a in all_scales: @@ -139,8 +123,7 @@ class WeightQuantization(object): else: data_quantized, data_scale = self.quantize_data(keys[key], quantize_bits, groups) keys[key].copy_(data_quantized) - layer_scales.append((1 / data_scale.to( - get_accelerator().current_device_name()).view(-1).unsqueeze(0))) + layer_scales.append((1 / data_scale.to(get_accelerator().current_device_name()).view(-1).unsqueeze(0))) all_scales.append(self.merge_layer_scales(layer_scales)) return layer diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index d7ab55218..b4c21e7bd 100644 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -47,13 +47,10 @@ def read_zero_config_deprecated(param_dict): zero_config_dict = {} zero_config_dict["stage"] = 1 if param_dict[ZERO_OPTIMIZATION] else 0 if zero_config_dict["stage"] > 0: - zero_config_dict["allgather_bucket_size"] = get_scalar_param( - param_dict, - "allgather_size", - 5e8) + zero_config_dict["allgather_bucket_size"] = get_scalar_param(param_dict, "allgather_size", 5e8) logger.warning( - "DeepSpeedConfig: this format of ZeRO optimization setup is deprecated. 
Please use the following format: {}" - .format(ZERO_FORMAT)) + "DeepSpeedConfig: this format of ZeRO optimization setup is deprecated. Please use the following format: {}". + format(ZERO_FORMAT)) return zero_config_dict @@ -161,9 +158,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): None, deprecated=True, new_param="offload_param", - new_param_fn=( - lambda val: DeepSpeedZeroOffloadParamConfig(device=OffloadDeviceEnum.cpu) - if val else None), + new_param_fn=(lambda val: DeepSpeedZeroOffloadParamConfig(device=OffloadDeviceEnum.cpu) if val else None), ) """ Deprecated, please use ``offload_param`` """ @@ -179,31 +174,24 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): None, deprecated=True, new_param="offload_optimizer", - new_param_fn=( - lambda val: DeepSpeedZeroOffloadOptimizerConfig(device=OffloadDeviceEnum.cpu) - if val else None), + new_param_fn=(lambda val: DeepSpeedZeroOffloadOptimizerConfig(device=OffloadDeviceEnum.cpu) if val else None), ) """ Deprecated, please use ``offload_optimizer`` """ - prefetch_bucket_size: int = Field(pp_int(5e7), - ge=0, - alias="stage3_prefetch_bucket_size") + prefetch_bucket_size: int = Field(pp_int(5e7), ge=0, alias="stage3_prefetch_bucket_size") """ Maximum number of parameter elements to fetch ahead of use. Used by ZeRO3, ZeRO3-Offload, ZeRO-Infinity, and ZeRO-Inference. """ - param_persistence_threshold: int = Field(pp_int(1e5), - ge=0, - alias="stage3_param_persistence_threshold") + param_persistence_threshold: int = Field(pp_int(1e5), ge=0, alias="stage3_param_persistence_threshold") """ Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially latency-bound messages). """ - model_persistence_threshold: int = Field(pp_int(sys.maxsize, - "sys.maxsize"), + model_persistence_threshold: int = Field(pp_int(sys.maxsize, "sys.maxsize"), ge=0, alias="stage3_model_persistence_threshold") """ @@ -213,9 +201,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): ZeRO3-Offload, ZeRO-Infinity and ZeRO-Inference. """ - max_live_parameters: int = Field(pp_int(1e9), - ge=0, - alias="stage3_max_live_parameters") + max_live_parameters: int = Field(pp_int(1e9), ge=0, alias="stage3_max_live_parameters") """ The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but perform more communication. @@ -227,9 +213,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): parameters. Smaller values use less memory, but perform more communication. """ - gather_16bit_weights_on_model_save: bool = Field( - False, - alias="stage3_gather_16bit_weights_on_model_save") + gather_16bit_weights_on_model_save: bool = Field(False, alias="stage3_gather_16bit_weights_on_model_save") """ Consolidate the weights before saving the model by ``save_16bit_model()``. Since the weights are partitioned across GPUs, they aren’t part of @@ -237,10 +221,9 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): this option is enabled and then saves the fp16 model weights. 
""" - stage3_gather_fp16_weights_on_model_save: bool = Field( - False, - deprecated=True, - new_param="gather_16bit_weights_on_model_save") + stage3_gather_fp16_weights_on_model_save: bool = Field(False, + deprecated=True, + new_param="gather_16bit_weights_on_model_save") """ Deprecated, please use ``gather_16bit_weights_on_model_save`` """ ignore_unused_parameters: bool = True @@ -270,8 +253,6 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): @validator("overlap_comm") def overlap_comm_valid(cls, field_value, values): if field_value is None: - assert ( - "stage" in values - ), "DeepSpeedZeroConfig: 'stage' must be defined before 'overlap_comm'" + assert ("stage" in values), "DeepSpeedZeroConfig: 'stage' must be defined before 'overlap_comm'" field_value = values["stage"] == ZeroStageEnum.weights return field_value diff --git a/deepspeed/runtime/zero/contiguous_memory_allocator.py b/deepspeed/runtime/zero/contiguous_memory_allocator.py index 46e55787e..7ce495262 100644 --- a/deepspeed/runtime/zero/contiguous_memory_allocator.py +++ b/deepspeed/runtime/zero/contiguous_memory_allocator.py @@ -11,6 +11,7 @@ def print_rank_0(message): class ContiguousMemoryAllocator(object): + def __init__(self, size, dtype, device): self.buffer = torch.zeros(size, dtype=dtype, device=device) @@ -98,8 +99,7 @@ class ContiguousMemoryAllocator(object): self._unassign_params(tensor_id) self.total_free += tensor_size print_rank_0( - f"Free before release {free_before}. Released {tensor.numel()}. Total free after {self.total_free}." - ) + f"Free before release {free_before}. Released {tensor.numel()}. Total free after {self.total_free}.") assert self.total_free - tensor_size == free_before, "Release bookkeeping error" def release_tensor_with_id(self, tensor_id): @@ -111,8 +111,7 @@ class ContiguousMemoryAllocator(object): self._unassign_params(tensor_id) self.total_free += tensor_size print_rank_0( - f"Free before release {free_before}. Released {tensor.numel()}. Total free after {self.total_free}." - ) + f"Free before release {free_before}. Released {tensor.numel()}. 
Total free after {self.total_free}.") assert self.total_free - tensor_size == free_before, "Release bookkeeping error" #shows the current memory allocation at specified resolution @@ -136,9 +135,7 @@ class ContiguousMemoryAllocator(object): def _reset_param_data(self): for id, tensor in self.tensor_map.items(): for param in self.id_to_params[id]: - param.data = tensor.narrow(0, - 0, - param.numel()).view(param.data.shape).data + param.data = tensor.narrow(0, 0, param.numel()).view(param.data.shape).data def _unassign_params(self, tensor_id): if tensor_id in self.id_to_params.keys(): diff --git a/deepspeed/runtime/zero/linear.py b/deepspeed/runtime/zero/linear.py index 1ef43d85d..2b346775c 100644 --- a/deepspeed/runtime/zero/linear.py +++ b/deepspeed/runtime/zero/linear.py @@ -99,9 +99,7 @@ class LinearFunctionForZeroStage3(torch.autograd.Function): dim = grad_output.dim() if dim > 2: grad_weight = grad_output.reshape(-1, - grad_output.shape[-1]).t().matmul( - input.reshape(-1, - input.shape[-1])) + grad_output.shape[-1]).t().matmul(input.reshape(-1, input.shape[-1])) else: grad_weight = grad_output.t().matmul(input) #print(f"Computed grad weight grad_weight {grad_weight.shape}") @@ -184,7 +182,5 @@ class LinearModuleForZeroStage3(Module): return LinearFunctionForZeroStage3.apply(input, self.weight, self.bias) def extra_repr(self) -> str: - return 'in_features={}, out_features={}, bias={}'.format( - self.in_features, - self.out_features, - self.bias is not None) + return 'in_features={}, out_features={}, bias={}'.format(self.in_features, self.out_features, self.bias + is not None) diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py index 7ea76c681..ff2e8b073 100644 --- a/deepspeed/runtime/zero/offload_config.py +++ b/deepspeed/runtime/zero/offload_config.py @@ -88,6 +88,7 @@ class DeepSpeedZeroOffloadOptimizerConfig(DeepSpeedConfigModel): fast_init: bool = False """ Enable fast optimizer initialization when offloading to NVMe. 
""" + @validator("pipeline_read", "pipeline_write", always=True) def set_pipeline(cls, field_value, values): values["pipeline"] = field_value or values.get("pipeline", False) diff --git a/deepspeed/runtime/zero/parameter_offload.py b/deepspeed/runtime/zero/parameter_offload.py index 060e301cd..0aab8137d 100644 --- a/deepspeed/runtime/zero/parameter_offload.py +++ b/deepspeed/runtime/zero/parameter_offload.py @@ -31,19 +31,13 @@ def _apply_to_tensors_only(module, functional, backward_function, outputs): if isinstance(outputs, (tuple, list)): touched_outputs = [] for output in outputs: - touched_output = _apply_to_tensors_only(module, - functional, - backward_function, - output) + touched_output = _apply_to_tensors_only(module, functional, backward_function, output) touched_outputs.append(touched_output) return outputs.__class__(touched_outputs) elif isinstance(outputs, dict): # apply inplace to avoid recreating dict inherited objects for key in outputs.keys(): - outputs[key] = _apply_to_tensors_only(module, - functional, - backward_function, - outputs[key]) + outputs[key] = _apply_to_tensors_only(module, functional, backward_function, outputs[key]) return outputs elif isinstance(outputs, torch.Tensor): @@ -67,18 +61,12 @@ def _apply_to_tensors_only(module, functional, backward_function, outputs): #for each tensor in outputs run the forward_function and register backward_function as hook -def _apply_forward_and_backward_to_tensors_only(module, - forward_function, - backward_function, - outputs): +def _apply_forward_and_backward_to_tensors_only(module, forward_function, backward_function, outputs): if type(outputs) is tuple: touched_outputs = [] for output in outputs: - touched_output = _apply_forward_and_backward_to_tensors_only( - module, - forward_function, - backward_function, - output) + touched_output = _apply_forward_and_backward_to_tensors_only(module, forward_function, backward_function, + output) touched_outputs.append(touched_output) return tuple(touched_outputs) elif type(outputs) is torch.Tensor: @@ -91,6 +79,7 @@ def _apply_forward_and_backward_to_tensors_only(module, class ZeROOrderedDict(OrderedDict): + def __init__(self, parent_module, *args, **kwargs): """A replacement for ``collections.OrderedDict`` to detect external ZeRO params. 
@@ -113,9 +102,7 @@ class ZeROOrderedDict(OrderedDict): if self._parent_module._parameters._in_forward: register_external_parameter(FWD_MODULE_STACK[-1], param) param.all_gather() - print_rank_0( - f'Registering external parameter from getter {key} ds_id = {param.ds_id}', - force=False) + print_rank_0(f'Registering external parameter from getter {key} ds_id = {param.ds_id}', force=False) return param @@ -133,6 +120,7 @@ def _inject_parameters(module, cls): class PreBackwardFunction(torch.autograd.Function): + @staticmethod def forward(ctx, module, pre_backward_function, outputs): ctx.module = module @@ -152,6 +140,7 @@ class PreBackwardFunction(torch.autograd.Function): class PostBackwardFunction(torch.autograd.Function): + @staticmethod def forward(ctx, module, pre_backward_function, output): ctx.module = module @@ -179,6 +168,7 @@ class PostBackwardFunction(torch.autograd.Function): class DeepSpeedZeRoOffload(object): + def __init__(self, module, timers, @@ -194,8 +184,7 @@ class DeepSpeedZeRoOffload(object): see_memory_usage("DeepSpeedZeRoOffload initialize [begin]", force=True) - print_rank_0(f"initialized {__class__.__name__} with args: {locals()}", - force=False) + print_rank_0(f"initialized {__class__.__name__} with args: {locals()}", force=False) self.module = module self.dtype = list(module.parameters())[0].dtype @@ -215,16 +204,14 @@ class DeepSpeedZeRoOffload(object): self.param_numel_persistence_threshold = int(param_persistence_threshold) self.model_persistence_threshold = int(model_persistence_threshold) - self.persistent_parameters = self.mark_persistent_parameters( - self.param_numel_persistence_threshold, - self.model_persistence_threshold) + self.persistent_parameters = self.mark_persistent_parameters(self.param_numel_persistence_threshold, + self.model_persistence_threshold) self.param_coordinators = {} self._prefetch_bucket_sz = int(prefetch_bucket_size) self._max_reuse_distance_in_numel = int(max_reuse_distance) self._max_available_parameters_in_numel = int(max_live_parameters) - self.__allgather_stream = get_accelerator().Stream( - ) if overlap_comm else get_accelerator().default_stream() + self.__allgather_stream = get_accelerator().Stream() if overlap_comm else get_accelerator().default_stream() self.forward_hooks = [] self.backward_hooks = [] @@ -240,8 +227,7 @@ class DeepSpeedZeRoOffload(object): """Partitioning Parameters that were not partitioned usually if parameters of modules whose input parameters do not require grad computation do not trigger post call and will therefore will remain unpartitioned""" - self.get_param_coordinator(training=self.module.training).release_and_reset_all( - self.module) + self.get_param_coordinator(training=self.module.training).release_and_reset_all(self.module) for param in iter_params(self.module, recurse=True): if param.ds_status != ZeroParamStatus.NOT_AVAILABLE: raise RuntimeError(f"{param.ds_summary()} expected to be released") @@ -251,8 +237,7 @@ class DeepSpeedZeRoOffload(object): self.param_coordinators[training] = PartitionedParameterCoordinator( prefetch_bucket_sz=self._prefetch_bucket_sz, max_reuse_distance_in_numel=self._max_reuse_distance_in_numel, - max_available_parameters_in_numel=self. 
- _max_available_parameters_in_numel, + max_available_parameters_in_numel=self._max_available_parameters_in_numel, allgather_stream=self.__allgather_stream, prefetch_nvme=self.offload_device == OffloadDeviceEnum.nvme, ) @@ -294,9 +279,8 @@ class DeepSpeedZeRoOffload(object): for hook in self.backward_hooks: hook.remove() - print_rank_0( - f'Deleted module hooks: forward = {num_forward_hooks}, backward = {num_backward_hooks}', - force=False) + print_rank_0(f'Deleted module hooks: forward = {num_forward_hooks}, backward = {num_backward_hooks}', + force=False) def setup_zero_stage3_hooks(self): self.hierarchy = 0 @@ -368,19 +352,14 @@ class DeepSpeedZeRoOffload(object): outputs.append(val) output = outputs - for item in filter( - lambda item: is_zero_param(item) or hasattr(item, - 'ds_param_alias'), - output): + for item in filter(lambda item: is_zero_param(item) or hasattr(item, 'ds_param_alias'), output): key = id(item) if hasattr(item, 'ds_id') else id(item.ds_param_alias) - actual_external_param = item if hasattr(item, - 'ds_id') else item.ds_param_alias + actual_external_param = item if hasattr(item, 'ds_id') else item.ds_param_alias if not any(key in m._external_params for m in FWD_MODULE_STACK): actual_external_param.is_external_param = True module_to_register = FWD_MODULE_STACK[-1] - register_external_parameter(module_to_register, - actual_external_param) + register_external_parameter(module_to_register, actual_external_param) print_rank_0( f'Registering dangling parameter for module {module_to_register.__class__.__name__}, ds_id = {actual_external_param.ds_id}.', force=False) @@ -398,6 +377,7 @@ class DeepSpeedZeRoOffload(object): self.post_sub_module_forward_function(module) def _pre_backward_module_hook(module, inputs, output): + @instrument_w_nvtx def _run_before_backward_function(sub_module): # some models (e.g. 
Albert) may run multiple forwards on the same layer in a loop @@ -409,10 +389,7 @@ class DeepSpeedZeRoOffload(object): sub_module.applied_pre_backward_ref_cnt -= 1 #print(f"COUNTER after: {sub_module.applied_pre_backward_ref_cnt}") - return _apply_to_tensors_only(module, - PreBackwardFunction, - _run_before_backward_function, - output) + return _apply_to_tensors_only(module, PreBackwardFunction, _run_before_backward_function, output) #This is an alternate to doing _post_backward_module_hook #it uses tensor.register_hook instead of using torch.autograd.Function @@ -431,11 +408,8 @@ class DeepSpeedZeRoOffload(object): if input.requires_grad: module.ds_grads_remaining += 1 - return _apply_forward_and_backward_to_tensors_only( - module, - _run_before_forward_function, - _run_after_backward_hook, - inputs) + return _apply_forward_and_backward_to_tensors_only(module, _run_before_forward_function, + _run_after_backward_hook, inputs) def _post_backward_module_hook(module, inputs): module.ds_grads_remaining = 0 @@ -445,31 +419,23 @@ class DeepSpeedZeRoOffload(object): if sub_module.ds_grads_remaining == 0: self.post_sub_module_backward_function(sub_module) - return _apply_to_tensors_only(module, - PostBackwardFunction, - _run_after_backward_function, - inputs) + return _apply_to_tensors_only(module, PostBackwardFunction, _run_after_backward_function, inputs) # Pre forward hook - self.forward_hooks.append( - module.register_forward_pre_hook(_pre_forward_module_hook)) + self.forward_hooks.append(module.register_forward_pre_hook(_pre_forward_module_hook)) # Post forward hook - self.forward_hooks.append( - module.register_forward_hook(_post_forward_module_hook)) + self.forward_hooks.append(module.register_forward_hook(_post_forward_module_hook)) # Pre backward hook - self.backward_hooks.append( - module.register_forward_hook(_pre_backward_module_hook)) + self.backward_hooks.append(module.register_forward_hook(_pre_backward_module_hook)) # post backward hook - self.backward_hooks.append( - module.register_forward_pre_hook(_post_backward_module_hook)) + self.backward_hooks.append(module.register_forward_pre_hook(_post_backward_module_hook)) @torch.no_grad() def pre_sub_module_forward_function(self, sub_module): - see_memory_usage(f"Before sub module function {sub_module.__class__.__name__}", - force=False) + see_memory_usage(f"Before sub module function {sub_module.__class__.__name__}", force=False) global FWD_MODULE_STACK FWD_MODULE_STACK.append(sub_module) @@ -480,22 +446,18 @@ class DeepSpeedZeRoOffload(object): param_coordinator.record_module(sub_module) param_coordinator.fetch_sub_module(sub_module) - see_memory_usage( - f"Before sub module function {sub_module.__class__.__name__} after fetch", - force=False) + see_memory_usage(f"Before sub module function {sub_module.__class__.__name__} after fetch", force=False) @torch.no_grad() def post_sub_module_forward_function(self, sub_module): - see_memory_usage( - f"After sub module function {sub_module.__class__.__name__} {sub_module.id} before release", - force=False) + see_memory_usage(f"After sub module function {sub_module.__class__.__name__} {sub_module.id} before release", + force=False) param_coordinator = self.get_param_coordinator(training=sub_module.training) param_coordinator.release_sub_module(sub_module) - see_memory_usage( - f"After sub module function {sub_module.__class__.__name__} {sub_module.id} after release", - force=False) + see_memory_usage(f"After sub module function {sub_module.__class__.__name__} {sub_module.id} after release", + 
force=False) @torch.no_grad() def pre_sub_module_backward_function(self, sub_module): @@ -511,8 +473,7 @@ class DeepSpeedZeRoOffload(object): f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} before release", force=False) - self.get_param_coordinator( - training=sub_module.training).release_sub_module(sub_module) + self.get_param_coordinator(training=sub_module.training).release_sub_module(sub_module) see_memory_usage( f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} after release", diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 39cfb7f90..984515e48 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -26,11 +26,8 @@ from deepspeed.runtime.zero.utils import assert_ints_same_as_other_ranks from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.utils import instrument_w_nvtx, logger from deepspeed.comm.comm import init_distributed -from deepspeed.utils.debug import (debug_param2name_id_shape, - debug_param2name_id_shape_device, - debug_module2name, - debug_param2name_id, - debug_param2name_id_shape_status) +from deepspeed.utils.debug import (debug_param2name_id_shape, debug_param2name_id_shape_device, debug_module2name, + debug_param2name_id, debug_param2name_id_shape_status) from deepspeed.accelerator import get_accelerator from ..swap_tensor.partitioned_param_swapper import AsyncPartitionedParameterSwapper, PartitionedParamStatus @@ -40,10 +37,7 @@ zero_init_enabled = False def _dist_allgather_fn(input_tensor: Tensor, output_tensor: Tensor, group=None): - return instrument_w_nvtx(dist.allgather_fn)(output_tensor, - input_tensor, - group=group, - async_op=True) + return instrument_w_nvtx(dist.allgather_fn)(output_tensor, input_tensor, group=group, async_op=True) def print_rank_0(message, debug=False, force=False): @@ -76,9 +70,7 @@ def _init_external_params(module): return self._external_params.items() def all_parameters(self): - return itertools.chain(self.named_parameters(self, - recurse=False), - external_parameters(self)) + return itertools.chain(self.named_parameters(self, recurse=False), external_parameters(self)) module.ds_external_parameters = types.MethodType(external_parameters, module) module.all_parameters = types.MethodType(all_parameters, module) @@ -150,8 +142,7 @@ def unregister_external_parameter(module, parameter): if not isinstance(parameter, torch.nn.Parameter): raise RuntimeError('Parameter is not a torch.nn.Parameter') - if not hasattr(module, - '_external_params') or id(parameter) not in module._external_params: + if not hasattr(module, '_external_params') or id(parameter) not in module._external_params: raise RuntimeError('Parameter is not a registered external parameter of module.') key = id(parameter) @@ -188,12 +179,11 @@ _orig_torch_ones = torch.ones _orig_torch_full = torch.full -def zero_wrapper_for_fp_tensor_constructor(fn: Callable, - target_fp_dtype: torch.dtype) -> Callable: +def zero_wrapper_for_fp_tensor_constructor(fn: Callable, target_fp_dtype: torch.dtype) -> Callable: + def wrapped_fn(*args, **kwargs) -> Tensor: if kwargs.get("device", None) is None: - kwargs['device'] = torch.device(get_accelerator().device_name( - os.environ["LOCAL_RANK"])) + kwargs['device'] = torch.device(get_accelerator().device_name(os.environ["LOCAL_RANK"])) tensor: Tensor = fn(*args, **kwargs) if tensor.is_floating_point(): tensor = 
tensor.to(target_fp_dtype) @@ -204,6 +194,7 @@ def zero_wrapper_for_fp_tensor_constructor(fn: Callable, def get_new_tensor_fn_for_dtype(dtype: torch.dtype) -> Callable: + def new_tensor(cls, *args) -> Tensor: device = torch.device(get_accelerator().device_name(os.environ["LOCAL_RANK"])) tensor = _orig_torch_empty(0, device=device).new_empty(*args) @@ -250,15 +241,14 @@ empty_buffers = {} # Inserts _post_init_method at the end of init method # for all sub classes of torch.nn.Module class InsertPostInitMethodToModuleSubClasses(object): - def __init__(self, - enabled=True, - mem_efficient_linear=True, - ds_config=None, - dtype=None): + + def __init__(self, enabled=True, mem_efficient_linear=True, ds_config=None, dtype=None): self.mem_efficient_linear = mem_efficient_linear self.enabled = enabled self._set_dtype(ds_config, dtype) - assert self.dtype in [torch.half, torch.bfloat16, torch.float], f"Invalid data type {self.dtype}, allowed values are [torch.half, torch.bfloat16, torch.float]" + assert self.dtype in [ + torch.half, torch.bfloat16, torch.float + ], f"Invalid data type {self.dtype}, allowed values are [torch.half, torch.bfloat16, torch.float]" def __enter__(self): global zero_init_enabled @@ -280,6 +270,7 @@ class InsertPostInitMethodToModuleSubClasses(object): to get around this issue, we wrap the function passed to Module.apply so that the applied function is applied to child modules correctly. """ + def get_wrapped_fn_to_apply(fn_to_apply: Callable) -> Callable: if hasattr(fn_to_apply, "wrapped"): return fn_to_apply @@ -296,19 +287,14 @@ class InsertPostInitMethodToModuleSubClasses(object): 3. broadcasts root rank's parameters to the other ranks 4. re-partitions the parameters """ - if not all( - is_zero_param(p) - for p in module_to_apply_fn_to.parameters(recurse=False)): - raise RuntimeError( - f"not all parameters for {module_to_apply_fn_to.__class__.__name__}, " - f"were zero params, is it possible that the parameters were " - f"overwritten after they were initialized? " - f"params: {[p for p in module_to_apply_fn_to.parameters(recurse=False)]} " - ) + if not all(is_zero_param(p) for p in module_to_apply_fn_to.parameters(recurse=False)): + raise RuntimeError(f"not all parameters for {module_to_apply_fn_to.__class__.__name__}, " + f"were zero params, is it possible that the parameters were " + f"overwritten after they were initialized? " + f"params: {[p for p in module_to_apply_fn_to.parameters(recurse=False)]} ") params_to_apply_fn_to: Iterable[Parameter] = list( - sorted(module_to_apply_fn_to.parameters(recurse=False), - key=lambda p: p.ds_id)) + sorted(module_to_apply_fn_to.parameters(recurse=False), key=lambda p: p.ds_id)) for param in params_to_apply_fn_to: param.all_gather() @@ -332,6 +318,7 @@ class InsertPostInitMethodToModuleSubClasses(object): return wrapped_apply def partition_after(f): + @functools.wraps(f) def wrapper(module, *args, **kwargs): @@ -343,8 +330,7 @@ class InsertPostInitMethodToModuleSubClasses(object): # custom weights init function. 
So if a parent created the weights param, the child # won't need to gather it in order to tweak it - print_rank_0(f'Before initializing {module.__class__.__name__}', - force=False) + print_rank_0(f'Before initializing {module.__class__.__name__}', force=False) is_child_module = False if not hasattr(module, "_ds_child_entered"): @@ -358,13 +344,10 @@ class InsertPostInitMethodToModuleSubClasses(object): # child's __init__ is done, now we can run a single post_init on the child object delattr(module, "_ds_child_entered") - print_rank_0(f'Running post_init for {module.__class__.__name__}', - force=False) + print_rank_0(f'Running post_init for {module.__class__.__name__}', force=False) self._post_init_method(module) - print_rank_0( - f'After initializing followed by post init for {module.__class__.__name__}', - force=False) + print_rank_0(f'After initializing followed by post init for {module.__class__.__name__}', force=False) return wrapper @@ -387,14 +370,11 @@ class InsertPostInitMethodToModuleSubClasses(object): # Replace .__init__() for future subclasses of torch.nn.Module torch.nn.modules.module.Module.__init_subclass__ = classmethod(_init_subclass) - torch.nn.modules.module.Module.apply = apply_with_gather( - torch.nn.modules.module.Module._old_apply) + torch.nn.modules.module.Module.apply = apply_with_gather(torch.nn.modules.module.Module._old_apply) torch.Tensor.__new__ = get_new_tensor_fn_for_dtype(self.dtype) - torch.empty = zero_wrapper_for_fp_tensor_constructor(_orig_torch_empty, - self.dtype) - torch.zeros = zero_wrapper_for_fp_tensor_constructor(_orig_torch_zeros, - self.dtype) + torch.empty = zero_wrapper_for_fp_tensor_constructor(_orig_torch_empty, self.dtype) + torch.zeros = zero_wrapper_for_fp_tensor_constructor(_orig_torch_zeros, self.dtype) torch.ones = zero_wrapper_for_fp_tensor_constructor(_orig_torch_ones, self.dtype) torch.full = zero_wrapper_for_fp_tensor_constructor(_orig_torch_full, self.dtype) @@ -412,8 +392,7 @@ class InsertPostInitMethodToModuleSubClasses(object): shutdown_init_context() if dist.get_rank() == 0: - logger.info("finished initializing model with %.2fB parameters", - param_count / 1e9) + logger.info("finished initializing model with %.2fB parameters", param_count / 1e9) # Now that we cleaned up the metaclass injection, raise the exception. 
if exc_type is not None: @@ -471,6 +450,7 @@ def shutdown_init_context(): class AllGatherHandle: + def __init__(self, handle, param: Parameter) -> None: if param.ds_status != ZeroParamStatus.INFLIGHT: raise RuntimeError(f"expected param {param.ds_summary()} to be available") @@ -484,6 +464,7 @@ class AllGatherHandle: class AllGatherCoalescedHandle: + def __init__( self, allgather_handle, @@ -499,8 +480,7 @@ class AllGatherCoalescedHandle: for param in self.__params: if param.ds_status != ZeroParamStatus.INFLIGHT: - raise RuntimeError( - f"expected param {param.ds_summary()} to not be available") + raise RuntimeError(f"expected param {param.ds_summary()} to not be available") @instrument_w_nvtx def wait(self) -> None: @@ -518,10 +498,7 @@ class AllGatherCoalescedHandle: param_start = rank * param.ds_tensor.ds_numel if param_start < param.ds_numel: part_to_copy = self.__partitions[rank].narrow( - 0, - param_offset, - min(param.ds_numel - param_start, - param.ds_tensor.ds_numel)) + 0, param_offset, min(param.ds_numel - param_start, param.ds_tensor.ds_numel)) partitions.append(part_to_copy) param.data = instrument_w_nvtx(torch.cat)(partitions).view(param.ds_shape) @@ -651,16 +628,11 @@ class Init(InsertPostInitMethodToModuleSubClasses): if config is not None: config_dict_or_path = config logger.warning( - f'zero.Init: the `config` argument is deprecated. Please use `config_dict_or_path` instead.' - ) + f'zero.Init: the `config` argument is deprecated. Please use `config_dict_or_path` instead.') - _ds_config = deepspeed.runtime.config.DeepSpeedConfig( - config_dict_or_path, - mpu) if config_dict_or_path is not None else None - super().__init__(enabled=enabled, - mem_efficient_linear=mem_efficient_linear, - ds_config=_ds_config, - dtype=dtype) + _ds_config = deepspeed.runtime.config.DeepSpeedConfig(config_dict_or_path, + mpu) if config_dict_or_path is not None else None + super().__init__(enabled=enabled, mem_efficient_linear=mem_efficient_linear, ds_config=_ds_config, dtype=dtype) if not dist.is_initialized(): init_distributed() assert dist.is_initialized(), "Parameters cannot be scattered without initializing deepspeed.comm" @@ -674,8 +646,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): # Local device is the device where the parameters are consumed, must be default device. # It is the device where parameters are fully instantiated using allgather - self.local_device = torch.device(get_accelerator().device_name( - os.environ["LOCAL_RANK"])) + self.local_device = torch.device(get_accelerator().device_name(os.environ["LOCAL_RANK"])) get_accelerator().set_device(self.local_device) if _ds_config is not None and _ds_config.zero_config.offload_param is not None: @@ -686,13 +657,9 @@ class Init(InsertPostInitMethodToModuleSubClasses): # Remote device is the device where parameter partitions are stored # It can be same as local_device or it could be CPU or NVMe. 
- self.remote_device = self.local_device if remote_device in [ - None, - OffloadDeviceEnum.none - ] else remote_device - self.pin_memory = pin_memory if ( - self.remote_device in [OffloadDeviceEnum.cpu, - OffloadDeviceEnum.nvme]) else False + self.remote_device = self.local_device if remote_device in [None, OffloadDeviceEnum.none] else remote_device + self.pin_memory = pin_memory if (self.remote_device in [OffloadDeviceEnum.cpu, OffloadDeviceEnum.nvme + ]) else False # Enable fp16 param swapping to NVMe if self.remote_device == OffloadDeviceEnum.nvme: @@ -709,8 +676,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): if dist.has_allgather_base(): self.use_all_gather_base = True else: - logger.info( - f"_all_gather_base API is not available in torch {torch.__version__}") + logger.info(f"_all_gather_base API is not available in torch {torch.__version__}") def _convert_to_zero_parameters(self, param_list): for param in param_list: @@ -737,9 +703,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): def _post_init_method(self, module): #see_memory_usage(f"Before converting parmas in {module.__class__.__name__}", force=False) print_rank_0(f'Converting Params in {module.__class__.__name__}', force=False) - see_memory_usage( - f"Before converting and partitioning parmas in {module.__class__.__name__}", - force=False) + see_memory_usage(f"Before converting and partitioning parmas in {module.__class__.__name__}", force=False) global param_count for name, param in module.named_parameters(recurse=False): @@ -747,8 +711,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): if not is_zero_param(param): self._convert_to_deepspeed_param(param) print_rank_0( - f"Partitioning param {debug_param2name_id_shape(param)} module={debug_module2name(module)}" - ) + f"Partitioning param {debug_param2name_id_shape(param)} module={debug_module2name(module)}") if get_accelerator().on_accelerator(param): dist.broadcast(param, 0, self.ds_process_group) @@ -806,8 +769,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): return self._all_gather(param_list, async_op=async_op, hierarchy=hierarchy) @instrument_w_nvtx - def all_gather_coalesced(params: Iterable[Parameter], - safe_mode: bool = False) -> AllGatherCoalescedHandle: + def all_gather_coalesced(params: Iterable[Parameter], safe_mode: bool = False) -> AllGatherCoalescedHandle: # fetches from nvme if the partition is not available and in nvme self._ensure_availability_of_partitioned_params(params) @@ -845,37 +807,23 @@ class Init(InsertPostInitMethodToModuleSubClasses): device=get_accelerator().current_device_name(), requires_grad=False, ) - handle = _dist_allgather_fn( - param.ds_tensor.to(get_accelerator().current_device_name()), - param_buffer, - self.ds_process_group) - param.data = param_buffer.narrow(0, - 0, - param.ds_numel).view(param.ds_shape).to( - param.device) + handle = _dist_allgather_fn(param.ds_tensor.to(get_accelerator().current_device_name()), param_buffer, + self.ds_process_group) + param.data = param_buffer.narrow(0, 0, param.ds_numel).view(param.ds_shape).to(param.device) return AllGatherHandle(handle, param) else: partition_sz = sum(p.ds_tensor.ds_numel for p in params) flat_tensor = torch.empty(partition_sz * self.world_size, - dtype=get_only_unique_item(p.dtype - for p in params), + dtype=get_only_unique_item(p.dtype for p in params), device=get_accelerator().current_device_name(), requires_grad=False) partitions: List[Parameter] = [] for i in range(self.world_size): - partitions.append( - flat_tensor.narrow(0, - partition_sz * 
i, - partition_sz)) + partitions.append(flat_tensor.narrow(0, partition_sz * i, partition_sz)) - instrument_w_nvtx(torch.cat)([ - p.ds_tensor.to(get_accelerator().current_device_name()) - for p in params - ], + instrument_w_nvtx(torch.cat)([p.ds_tensor.to(get_accelerator().current_device_name()) for p in params], out=partitions[self.rank]) - handle = _dist_allgather_fn(partitions[self.rank], - flat_tensor, - self.ds_process_group) + handle = _dist_allgather_fn(partitions[self.rank], flat_tensor, self.ds_process_group) return AllGatherCoalescedHandle( allgather_handle=handle, @@ -886,9 +834,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): def partition(param_list=None, hierarchy=0, has_been_updated=False): cls = param - print_rank_0( - f"{'--'*hierarchy}----Partitioning param {debug_param2name_id_shape_device(cls)}" - ) + print_rank_0(f"{'--'*hierarchy}----Partitioning param {debug_param2name_id_shape_device(cls)}") if param_list is None: param_list = [cls] self._partition(param_list, has_been_updated=has_been_updated) @@ -902,22 +848,16 @@ class Init(InsertPostInitMethodToModuleSubClasses): ) self._reduce_scatter_gradients(param_list) - def partition_gradients(param_list=None, - partition_buffers=None, - hierarchy=0, - accumulate=False): + def partition_gradients(param_list=None, partition_buffers=None, hierarchy=0, accumulate=False): cls = param print_rank_0( - f"{'--'*hierarchy}----Partitioning param gradient with id {debug_param2name_id_shape_device(cls)}" - ) + f"{'--'*hierarchy}----Partitioning param gradient with id {debug_param2name_id_shape_device(cls)}") if param_list is None: param_list = [cls] if isinstance(partition_buffers, torch.Tensor): partition_buffers = [partition_buffers] - self._partition_gradients(param_list, - partition_buffers=partition_buffers, - accumulate=accumulate) + self._partition_gradients(param_list, partition_buffers=partition_buffers, accumulate=accumulate) def aligned_size(): return self._aligned_size(param) @@ -950,6 +890,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): self._convert_to_zero_parameters(param_list) def allgather_before(func: Callable) -> Callable: + def wrapped(*args, **kwargs): param.all_gather() return func(*args, **kwargs) @@ -1011,9 +952,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): for param in param_list: if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: if async_op: - handle = self._allgather_param(param, - async_op=async_op, - hierarchy=hierarchy) + handle = self._allgather_param(param, async_op=async_op, hierarchy=hierarchy) param.ds_status = ZeroParamStatus.INFLIGHT # if async_op else ZeroParamStatus.AVAILABLE handles.append(handle) else: @@ -1050,9 +989,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): global reuse_buffers #print_rank_0(f"Param id {param.ds_id} status is {param.ds_status}") if param.ds_status is ZeroParamStatus.AVAILABLE: - print_rank_0( - f"Partitioning param id {param.ds_id} reuse buffers {reuse_buffers}", - force=False) + print_rank_0(f"Partitioning param id {param.ds_id} reuse buffers {reuse_buffers}", force=False) # if reuse_buffers and False: # numel = buffer.numel() # buffer = param.data.view(-1) @@ -1068,18 +1005,13 @@ class Init(InsertPostInitMethodToModuleSubClasses): #param.data = param.ds_tensor.data - see_memory_usage( - f'Before partitioning param {param.ds_id} {param.shape}', - force=False) + see_memory_usage(f'Before partitioning param {param.ds_id} {param.shape}', force=False) # param.data does not store anything meaningful in partitioned state 
free_param(param) - see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', - force=False) + see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', force=False) if param.ds_tensor.final_location == OffloadDeviceEnum.nvme: - print_rank_0( - f"Param {param.ds_id} partition released since it exists in nvme", - force=False) + print_rank_0(f"Param {param.ds_id} partition released since it exists in nvme", force=False) param.nvme_swapper.remove_partition_and_release_buffers([param]) return @@ -1093,23 +1025,17 @@ class Init(InsertPostInitMethodToModuleSubClasses): numel=partition_size): final_location = OffloadDeviceEnum.nvme buffer = self.param_swapper.get_buffer(param, partition_size) - partitioned_tensor = torch.empty(0, - dtype=param.dtype, - device=buffer.device) + partitioned_tensor = torch.empty(0, dtype=param.dtype, device=buffer.device) partitioned_tensor.data = buffer.data - print_rank_0( - f"ID {param.ds_id} Initializing partition for the first time for nvme offload." - ) + print_rank_0(f"ID {param.ds_id} Initializing partition for the first time for nvme offload.") else: - partitioned_tensor = torch.empty( - partition_size, - dtype=param.dtype, - device=OffloadDeviceEnum.cpu if self.remote_device - == OffloadDeviceEnum.nvme else self.remote_device) + partitioned_tensor = torch.empty(partition_size, + dtype=param.dtype, + device=OffloadDeviceEnum.cpu if self.remote_device + == OffloadDeviceEnum.nvme else self.remote_device) if self.pin_memory: - partitioned_tensor = get_accelerator().pin_memory( - partitioned_tensor) + partitioned_tensor = get_accelerator().pin_memory(partitioned_tensor) partitioned_tensor.requires_grad = False param.ds_tensor = partitioned_tensor @@ -1135,13 +1061,8 @@ class Init(InsertPostInitMethodToModuleSubClasses): if start < param.ds_numel: elements_to_copy = param.ds_numel - start - param.ds_tensor.narrow(0, - 0, - elements_to_copy).copy_( - one_dim_param.narrow( - 0, - start, - elements_to_copy)) + param.ds_tensor.narrow(0, 0, + elements_to_copy).copy_(one_dim_param.narrow(0, start, elements_to_copy)) #print(f"Remote device {self.remote_device}") @@ -1151,23 +1072,16 @@ class Init(InsertPostInitMethodToModuleSubClasses): # param.data does not store anything meaningful in partitioned state - see_memory_usage(f'Before partitioning param {param.ds_id} {param.shape}', - force=False) + see_memory_usage(f'Before partitioning param {param.ds_id} {param.shape}', force=False) free_param(param) - see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', - force=False) + see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', force=False) if param.ds_tensor.final_location == OffloadDeviceEnum.nvme: self.param_swapper.swap_out_and_release([param]) - print_rank_0( - f"ID {param.ds_id} Offloaded to nvme offload and buffers released.") - see_memory_usage( - f"ID {param.ds_id} Offloaded to nvme offload and buffers released.", - force=False) + print_rank_0(f"ID {param.ds_id} Offloaded to nvme offload and buffers released.") + see_memory_usage(f"ID {param.ds_id} Offloaded to nvme offload and buffers released.", force=False) - print_rank_0( - f"ID {param.ds_id} partitioned type {param.dtype} dev {param.device} shape {param.shape}" - ) + print_rank_0(f"ID {param.ds_id} partitioned type {param.dtype} dev {param.device} shape {param.shape}") def _param_status(self, param): if param.ds_tensor is not None: @@ -1194,9 +1108,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): see_memory_usage( f'Before 
allocate allgather param {debug_param2name_id_shape_status(param)} partition_size={partition_size} ', force=False) - flat_tensor = torch.zeros(aligned_param_size, - dtype=param.dtype, - device=param.device).view(-1) + flat_tensor = torch.zeros(aligned_param_size, dtype=param.dtype, device=param.device).view(-1) see_memory_usage( f'After allocate allgather param {debug_param2name_id_shape_status(param)} {aligned_param_size} {partition_size} ', force=False) @@ -1215,25 +1127,18 @@ class Init(InsertPostInitMethodToModuleSubClasses): if self.use_all_gather_base: # try the _all_gather_base on PyTorch master branch handle = dist.all_gather_base(flat_tensor, - param.ds_tensor.to( - get_accelerator().device_name()), + param.ds_tensor.to(get_accelerator().device_name()), group=self.ds_process_group, async_op=async_op) else: partitions = [] for i in range(self.world_size): - partitions.append( - flat_tensor.narrow(0, - partition_size * i, - partition_size)) + partitions.append(flat_tensor.narrow(0, partition_size * i, partition_size)) if i == dist.get_rank(group=self.ds_process_group): partitions[i].data.copy_(param.ds_tensor.data, non_blocking=True) - handle = dist.all_gather(partitions, - partitions[self.rank], - group=self.ds_process_group, - async_op=async_op) + handle = dist.all_gather(partitions, partitions[self.rank], group=self.ds_process_group, async_op=async_op) replicated_tensor = flat_tensor.narrow(0, 0, param.ds_numel).view(param.ds_shape) param.data = replicated_tensor.data @@ -1256,9 +1161,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): allgather_params = [] for psize in partition_sizes: tensor_size = psize * self.world_size - flat_tensor = torch.empty(tensor_size, - dtype=param_list[0].dtype, - device=self.local_device).view(-1) + flat_tensor = torch.empty(tensor_size, dtype=param_list[0].dtype, device=self.local_device).view(-1) flat_tensor.requires_grad = False allgather_params.append(flat_tensor) @@ -1283,14 +1186,10 @@ class Init(InsertPostInitMethodToModuleSubClasses): output_list.append(partition) if not get_accelerator().on_accelerator(partition): logger.warning( - f'param {param_idx}, partition {i} is not on CUDA, partition shape {partition.size()}' - ) + f'param {param_idx}, partition {i} is not on CUDA, partition shape {partition.size()}') # back to old all_gather function signature - h = dist.all_gather(output_list, - input_tensor, - group=self.ds_process_group, - async_op=True) + h = dist.all_gather(output_list, input_tensor, group=self.ds_process_group, async_op=True) launch_handles.append(h) # Wait ensures the operation is enqueued, but not necessarily complete. 
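The `_allgather_param` and `_allgather_params` hunks above both use the same fallback when `_all_gather_base` is unavailable: allocate one flat buffer of `world_size * partition_size` elements, view it as per-rank chunks with `narrow`, copy the local shard into this rank's chunk, and run a regular `all_gather` over the chunk list so the gathered data lands directly in the flat buffer. A minimal standalone sketch of that pattern with plain `torch.distributed` follows; the function and variable names are illustrative only and are not part of this patch.

import torch
import torch.distributed as dist

def allgather_partitioned_param(local_shard: torch.Tensor, group=None, async_op=False):
    """Gather equal-sized shards from every rank into one flat tensor.

    Assumes dist.init_process_group() has already been called and that every
    rank passes a shard of identical numel and dtype (ZeRO-3 guarantees this
    by padding partitions to a common aligned size).
    """
    world_size = dist.get_world_size(group)
    rank = dist.get_rank(group)
    partition_size = local_shard.numel()

    # One flat output buffer holding all ranks' shards back to back.
    flat_tensor = torch.empty(partition_size * world_size,
                              dtype=local_shard.dtype,
                              device=local_shard.device)

    # Per-rank views into the flat buffer; all_gather writes into these,
    # so the result ends up contiguous in flat_tensor without an extra copy.
    partitions = [flat_tensor.narrow(0, partition_size * i, partition_size)
                  for i in range(world_size)]

    # Place the local shard into this rank's slot before gathering.
    partitions[rank].copy_(local_shard)

    handle = dist.all_gather(partitions, partitions[rank], group=group, async_op=async_op)
    return flat_tensor, handle

The `use_all_gather_base` branch in the hunks above achieves the same result in a single call on the flat tensor, without building the list of per-rank views.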
@@ -1299,9 +1198,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): # assign to param.data (not copy) for i, param in enumerate(param_list): gathered_tensor = allgather_params[i] - param.data = gathered_tensor.narrow(0, - 0, - param.ds_numel).view(param.ds_shape).data + param.data = gathered_tensor.narrow(0, 0, param.ds_numel).view(param.ds_shape).data # guarantee the communication to be completed get_accelerator().synchronize() @@ -1315,9 +1212,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): partition_size = sum([param.ds_tensor.ds_numel for param in param_list]) tensor_size = partition_size * self.world_size - flat_tensor = torch.empty(tensor_size, - dtype=param_list[0].dtype, - device=self.local_device) + flat_tensor = torch.empty(tensor_size, dtype=param_list[0].dtype, device=self.local_device) flat_tensor.requires_grad = False partitions = [] for i in range(self.world_size): @@ -1330,24 +1225,17 @@ class Init(InsertPostInitMethodToModuleSubClasses): for param in param_list: param_numel = param.ds_tensor.ds_numel - partitions[i].narrow(0, - offset, - param_numel).copy_(param.ds_tensor.data) + partitions[i].narrow(0, offset, param_numel).copy_(param.ds_tensor.data) offset += param_numel - dist.all_gather(partitions, - partitions[self.rank], - group=self.ds_process_group, - async_op=False) + dist.all_gather(partitions, partitions[self.rank], group=self.ds_process_group, async_op=False) param_offset = 0 for param in param_list: param_partition_size = param.ds_tensor.ds_numel param_size = param.ds_numel - replicated_tensor = torch.empty(param.ds_shape, - dtype=param.dtype, - device=self.local_device) + replicated_tensor = torch.empty(param.ds_shape, dtype=param.dtype, device=self.local_device) for i in range(self.world_size): @@ -1360,9 +1248,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): part_to_copy = partitions[i].narrow(0, param_offset, numel_to_copy) - replicated_tensor.view(-1).narrow(0, - param_start, - numel_to_copy).copy_(part_to_copy) + replicated_tensor.view(-1).narrow(0, param_start, numel_to_copy).copy_(part_to_copy) #param_offset += param.data.numel() param_offset += param.ds_tensor.ds_numel @@ -1394,12 +1280,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): #print_rank_0("REduce scatter was executed for praam {param.ds_id}") if start < param.ds_numel and end > param.ds_numel: elements = param.ds_numel - start - param.grad.view(-1).narrow(0, - start, - elements).copy_( - reduced_partition.narrow(0, - 0, - elements)) + param.grad.view(-1).narrow(0, start, elements).copy_(reduced_partition.narrow(0, 0, elements)) def _reduce_scatter_gradient(self, param): @@ -1418,26 +1299,16 @@ class Init(InsertPostInitMethodToModuleSubClasses): if start < param.ds_numel and end <= param.ds_numel: input = param.grad.view(-1).narrow(0, start, partition_size) else: - input = torch.zeros(partition_size, - dtype=param.dtype, - device=param.device) + input = torch.zeros(partition_size, dtype=param.dtype, device=param.device) if start < param.ds_numel: elements = param.ds_numel - start - input.narrow(0, - 0, - elements).copy_( - param.grad.view(-1).narrow(0, - start, - elements)) + input.narrow(0, 0, elements).copy_(param.grad.view(-1).narrow(0, start, elements)) #print("after reduce scatter gradients") input_list.append(input) rank = dist.get_rank(group=self.ds_process_group) - handle = dist.reduce_scatter(input_list[rank], - input_list, - group=self.ds_process_group, - async_op=True) + handle = dist.reduce_scatter(input_list[rank], input_list, 
group=self.ds_process_group, async_op=True) return handle, input_list[rank] @@ -1446,9 +1317,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): partition_buffers = [None] * len(param_list) for param, partition_buffer in zip(param_list, partition_buffers): - self._partition_gradient(param, - partition_buffer=partition_buffer, - accumulate=accumulate) + self._partition_gradient(param, partition_buffer=partition_buffer, accumulate=accumulate) def _partition_gradient(self, param, partition_buffer=None, accumulate=False): #import pdb;pdb.set_trace() @@ -1462,9 +1331,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): if partition_buffer is None: assert not accumulate, "No buffer to accumulate to" - partition_buffer = torch.zeros(partition_size, - dtype=param.dtype, - device=param.device) + partition_buffer = torch.zeros(partition_size, dtype=param.dtype, device=param.device) else: assert partition_buffer.numel( ) >= partition_size, f"The partition buffer size {partition_buffer.numel()} should match the size of param.ds_tensor {partition_size}" @@ -1496,9 +1363,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): # when src is gpu and dest is cpu # adding directly to cpu is very slow else: - acc_tensor = torch.empty(src_tensor.numel(), - dtype=param.dtype, - device=param.device) + acc_tensor = torch.empty(src_tensor.numel(), dtype=param.dtype, device=param.device) acc_tensor.copy_(dest_tensor) acc_tensor.add_(src_tensor) @@ -1517,6 +1382,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): class GatheredParameters: + def __init__(self, params, modifier_rank=None, fwd_module=None, enabled=True): """A context that collects parameters that were partitioned via a :class:`deepspeed.zero.Init` context. The parameters are partitioned @@ -1627,8 +1493,7 @@ class GatheredParameters: self.src_rank = modifier_rank else: # A group was specified; convert DP rank to global rank - self.src_rank = dist.get_global_rank(self.params[0].ds_process_group, - modifier_rank) + self.src_rank = dist.get_global_rank(self.params[0].ds_process_group, modifier_rank) self.fwd_module = fwd_module if self.fwd_module is not None: # is a no-op if already registered @@ -1647,12 +1512,7 @@ class GatheredParameters: self.params[0].partition(param_list=self.params, has_been_updated=False) return - handles = [ - dist.broadcast(p, - self.src_rank, - group=p.ds_process_group, - async_op=True) for p in self.params - ] + handles = [dist.broadcast(p, self.src_rank, group=p.ds_process_group, async_op=True) for p in self.params] for h in handles: h.wait() self.params[0].partition(param_list=self.params, has_been_updated=True) diff --git a/deepspeed/runtime/zero/partitioned_param_coordinator.py b/deepspeed/runtime/zero/partitioned_param_coordinator.py index f6edfe626..f5af8568f 100644 --- a/deepspeed/runtime/zero/partitioned_param_coordinator.py +++ b/deepspeed/runtime/zero/partitioned_param_coordinator.py @@ -24,8 +24,7 @@ def debug_rank0(message: str) -> None: @instrument_w_nvtx def get_all_parameters(sub_module, recurse=False): - return itertools.chain(sub_module.named_parameters(recurse=recurse), - sub_module.ds_external_parameters()) + return itertools.chain(sub_module.named_parameters(recurse=recurse), sub_module.ds_external_parameters()) def iter_params(module: Module, recurse=False) -> Iterable[Parameter]: @@ -43,17 +42,15 @@ class ZeRoTraceMode(Enum): class PartitionedParameterCoordinator: """Handles partitioning and gathering of parameters.""" + class __InflightParamRegistry(UserDict): """registry for parameters 
in flight""" - def __setitem__(self, - param: Parameter, - handle: AllGatherCoalescedHandle) -> None: + + def __setitem__(self, param: Parameter, handle: AllGatherCoalescedHandle) -> None: if param in self.data: raise RuntimeError(f"{param.ds_summary()} already in registry") if param.ds_status != ZeroParamStatus.INFLIGHT: - raise RuntimeError( - f"attempted to add non-inflight parameter to registry {param.ds_summary()}" - ) + raise RuntimeError(f"attempted to add non-inflight parameter to registry {param.ds_summary()}") self.data[param] = handle @dataclass @@ -78,10 +75,8 @@ class PartitionedParameterCoordinator: # sequence of submodules/parameters in forward pass + backward pass self.__submodule_order: Iterable[Module] = [] self.__param_order: Iterable[__class__.__ParamInTrace] = [] - self.__most_recent_step_id_param_fetched_for = collections.defaultdict( - lambda: int(-1e10)) - self.__step_id_module_fetched_for = collections.defaultdict( - lambda: collections.deque()) + self.__most_recent_step_id_param_fetched_for = collections.defaultdict(lambda: int(-1e10)) + self.__step_id_module_fetched_for = collections.defaultdict(lambda: collections.deque()) # number of available params, and max number of available params self.__n_available_params: int = 0 self.__max_n_available_params: int = max_available_parameters_in_numel @@ -122,8 +117,7 @@ class PartitionedParameterCoordinator: def _clear_trace_structures(self) -> None: self.__submodule_order = [] self.__param_order = [] - self.__most_recent_step_id_param_fetched_for = collections.defaultdict( - lambda: int(-1e10)) + self.__most_recent_step_id_param_fetched_for = collections.defaultdict(lambda: int(-1e10)) self.__param_queue = None def is_complete_trace(self) -> bool: @@ -146,17 +140,14 @@ class PartitionedParameterCoordinator: # sub_module must match expectation else invalidate trace cache if sub_module != self.__submodule_order[self.__step_id]: expected_module_id = self.__submodule_order[self.__step_id].id - debug_rank0( - f"Invalidate trace cache @ step {self.__step_id}: " - f"expected module {expected_module_id}, but got module {sub_module.id}" - ) + debug_rank0(f"Invalidate trace cache @ step {self.__step_id}: " + f"expected module {expected_module_id}, but got module {sub_module.id}") self._invalidate_trace() def record_module(self, sub_module: Module) -> None: """adds sub module to trace""" if not self.is_record_trace(): - raise RuntimeError( - f"attempted to record trace when status = {self.__trace_mode}") + raise RuntimeError(f"attempted to record trace when status = {self.__trace_mode}") self.__submodule_order.append(sub_module) self.__step_id_module_fetched_for[sub_module.id].append(self.__step_id) @@ -164,14 +155,11 @@ class PartitionedParameterCoordinator: def record_parameters(self, sub_module: Module) -> None: """adds sub module to trace""" if not self.is_record_trace(): - raise RuntimeError( - f"attempted to record trace when status = {self.__trace_mode}") + raise RuntimeError(f"attempted to record trace when status = {self.__trace_mode}") step_id = self.__step_id_module_fetched_for[sub_module.id].popleft() for param in sorted(set(iter_params(sub_module)), key=lambda p: p.ds_id): - self.__param_order.append( - __class__.__ParamInTrace(param=param, - step_id_last_used_at=step_id)) + self.__param_order.append(__class__.__ParamInTrace(param=param, step_id_last_used_at=step_id)) def construct_parameter_trace_from_module_trace(self): """use module trace to construct parameter trace""" @@ -182,9 +170,8 @@ class 
PartitionedParameterCoordinator: def reset_step(self) -> None: """indicate that we have completed one fwd+bwd for the model""" if self.__inflight_param_registry: - raise RuntimeError( - f"still have inflight params " - f"{[p.ds_summary for p in self.__inflight_param_registry.keys()]}") + raise RuntimeError(f"still have inflight params " + f"{[p.ds_summary for p in self.__inflight_param_registry.keys()]}") if not self.is_complete_trace(): # not self.trace_complete: # Make sure that recorded submodule orders are identical across ranks @@ -194,26 +181,20 @@ class PartitionedParameterCoordinator: # Successfully recorded a trace self.construct_parameter_trace_from_module_trace() # Make sure that recorded parameter orders are identical across ranks - assert_ints_same_as_other_ranks( - [p.param.ds_id for p in self.__param_order]) - assert_ints_same_as_other_ranks( - [p.step_id_last_used_at for p in self.__param_order]) + assert_ints_same_as_other_ranks([p.param.ds_id for p in self.__param_order]) + assert_ints_same_as_other_ranks([p.step_id_last_used_at for p in self.__param_order]) self.__submodule_order = tuple(self.__submodule_order) # freeze self.__param_order = tuple(self.__param_order) # freeze self.__trace_mode = ZeRoTraceMode.COMPLETE - print_rank_0( - f"completed record trace: {[m.id for m in self.__submodule_order]}", - force=False) + print_rank_0(f"completed record trace: {[m.id for m in self.__submodule_order]}", force=False) else: # Enable trace recording for next forward/backward pass self.__trace_mode = ZeRoTraceMode.RECORD self.__param_queue = collections.deque(self.__param_order) # reset fetch queue - self.__most_recent_step_id_param_fetched_for = collections.defaultdict( - lambda: int(-1e10)) - self.__step_id_module_fetched_for = collections.defaultdict( - lambda: collections.deque()) + self.__most_recent_step_id_param_fetched_for = collections.defaultdict(lambda: int(-1e10)) + self.__step_id_module_fetched_for = collections.defaultdict(lambda: collections.deque()) self.__step_id = 0 self.__n_available_params = 0 @@ -221,9 +202,7 @@ class PartitionedParameterCoordinator: if step_id is None: step_id = self.__step_id param_names = [debug_param2name_id(p) for p in params] - print( - f'{tag} step = {step_id} mod = {debug_module2name_id(sub_module)} p_names = {param_names}' - ) + print(f'{tag} step = {step_id} mod = {debug_module2name_id(sub_module)} p_names = {param_names}') def _dump_param_ids(self, tag, mod_id, p_ids, step_id=None): if step_id is None: @@ -263,11 +242,9 @@ class PartitionedParameterCoordinator: debug_rank0(f"-wait: {param.ds_summary()}") if param in self.__inflight_param_registry: with get_accelerator().stream(self.__allgather_stream): - while self.__ongoing_fetch_events and self.__ongoing_fetch_events[ - 0].query(): + while self.__ongoing_fetch_events and self.__ongoing_fetch_events[0].query(): self.__ongoing_fetch_events.popleft() - if len(self.__ongoing_fetch_events - ) > self.__max_ongoing_fetch_events: + if len(self.__ongoing_fetch_events) > self.__max_ongoing_fetch_events: self.__ongoing_fetch_events.popleft().synchronize() self.__inflight_param_registry.pop(param).wait() @@ -288,12 +265,8 @@ class PartitionedParameterCoordinator: # prefetches we won't look for them here discarded_from_prefetch_queue = set() params_not_already_fetched = set( - filter( - lambda p: self.__most_recent_step_id_param_fetched_for[p] < self. 
- __step_id, - params_to_fetch)) - while self.__param_queue and len(discarded_from_prefetch_queue) < len( - params_not_already_fetched): + filter(lambda p: self.__most_recent_step_id_param_fetched_for[p] < self.__step_id, params_to_fetch)) + while self.__param_queue and len(discarded_from_prefetch_queue) < len(params_not_already_fetched): param_in_trace = self.__param_queue.popleft() self.__most_recent_step_id_param_fetched_for[ param_in_trace.param] = param_in_trace.step_id_last_used_at @@ -305,8 +278,7 @@ class PartitionedParameterCoordinator: f"module id: {current_submodule.id}, training: {current_submodule.training}\n" f"expected the next {len(params_not_already_fetched)} parameters in the " f"parameter fetch queue to be {tuple(p.ds_summary(use_debug_name=True) for p in params_not_already_fetched)} \n" - f"but got \n {tuple(p.ds_summary(use_debug_name=True) for p in discarded_from_prefetch_queue)}." - ) + f"but got \n {tuple(p.ds_summary(use_debug_name=True) for p in discarded_from_prefetch_queue)}.") def _is_currently_on_nvme(param): if param.nvme_swapper is None: @@ -317,14 +289,12 @@ class PartitionedParameterCoordinator: # kick off all gather for params in the next few submodules (prefetch) if self.__prefetch_bucket_sz > 0: - max_params_to_prefetch = min( - self.__max_n_available_params - self.__n_available_params, - self.__prefetch_bucket_sz) + max_params_to_prefetch = min(self.__max_n_available_params - self.__n_available_params, + self.__prefetch_bucket_sz) params_to_prefetch = set() numel_prefetching = 0 while self.__param_queue and numel_prefetching < max_params_to_prefetch: - param_in_trace: __class__.__ParamInTrace = self.__param_queue.popleft( - ) + param_in_trace: __class__.__ParamInTrace = self.__param_queue.popleft() if _is_currently_on_nvme(param_in_trace.param): # nvme prefetch is handled elsewhere. 
Need to break here to preserve fetch order @@ -358,10 +328,8 @@ class PartitionedParameterCoordinator: def release_sub_module(self, submodule: Module) -> None: """release the parameters of a sub module, assuming they meet conditions to be released.""" - params_to_release = (self.__params_to_release(submodule, - self.__step_id) - if self.is_complete_trace() else set( - p.ds_id for p in iter_params(submodule))) + params_to_release = (self.__params_to_release(submodule, self.__step_id) if self.is_complete_trace() else set( + p.ds_id for p in iter_params(submodule))) for param in iter_params(submodule): param.ds_active_sub_modules.discard(submodule.id) if param.ds_id in params_to_release and not param.is_external_param: @@ -404,13 +372,10 @@ class PartitionedParameterCoordinator: # Release swap buffers for persisted params on nvme since they will never be partitioned or evicted from GPU swap_persisted_params = [ - p for p in partitioned_params - if p.ds_persist and p.ds_tensor.final_location == OffloadDeviceEnum.nvme + p for p in partitioned_params if p.ds_persist and p.ds_tensor.final_location == OffloadDeviceEnum.nvme ] if swap_persisted_params: - swap_persisted_params[ - 0].nvme_swapper.remove_partition_and_release_buffers( - swap_persisted_params) + swap_persisted_params[0].nvme_swapper.remove_partition_and_release_buffers(swap_persisted_params) @instrument_w_nvtx def __release_param(self, param: Parameter) -> None: @@ -421,14 +386,11 @@ class PartitionedParameterCoordinator: @instrument_w_nvtx @functools.lru_cache(maxsize=None) - def __params_to_release(self, - submodule_to_release: Module, - step_id: int) -> Set[int]: + def __params_to_release(self, submodule_to_release: Module, step_id: int) -> Set[int]: if not self.is_complete_trace(): raise RuntimeError("expected trace to be complete") - params_to_release = set(p.ds_id for p in iter_params(submodule_to_release) - if not p.ds_persist) + params_to_release = set(p.ds_id for p in iter_params(submodule_to_release) if not p.ds_persist) # Problem: When prefetcher scans the param trace, it skips AVAILABLE params. 
# This creates issues if those params are released before the skipped uses: @@ -470,8 +432,8 @@ class PartitionedParameterCoordinator: param = param_in_trace.param if param.nvme_swapper is None: continue - if (numel_considered > 2 * numel_in_flight or len(swap_in_params) >= - param.nvme_swapper.available_swap_in_buffers()): + if (numel_considered > 2 * numel_in_flight + or len(swap_in_params) >= param.nvme_swapper.available_swap_in_buffers()): break if param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE: swap_in_params.append(param) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index ad56887f4..25b91f7c3 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -73,6 +73,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): For usage examples, refer to TODO: DeepSpeed Tutorial """ + def __init__(self, module, init_optimizer, @@ -106,8 +107,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): see_memory_usage("Stage 3 initialize beginning", force=True) - print_rank_0(f"initialized {__class__.__name__} with args: {locals()}", - force=False) + print_rank_0(f"initialized {__class__.__name__} with args: {locals()}", force=False) if dist.get_rank() == 0: logger.info(f"Reduce bucket size {reduce_bucket_size}") @@ -147,18 +147,17 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.params_in_nvme_and_cpu = False self.max_params_in_cpu = 0 - self.parameter_offload = DeepSpeedZeRoOffload( - module=module, - timers=timers, - ds_config=ds_config, - overlap_comm=overlap_comm, - prefetch_bucket_size=prefetch_bucket_size, - max_reuse_distance=max_reuse_distance, - max_live_parameters=max_live_parameters, - param_persistence_threshold=param_persistence_threshold, - model_persistence_threshold=model_persistence_threshold, - offload_param_config=offload_optimizer_config, - mpu=mpu) + self.parameter_offload = DeepSpeedZeRoOffload(module=module, + timers=timers, + ds_config=ds_config, + overlap_comm=overlap_comm, + prefetch_bucket_size=prefetch_bucket_size, + max_reuse_distance=max_reuse_distance, + max_live_parameters=max_live_parameters, + param_persistence_threshold=param_persistence_threshold, + model_persistence_threshold=model_persistence_threshold, + offload_param_config=offload_optimizer_config, + mpu=mpu) self.persistent_parameters = self.parameter_offload.persistent_parameters self._configure_offloading(offload_optimizer_config, offload_param_config) @@ -166,20 +165,17 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.module = module self.elastic_checkpoint = elastic_checkpoint - self.__inf_or_nan_tracker: Tensor = torch.zeros( - 1, - dtype=torch.bool, - device=get_accelerator().current_device_name(), - requires_grad=False) + self.__inf_or_nan_tracker: Tensor = torch.zeros(1, + dtype=torch.bool, + device=get_accelerator().current_device_name(), + requires_grad=False) - self.deepspeed_adam_offload = (self.offload_optimizer - and type(init_optimizer) == DeepSpeedCPUAdam) + self.deepspeed_adam_offload = (self.offload_optimizer and type(init_optimizer) == DeepSpeedCPUAdam) - self.device = get_accelerator().current_device_name( - ) if not self.offload_optimizer else OffloadDeviceEnum.cpu + self.device = get_accelerator().current_device_name() if not self.offload_optimizer else OffloadDeviceEnum.cpu ### streams used for overlapping computation with communication - self.__reduce_and_partition_stream = get_accelerator().Stream( - ) if overlap_comm else get_accelerator().default_stream() + 
self.__reduce_and_partition_stream = get_accelerator().Stream() if overlap_comm else get_accelerator( + ).default_stream() ############################################################################ @@ -212,7 +208,9 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.reduce_bucket_size = int(reduce_bucket_size) if self.reduce_scatter: - assert self.communication_data_type in (torch.float16, torch.bfloat16, torch.float32), f"ZeRO-3 supports only float16 or bfloat16 communication_data_type with reduce scatter enabled. Got: '{self.communication_data_type}'" + assert self.communication_data_type in ( + torch.float16, torch.bfloat16, torch.float32 + ), f"ZeRO-3 supports only float16 or bfloat16 communication_data_type with reduce scatter enabled. Got: '{self.communication_data_type}'" assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with ZeRO-3 with reduce scatter enabled" assert self.postscale_gradients, "pre-scale gradients is not yet supported with ZeRO-3 with reduce scatter enabled" @@ -258,8 +256,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): see_memory_usage("Before creating fp16 partitions", force=True) self._create_fp16_partitions_with_defragmentation(self.trainable_param_groups) num_fp16_subgroups = len(self.fp16_partitioned_groups_flat) - see_memory_usage(f"After creating fp16 partitions: {num_fp16_subgroups}", - force=True) + see_memory_usage(f"After creating fp16 partitions: {num_fp16_subgroups}", force=True) # Optimizer tensor swapping if self.swap_optimizer: @@ -300,14 +297,10 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): #Largest partitioned param largest_partitioned_param_numel = max([ - max([ - max(tensor.numel(), - tensor.ds_numel) for tensor in fp16_partitioned_group - ]) for fp16_partitioned_group in self.fp16_partitioned_groups + max([max(tensor.numel(), tensor.ds_numel) for tensor in fp16_partitioned_group]) + for fp16_partitioned_group in self.fp16_partitioned_groups ]) - print_rank_0( - f'Largest partitioned param numel = {largest_partitioned_param_numel}', - force=False) + print_rank_0(f'Largest partitioned param numel = {largest_partitioned_param_numel}', force=False) self._setup_for_real_optimizer() self.grad_position = {} @@ -351,9 +344,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): def _get_trainable_parameter_groups(self): param_groups = [] for param_group in self.optimizer.param_groups: - trainable_params = { - "params": [p for p in param_group["params"] if p.requires_grad] - } + trainable_params = {"params": [p for p in param_group["params"] if p.requires_grad]} param_groups.append(trainable_params) return param_groups @@ -377,31 +368,25 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # IPG if self.contiguous_gradients: - self.__ipg_bucket_flat_buffer: Tensor = torch.empty( - self.reduce_bucket_size, - dtype=self.dtype, - device=get_accelerator().current_device_name()) + self.__ipg_bucket_flat_buffer: Tensor = torch.empty(self.reduce_bucket_size, + dtype=self.dtype, + device=get_accelerator().current_device_name()) grad_partitions_flat_buffer = None self.__param_id_to_grad_partition: Dict[int, Tensor] = {} all_params = list(itertools.chain.from_iterable(self.fp16_groups)) - grad_partitions_flat_buffer: Tensor = torch.zeros(sum(p.partition_numel() - for p in all_params), + grad_partitions_flat_buffer: Tensor = torch.zeros(sum(p.partition_numel() for p in all_params), dtype=self.dtype, device=self.device) if self.offload_optimizer_pin_memory: - 
grad_partitions_flat_buffer = get_accelerator().pin_memory( - grad_partitions_flat_buffer) + grad_partitions_flat_buffer = get_accelerator().pin_memory(grad_partitions_flat_buffer) offset = 0 for param in all_params: - self.__param_id_to_grad_partition[ - param.ds_id] = grad_partitions_flat_buffer.narrow( - 0, - offset, - param.partition_numel()) + self.__param_id_to_grad_partition[param.ds_id] = grad_partitions_flat_buffer.narrow( + 0, offset, param.partition_numel()) offset += param.partition_numel() def _link_all_hp_params(self): @@ -477,23 +462,21 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): force=False) def _configure_tensor_swapping(self, offload_optimizer_config, aio_config): - nvme_swap_folder = os.path.join(offload_optimizer_config.nvme_path, - 'zero_stage_3') + nvme_swap_folder = os.path.join(offload_optimizer_config.nvme_path, 'zero_stage_3') os.makedirs(nvme_swap_folder, exist_ok=True) if dist.get_rank() == 0: logger.info(f'Tensor Swapping: Adding optimizer tensors') swapper_type = PipelinedOptimizerSwapper if offload_optimizer_config.pipeline else PartitionedOptimizerSwapper - self.optimizer_swapper = swapper_type( - swap_config=offload_optimizer_config, - aio_config=aio_config, - base_folder=nvme_swap_folder, - optimizer=self.optimizer, - largest_numel=max(self.fp16_partitioned_groups_flat_numel), - device=self.device, - dtype=torch.float32, - timers=self.timers) + self.optimizer_swapper = swapper_type(swap_config=offload_optimizer_config, + aio_config=aio_config, + base_folder=nvme_swap_folder, + optimizer=self.optimizer, + largest_numel=max(self.fp16_partitioned_groups_flat_numel), + device=self.device, + dtype=torch.float32, + timers=self.timers) @property def elements_in_ipg_bucket(self): @@ -518,8 +501,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): '''if the parameter was initialized in nvme then bring it to the destination buffer directly''' if src.status == PartitionedParamStatus.NOT_AVAILABLE: print_rank_0( - f"Swapping in {param.ds_id} with partition size {param.partition_numel()} permanently to CPU" - ) + f"Swapping in {param.ds_id} with partition size {param.partition_numel()} permanently to CPU") param.nvme_swapper.swap_into_buffer(param, dest) src.data = dest.data src.status = PartitionedParamStatus.AVAILABLE @@ -544,33 +526,24 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if self.params_in_nvme_and_cpu and \ aggregate_params_count + params_in_group > self.max_params_in_cpu: - flat_buffer_size = max(0, - self.max_params_in_cpu - aggregate_params_count) + flat_buffer_size = max(0, self.max_params_in_cpu - aggregate_params_count) aggregate_params_count += params_in_group if flat_buffer_size > 0: - print_rank_0(f"group {j} flat buffer size {flat_buffer_size}", - force=False) - self.param_groups_fp16_flat_cpu_memory.append( - get_accelerator().pin_memory( - torch.empty(int(flat_buffer_size), - dtype=self.dtype))) + print_rank_0(f"group {j} flat buffer size {flat_buffer_size}", force=False) + self.param_groups_fp16_flat_cpu_memory.append(get_accelerator().pin_memory( + torch.empty(int(flat_buffer_size), dtype=self.dtype))) else: - print_rank_0( - f"No flat buffer size. Param group size was {params_in_group}", - force=False) + print_rank_0(f"No flat buffer size. 
Param group size was {params_in_group}", force=False) - self.param_groups_fp16_flat_cpu_memory.append( - torch.empty(1, - dtype=self.dtype)) + self.param_groups_fp16_flat_cpu_memory.append(torch.empty(1, dtype=self.dtype)) def _create_fp16_partitions_with_defragmentation(self, fp16_param_groups): dist.barrier() param_groups: List[List[Parameter]] = tuple( - self._create_fp16_sub_groups(param_group["params"]) - for param_group in fp16_param_groups) + self._create_fp16_sub_groups(param_group["params"]) for param_group in fp16_param_groups) # bookkeeping related to param groups for param_group_idx, param_group in enumerate(param_groups): @@ -579,23 +552,18 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # record sub group and partitions self.fp16_groups.append(sub_group) - self.fp16_partitioned_groups.append( - [param.ds_tensor for param in sub_group]) + self.fp16_partitioned_groups.append([param.ds_tensor for param in sub_group]) # record sub group -> group mapping self.sub_group_to_group_id[sub_group_idx] = param_group_idx # record total elements of parameter partitions in sub group - self.fp16_partitioned_groups_flat_numel.append( - sum(p.partition_numel() for p in sub_group)) + self.fp16_partitioned_groups_flat_numel.append(sum(p.partition_numel() for p in sub_group)) # record padding required to align group to world size (only applies to last rank) rank_requires_padding = dist.get_rank( - self.dp_process_group) == dist.get_world_size( - self.dp_process_group) - 1 - self.groups_padding.append([ - p.padding_size() if rank_requires_padding else 0 for p in sub_group - ]) + self.dp_process_group) == dist.get_world_size(self.dp_process_group) - 1 + self.groups_padding.append([p.padding_size() if rank_requires_padding else 0 for p in sub_group]) # move parameters to flattened buffer if not self.offload_param: # partitioned params remain in GPU during training @@ -611,10 +579,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): offset = 0 for sub_group in self.fp16_groups: sub_group_numel = sum(param.partition_numel() for param in sub_group) - self.fp16_partitioned_groups_flat.append( - device_buffer.narrow(0, - offset, - sub_group_numel)) + self.fp16_partitioned_groups_flat.append(device_buffer.narrow(0, offset, sub_group_numel)) offset += sub_group_numel else: # partitioned params offloaded to CPU when not in use # create a flat CPU memory allocation for each param group @@ -627,19 +592,15 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): #Flat buffer may not be available for parameters that reside in NVME if not self.params_in_nvme_and_cpu or flat_offset + total_elements <= self.param_groups_fp16_flat_cpu_memory[ param_group_idx].numel(): - fp16_partitioned_group_flat = self.param_groups_fp16_flat_cpu_memory[ - param_group_idx].narrow(0, - flat_offset, - total_elements) + fp16_partitioned_group_flat = self.param_groups_fp16_flat_cpu_memory[param_group_idx].narrow( + 0, flat_offset, total_elements) print_rank_0( f"Creating a flat buffer for subgroup {i} requiring {total_elements} elements, and cumulative CPU elements {flat_offset + total_elements}", force=False) elif self.params_in_nvme_and_cpu: fp16_partitioned_group_flat = None - print_rank_0( - f"No flat buffer for sub group {i} of {total_elements} elements", - force=False) + print_rank_0(f"No flat buffer for sub group {i} of {total_elements} elements", force=False) else: assert False, "Either params are in nvme, or they are in CPU memory. This code path should not be triggered. 
Please see you max_params_in_cpu and params_in_nvme configs" @@ -652,9 +613,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # if necessary, create a pinned memory buffer to be used for swapping out # params to NVME after optimizer step - should_create_fp16_flat_reuse_buffer = any( - flattened_partition_group is None - for flattened_partition_group in self.fp16_partitioned_groups_flat) + should_create_fp16_flat_reuse_buffer = any(flattened_partition_group is None + for flattened_partition_group in self.fp16_partitioned_groups_flat) if should_create_fp16_flat_reuse_buffer: max_partition_numel, largest_partition_numel = 0, None for sub_group in self.fp16_groups: @@ -664,15 +624,14 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): max_partition_numel = total_elements assert len(largest_partition_numel) > 0, f'Unexpected that largest partition is empty' - self.fp16_groups[0][0].nvme_swapper.reserve_partitioned_swap_space( - largest_partition_numel) + self.fp16_groups[0][0].nvme_swapper.reserve_partitioned_swap_space(largest_partition_numel) def _swap_in_sub_group_to_flat_buffer(self, flat_buffer, sub_group_id): offset = 0 - elements_in_sub_group = sum( - [t.ds_numel for t in self.fp16_partitioned_groups[sub_group_id]]) + elements_in_sub_group = sum([t.ds_numel for t in self.fp16_partitioned_groups[sub_group_id]]) assert (flat_buffer.numel() == elements_in_sub_group) - for param, partitioned_param in zip(self.fp16_groups[sub_group_id], self.fp16_partitioned_groups[sub_group_id]): + for param, partitioned_param in zip(self.fp16_groups[sub_group_id], + self.fp16_partitioned_groups[sub_group_id]): dest = flat_buffer.narrow(0, offset, partitioned_param.ds_numel) if partitioned_param.status == PartitionedParamStatus.NOT_AVAILABLE: print_rank_0( @@ -687,9 +646,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): offset += partitioned_param.ds_numel def _create_next_swappable_fp32_groups(self): - reverse_order_indices = [ - i for i in range(len(self.fp32_partitioned_groups_flat)) - ] + reverse_order_indices = [i for i in range(len(self.fp32_partitioned_groups_flat))] reverse_order_indices.reverse() next_group = None @@ -702,16 +659,13 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): def _get_sub_group_partitions(self, sub_group_id): sub_group_partitions = [] - for param, partitioned_param in zip(self.fp16_groups[sub_group_id], self.fp16_partitioned_groups[sub_group_id]): + for param, partitioned_param in zip(self.fp16_groups[sub_group_id], + self.fp16_partitioned_groups[sub_group_id]): if partitioned_param.status == PartitionedParamStatus.NOT_AVAILABLE: swap_path = param.nvme_swapper.get_path(param, True) - sub_group_partitions.append((partitioned_param, - param.partition_numel(), - swap_path)) + sub_group_partitions.append((partitioned_param, param.partition_numel(), swap_path)) else: - sub_group_partitions.append((partitioned_param, - partitioned_param.ds_numel, - None)) + sub_group_partitions.append((partitioned_param, partitioned_param.ds_numel, None)) return sub_group_partitions @@ -749,60 +703,47 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): sub_group_partitions = self._get_sub_group_partitions(i) nvme_fp16_partitions_info.append(sub_group_partitions) nvme_fp16_num_elems.append(num_elements) - nvme_fp32_dest_tensors.append( - self.fp32_partitioned_groups_flat[i]) + nvme_fp32_dest_tensors.append(self.fp32_partitioned_groups_flat[i]) else: - unpinned_fp32_buffer = torch.empty(num_elements, - device=self.device, - dtype=torch.float) + unpinned_fp32_buffer = 
torch.empty(num_elements, device=self.device, dtype=torch.float) self._swap_in_sub_group_to_flat_buffer(unpinned_fp32_buffer, i) - self.optimizer_swapper.initialize_parameters( - parameters=[self.fp32_partitioned_groups_flat[i]], - src_tensors=[unpinned_fp32_buffer]) + self.optimizer_swapper.initialize_parameters(parameters=[self.fp32_partitioned_groups_flat[i]], + src_tensors=[unpinned_fp32_buffer]) else: num_swap_from_cpu_partitions += 1 swap_from_cpu_memory_usage += (fp32_element_size * num_elements) swappable_fp32_tensors.append(self.fp32_partitioned_groups_flat[i]) - swappable_fp16_src_tensors.append( - self.fp16_partitioned_groups_flat[i]) + swappable_fp16_src_tensors.append(self.fp16_partitioned_groups_flat[i]) else: cpu_memory_usage += (fp32_element_size * num_elements) cpu_memory_sub_groups += 1 if self.params_in_nvme_and_cpu and tensor is None: - unpinned_fp32_buffer = torch.empty(num_elements, - device=self.device, - dtype=torch.float) + unpinned_fp32_buffer = torch.empty(num_elements, device=self.device, dtype=torch.float) self._swap_in_sub_group_to_flat_buffer(unpinned_fp32_buffer, i) self.fp32_partitioned_groups_flat.append(unpinned_fp32_buffer) else: - self.fp32_partitioned_groups_flat.append( - self.fp16_partitioned_groups_flat[i].to( - self.device).clone().float().detach()) + self.fp32_partitioned_groups_flat.append(self.fp16_partitioned_groups_flat[i].to( + self.device).clone().float().detach()) - self.fp32_partitioned_groups_flat[ - i].requires_grad = True # keep this in case internal optimizer uses it + self.fp32_partitioned_groups_flat[i].requires_grad = True # keep this in case internal optimizer uses it if len(swappable_fp32_tensors) > 0: - self.optimizer_swapper.initialize_parameters( - parameters=swappable_fp32_tensors, - src_tensors=swappable_fp16_src_tensors) + self.optimizer_swapper.initialize_parameters(parameters=swappable_fp32_tensors, + src_tensors=swappable_fp16_src_tensors) if len(nvme_fp32_dest_tensors) > 0: - fp16_pinned_buffers = self.fp16_groups[0][ - 0].nvme_swapper.reserve_available_buffers() + fp16_pinned_buffers = self.fp16_groups[0][0].nvme_swapper.reserve_available_buffers() assert len(fp16_pinned_buffers) > 0 - self.optimizer_swapper.initialize_from_swapped_fp16_params( - fp16_partitions_info=nvme_fp16_partitions_info, - fp16_num_elems=nvme_fp16_num_elems, - fp16_pinned_buffers=fp16_pinned_buffers, - fp32_parameters=nvme_fp32_dest_tensors) + self.optimizer_swapper.initialize_from_swapped_fp16_params(fp16_partitions_info=nvme_fp16_partitions_info, + fp16_num_elems=nvme_fp16_num_elems, + fp16_pinned_buffers=fp16_pinned_buffers, + fp32_parameters=nvme_fp32_dest_tensors) self.fp16_groups[0][0].nvme_swapper.release_reserved_buffers() nvme_gigabytes = nvme_memory_usage / GIGA_BYTES - print_rank_0( - f'Swappable FP32 Partitions: count={num_swappable_partitions} size={nvme_gigabytes:5.2f} GB', - force=False) + print_rank_0(f'Swappable FP32 Partitions: count={num_swappable_partitions} size={nvme_gigabytes:5.2f} GB', + force=False) if self.params_in_nvme_and_cpu: print_rank_0( f'Swap from NVMe Partitions: count = {num_swap_from_nvme_partitions}, size = {swap_from_nvme_memory_usage/GIGA_BYTES:5.2f}GB', @@ -812,9 +753,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): force=False) cpu_memory_gigabytes = cpu_memory_usage / GIGA_BYTES - print_rank_0( - f'In-Memory FP32 Partitions: count={cpu_memory_sub_groups} size={cpu_memory_gigabytes:5.2f} GB', - force=False) + print_rank_0(f'In-Memory FP32 Partitions: count={cpu_memory_sub_groups} 
size={cpu_memory_gigabytes:5.2f} GB', + force=False) # Clear for on-the-fly population before the optimizer step for param_group in self.optimizer.param_groups: @@ -836,8 +776,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): sub_group.append(param) local_sub_group_size += param.partition_numel() - if local_sub_group_size >= sub_group_size or id(param) == id( - params_group[-1]): + if local_sub_group_size >= sub_group_size or id(param) == id(params_group[-1]): sub_groups.append(sub_group) @@ -862,9 +801,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if not self.swap_optimizer: return False - return self.optimizer_swapper.swappable_tensor( - None, - numel=self.fp16_partitioned_groups_flat_numel[sub_group_id]) + return self.optimizer_swapper.swappable_tensor(None, + numel=self.fp16_partitioned_groups_flat_numel[sub_group_id]) def _partitioned_params_swap_out(self, i): offset = 0 @@ -884,19 +822,15 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): offset += partitioned_param.ds_numel if len(swap_fp16_params): - swap_fp16_params[0].nvme_swapper.swap_out_partitioned_params( - dst_fp16_params=swap_fp16_params, - src_fp32_params=swap_fp32_params) + swap_fp16_params[0].nvme_swapper.swap_out_partitioned_params(dst_fp16_params=swap_fp16_params, + src_fp32_params=swap_fp32_params) def initialize_optimizer_states(self): num_subgroups = len(self.fp16_groups) - largest_numel = max( - [sum([p.ds_numel for p in psg]) for psg in self.fp16_partitioned_groups]) + largest_numel = max([sum([p.ds_numel for p in psg]) for psg in self.fp16_partitioned_groups]) gradient_dtype = self.fp32_partitioned_groups_flat[0].dtype - gradient_buffer = torch.zeros(int(largest_numel), - dtype=gradient_dtype, - device=self.device) + gradient_buffer = torch.zeros(int(largest_numel), dtype=gradient_dtype, device=self.device) timer_names = set() @@ -921,19 +855,13 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self._optimizer_states_and_gradient_swap_in(i, timer_names) if self.offload_optimizer and not swappable_optimizer_subgroup: - subgroup_gradient_buffer = torch.zeros(num_elements, - dtype=gradient_dtype, - device=self.device) + subgroup_gradient_buffer = torch.zeros(num_elements, dtype=gradient_dtype, device=self.device) if self.offload_optimizer_pin_memory: - subgroup_gradient_buffer = get_accelerator().pin_memory( - subgroup_gradient_buffer) + subgroup_gradient_buffer = get_accelerator().pin_memory(subgroup_gradient_buffer) self.fp32_partitioned_groups_flat[i].grad = subgroup_gradient_buffer else: - self.fp32_partitioned_groups_flat[i].grad = gradient_buffer.narrow( - 0, - 0, - num_elements) + self.fp32_partitioned_groups_flat[i].grad = gradient_buffer.narrow(0, 0, num_elements) self._optimizer_step(i) @@ -992,11 +920,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.grad_start_offset[i][partition_id] = {} self.initialize_gradient_partition(i, param_group, partition_id) self.is_partition_reduced[i][partition_id] = False - self.first_param_index_in_partition[i][ - partition_id] = self.get_first_param_index( - i, - param_group, - partition_id) + self.first_param_index_in_partition[i][partition_id] = self.get_first_param_index( + i, param_group, partition_id) @instrument_w_nvtx def independent_gradient_partition_epilogue(self): @@ -1017,8 +942,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): for i, sub_group in enumerate(self.fp16_groups): self.averaged_gradients[i] = [ self.__param_id_to_grad_partition[param.ds_id] - if param.requires_grad else torch.zeros_like(param.ds_tensor) 
- for param in sub_group + if param.requires_grad else torch.zeros_like(param.ds_tensor) for param in sub_group ] # self.averaged_gradients[i] = self.get_flat_partition( # self.fp16_groups[i], @@ -1087,8 +1011,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # garbage data and `self.average_tensor()` will crash because its params_to_reduce will be # empty, while reduction_list will have that garbage data. if self.elements_in_ipg_bucket > 0 and self.elements_in_ipg_bucket + param.ds_numel > self.reduce_bucket_size: - self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", - param.ds_numel) + self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", param.ds_numel) self.__reduce_and_partition_ipg_grads() @@ -1103,18 +1026,14 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): @instrument_w_nvtx @torch.no_grad() def __add_grad_to_ipg_bucket(self, param: Parameter) -> None: - self.__reduce_and_partition_stream.wait_stream( - get_accelerator().default_stream()) + self.__reduce_and_partition_stream.wait_stream(get_accelerator().default_stream()) - if self.contiguous_gradients and self.elements_in_ipg_bucket + param.grad.numel( - ) < self.reduce_bucket_size: + if self.contiguous_gradients and self.elements_in_ipg_bucket + param.grad.numel() < self.reduce_bucket_size: # move the gradient to a contiguous buffer with get_accelerator().stream(self.__reduce_and_partition_stream): # move the parameter's gradient to the contiguous flat buffer - new_grad_tensor = self.__ipg_bucket_flat_buffer.narrow( - 0, - self.elements_in_ipg_bucket, - param.grad.numel()).view_as(param.grad) + new_grad_tensor = self.__ipg_bucket_flat_buffer.narrow(0, self.elements_in_ipg_bucket, + param.grad.numel()).view_as(param.grad) new_grad_tensor.copy_(param.grad, non_blocking=True) param.grad.record_stream(get_accelerator().current_stream()) param.grad.data = new_grad_tensor @@ -1129,14 +1048,12 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): for param in self.__params_in_ipg_bucket: if param.grad.numel() != param.ds_numel: - raise RuntimeError( - f"{param.grad.numel()} != {param.ds_numel} Cannot reduce scatter " - f"gradients whose size is not same as the params") + raise RuntimeError(f"{param.grad.numel()} != {param.ds_numel} Cannot reduce scatter " + f"gradients whose size is not same as the params") self.__params_in_ipg_bucket.sort(key=lambda p: p.ds_id) - assert len(set(p.ds_id for p in self.__params_in_ipg_bucket)) == len( - self.__params_in_ipg_bucket) + assert len(set(p.ds_id for p in self.__params_in_ipg_bucket)) == len(self.__params_in_ipg_bucket) while self.__param_reduce_events and self.__param_reduce_events[0].query(): self.__param_reduce_events.popleft() @@ -1145,8 +1062,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): with get_accelerator().stream(self.__reduce_and_partition_stream): if safe_mode: - assert_ints_same_as_other_ranks( - [p.ds_id for p in self.__params_in_ipg_bucket]) + assert_ints_same_as_other_ranks([p.ds_id for p in self.__params_in_ipg_bucket]) grad_partitions = self.__avg_scatter_grads(self.__params_in_ipg_bucket) self.__partition_grads(self.__params_in_ipg_bucket, grad_partitions) @@ -1163,28 +1079,18 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): full_grads_for_rank = [p.grad for p in params_to_reduce] if self.communication_data_type != self.dtype: - full_grads_for_rank = [ - g.to(self.communication_data_type) for g in full_grads_for_rank - ] + full_grads_for_rank = [g.to(self.communication_data_type) for g in 
full_grads_for_rank] if self.postscale_gradients and self.gradient_predivide_factor != 1.0: - full_grads_for_rank = [ - g.div(self.gradient_predivide_factor) for g in full_grads_for_rank - ] + full_grads_for_rank = [g.div(self.gradient_predivide_factor) for g in full_grads_for_rank] - grad_partitions_for_rank = reduce_scatter_coalesced(full_grads_for_rank, - self.dp_process_group) + grad_partitions_for_rank = reduce_scatter_coalesced(full_grads_for_rank, self.dp_process_group) - if self.postscale_gradients and self.gradient_predivide_factor != dist.get_world_size( - self.dp_process_group): - grad_partitions_for_rank = [ - g.mul(self.gradient_predivide_factor) for g in grad_partitions_for_rank - ] + if self.postscale_gradients and self.gradient_predivide_factor != dist.get_world_size(self.dp_process_group): + grad_partitions_for_rank = [g.mul(self.gradient_predivide_factor) for g in grad_partitions_for_rank] if self.communication_data_type != self.dtype: - grad_partitions_for_rank = [ - g.to(self.dtype) for g in grad_partitions_for_rank - ] + grad_partitions_for_rank = [g.to(self.dtype) for g in grad_partitions_for_rank] return grad_partitions_for_rank @@ -1195,11 +1101,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): param_id = self.get_param_id(param) num_elements = param.partition_numel() - self.grad_position[param_id] = [ - int(i), - int(current_offset), - int(num_elements) - ] + self.grad_position[param_id] = [int(i), int(current_offset), int(num_elements)] #print(f"param id {param_id} i:{i}, ds_tensor {num_elements} numel {param.numel()}") current_offset += num_elements see_memory_usage(f"After Set Grad positions", force=False) @@ -1240,40 +1142,31 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # Sum across all model parallel GPUs. total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.SUM, - group=self.dp_process_group) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=self.dp_process_group) self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) total_norm = total_norm_cuda[0].item()**(1. 
/ norm_type) - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: + if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: total_norm = -1 return total_norm @instrument_w_nvtx - def __partition_grads(self, - params_to_release: List[Parameter], - grad_partitions: List[Tensor]) -> None: + def __partition_grads(self, params_to_release: List[Parameter], grad_partitions: List[Tensor]) -> None: offload_fp32_gradients = {} offload_fp32_offsets = {} for param, grad_partition in zip(params_to_release, grad_partitions): - contains_real_data = param.partition_numel() * dist.get_rank( - self.dp_process_group) < param.ds_numel + contains_real_data = param.partition_numel() * dist.get_rank(self.dp_process_group) < param.ds_numel if not contains_real_data: # this grad partition is empty - don't need to do anything param.grad = None continue # move or accumulate gradient partition to target buffer - grad_buffer = self.__param_id_to_grad_partition[param.ds_id].narrow( - 0, - 0, - grad_partition.numel()) + grad_buffer = self.__param_id_to_grad_partition[param.ds_id].narrow(0, 0, grad_partition.numel()) if self.micro_step_id == 0: # don't accumulate grad_buffer.copy_(grad_partition, non_blocking=True) # ensure grad buffer is a CUDA buffer to speed up the next few @@ -1284,8 +1177,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): else: # if dst is CPU, copy first to src device, do the addition # there, then move back to dst. adding directly to cpu is very slow - cuda_grad_buffer = grad_buffer.to(grad_partition.device, - non_blocking=True) + cuda_grad_buffer = grad_buffer.to(grad_partition.device, non_blocking=True) cuda_grad_buffer.add_(grad_partition) grad_buffer.copy_(cuda_grad_buffer, non_blocking=True) # ensure grad buffer is a CUDA buffer to speed up the next few @@ -1306,8 +1198,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): i, dest_offset, _ = self.grad_position[self.get_param_id(param)] if self.is_gradient_accumulation_boundary: - self.norm_for_param_grads[self.get_param_id( - param)] = self._constant_buffered_norm2(grad_buffer) + self.norm_for_param_grads[self.get_param_id(param)] = self._constant_buffered_norm2(grad_buffer) if self._swappable_optimizer_subgroup(i): if not i in offload_fp32_gradients.keys(): @@ -1317,10 +1208,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): offload_fp32_gradients[i].append(grad_buffer.float()) offload_fp32_offsets[i].append(dest_offset) else: - fp32_grad_tensor = self.fp32_partitioned_groups_flat[ - i].grad.narrow(0, - dest_offset, - grad_buffer.numel()) + fp32_grad_tensor = self.fp32_partitioned_groups_flat[i].grad.narrow( + 0, dest_offset, grad_buffer.numel()) fp32_grad_tensor.copy_(grad_buffer) # free the gradient @@ -1329,16 +1218,16 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if self.offload_optimizer and self.swap_optimizer: for i in offload_fp32_gradients.keys(): - self.optimizer_swapper.swap_out_gradients( - parameter=self.fp32_partitioned_groups_flat[i], - gradient_offsets=offload_fp32_offsets[i], - gradient_tensors=offload_fp32_gradients[i]) + self.optimizer_swapper.swap_out_gradients(parameter=self.fp32_partitioned_groups_flat[i], + gradient_offsets=offload_fp32_offsets[i], + gradient_tensors=offload_fp32_gradients[i]) def reduce_ready_partitions_and_remove_grads(self, param, i): #print_rank_0(f"Backward {debug_param2name_id_shape(param)}", force=True) self.reduce_independent_p_g_buckets_and_remove_grads(param, i) def 
zero_reduced_gradients(self, partition_id, i): + def are_all_related_partitions_reduced(params_id): for partition_id in self.param_to_partition_ids[i][params_id]: if not self.is_partition_reduced[i][partition_id]: @@ -1358,29 +1247,23 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.sequential_execution(print_func, message) def get_grads_to_reduce(self, i, partition_id): + def get_reducible_portion(key): grad = self.param_dict[key].grad total_elements = grad.numel() start = self.grad_start_offset[i][partition_id][key] - num_elements = min( - total_elements - start, - self.partition_size[i] - - self.grad_partition_insertion_offset[i][partition_id][key]) + num_elements = min(total_elements - start, + self.partition_size[i] - self.grad_partition_insertion_offset[i][partition_id][key]) if not pg_correctness_test: if num_elements == total_elements: return grad else: - return grad.contiguous().view(-1).narrow(0, - int(start), - int(num_elements)) + return grad.contiguous().view(-1).narrow(0, int(start), int(num_elements)) else: if num_elements == total_elements: return grad.clone() else: - return grad.clone().contiguous().view(-1).narrow( - 0, - int(start), - int(num_elements)) + return grad.clone().contiguous().view(-1).narrow(0, int(start), int(num_elements)) grads_to_reduce = [] for key in self.is_grad_computed[i][partition_id]: @@ -1443,11 +1326,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)): buf.copy_(synced) - def allreduce_no_retain(self, - bucket, - numel_per_bucket=500000000, - rank=None, - log=None): + def allreduce_no_retain(self, bucket, numel_per_bucket=500000000, rank=None, log=None): small_bucket = [] numel = 0 for tensor in bucket: @@ -1502,11 +1381,11 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if (current_index >= start_index and current_index < end_index): params_in_partition.append(tensor) - elif start_index > current_index and start_index < (current_index + - tensor_size): + elif start_index > current_index and start_index < (current_index + tensor_size): params_in_partition.append(tensor) - assert (first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" + assert (first_offset == 0 + ), "This can happen either zero or only once as this must be the first tensor in the partition" first_offset = start_index - current_index else: @@ -1566,9 +1445,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if norm_type == inf: total_norm = max(g.data.abs().max() for g in gradients) total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.MAX, - group=self.dp_process_group) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=self.dp_process_group) # Take max across all GPUs. self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.MAX) @@ -1579,23 +1456,18 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): grad_norms = [] for g, p in zip(gradients, params): if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): - grad_norms.append( - g.to(get_accelerator().device_name(), - non_blocking=True).double().norm(2)) + grad_norms.append(g.to(get_accelerator().device_name(), non_blocking=True).double().norm(2)) # Sum across all model parallel GPUs. 
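# --- illustrative aside (not part of the diff) ---------------------------------
# The get_grad_norm_direct hunks reformatted here compute a distributed 2-norm:
# square each rank's per-gradient norms, all-reduce the sum across the
# data-parallel group, take the (1 / norm_type) root, and map inf/NaN to -1 as
# an overflow sentinel. A minimal stand-alone sketch of that aggregation,
# assuming plain torch.distributed and omitting the model-parallel reduce and
# MoE handling:
import torch
import torch.distributed as dist

def global_grad_norm(local_grads, group=None, norm_type=2.0):
    squared_sum = sum(g.detach().float().norm(2) ** 2 for g in local_grads)
    total = torch.tensor([float(squared_sum)], device=local_grads[0].device)
    dist.all_reduce(total, op=dist.ReduceOp.SUM, group=group)
    total_norm = total.item() ** (1.0 / norm_type)
    if total_norm in (float('inf'), -float('inf')) or total_norm != total_norm:
        total_norm = -1  # same sentinel the optimizer uses for overflow
    return total_norm
# --------------------------------------------------------------------------------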
total_norm_cuda = torch.sum(torch.pow(torch.stack(grad_norms), 2)) - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.SUM, - group=self.dp_process_group) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=self.dp_process_group) self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) total_norm = total_norm_cuda.item()**(1. / norm_type) - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: + if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: total_norm = -1 return total_norm @@ -1603,11 +1475,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # creates a flat fused tensor from the tensor list starting at the first_offset # in the first tensor of the list. If there are not enough elements in the tensor # list then the flat tensor will be padded with zeros - def get_flat_partition(self, - tensor_list, - first_offset, - partition_size, - return_tensor_list=False): + def get_flat_partition(self, tensor_list, first_offset, partition_size, return_tensor_list=False): flat_tensor_list = [] current_size = 0 for i, tensor in enumerate(tensor_list): @@ -1630,10 +1498,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # we need a narrow view of the tensor based on the tensor offset and number of elements that # we need from this tensor if tensor_offset > 0 or num_elements < tensor.numel(): - flat_tensor_list.append(tensor.contiguous().view(-1).narrow( - 0, - int(tensor_offset), - int(num_elements))) + flat_tensor_list.append(tensor.contiguous().view(-1).narrow(0, int(tensor_offset), int(num_elements))) else: flat_tensor_list.append(tensor) @@ -1695,13 +1560,9 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): norm_groups = [] for i, group in enumerate(self.fp16_groups): if self.offload_optimizer: - norm_groups.append( - self.complete_grad_norm_calculation_for_cpu_offload( - self.fp16_groups[i])) + norm_groups.append(self.complete_grad_norm_calculation_for_cpu_offload(self.fp16_groups[i])) else: - norm_groups.append( - self.get_grad_norm_direct(self.averaged_gradients[i], - self.fp16_groups[i])) + norm_groups.append(self.get_grad_norm_direct(self.averaged_gradients[i], self.fp16_groups[i])) return norm_groups @instrument_w_nvtx @@ -1720,22 +1581,19 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # release all the gradient since we have already created a necessary copy in dp_grad_partition self.zero_grad(set_to_none=True) - for grad in filter(lambda g: get_accelerator().on_accelerator(g), - self.averaged_gradients[sub_group_id]): + for grad in filter(lambda g: get_accelerator().on_accelerator(g), self.averaged_gradients[sub_group_id]): grad.record_stream(get_accelerator().current_stream()) self.averaged_gradients[sub_group_id] = None @instrument_w_nvtx def _prepare_sub_group(self, sub_group_id, timer_names=set()): - see_memory_usage(f'Before prepare optimizer sub group {sub_group_id}', - force=False) + see_memory_usage(f'Before prepare optimizer sub group {sub_group_id}', force=False) if self._swappable_optimizer_subgroup(sub_group_id): self._optimizer_states_and_gradient_swap_in(sub_group_id, timer_names) elif not self.offload_optimizer: self._prepare_fp32_grad_for_sub_group(sub_group_id) - see_memory_usage(f'After prepare optimizer sub group {sub_group_id}', - force=False) + see_memory_usage(f'After prepare optimizer sub group {sub_group_id}', force=False) def _optimizer_states_and_gradient_swap_in(self, sub_group_id, timer_names=set()): param_length = 
self.fp16_partitioned_groups_flat_numel[sub_group_id] @@ -1744,8 +1602,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): f'Parameter {fp32_param_id} of numel={param_length} is not swappable' OPTIMIZER_SWAP_IN_STATE = 'optimizer_swap_in_state' - see_memory_usage(f'pre-step Before swapping in optimizer tensors {sub_group_id}', - force=False) + see_memory_usage(f'pre-step Before swapping in optimizer tensors {sub_group_id}', force=False) self.start_timers([OPTIMIZER_SWAP_IN_STATE]) self.optimizer_swapper.swap_in_optimizer_state( @@ -1754,21 +1611,18 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.stop_timers([OPTIMIZER_SWAP_IN_STATE]) timer_names.add(OPTIMIZER_SWAP_IN_STATE) - see_memory_usage(f'pre-step After swapping in optimizer tensors {sub_group_id}', - force=False) + see_memory_usage(f'pre-step After swapping in optimizer tensors {sub_group_id}', force=False) @instrument_w_nvtx def _release_sub_group(self, sub_group_id, timer_names=set()): - see_memory_usage(f'Before release optimizer sub group {sub_group_id}', - force=False) + see_memory_usage(f'Before release optimizer sub group {sub_group_id}', force=False) # get rid of the fp32 gradients. Not needed anymore if not self.offload_optimizer: self.fp32_partitioned_groups_flat[sub_group_id].grad = None if self._swappable_optimizer_subgroup(sub_group_id): self._optimizer_states_and_gradient_swap_out(sub_group_id, timer_names) - see_memory_usage(f'After release optimizer sub group {sub_group_id}', - force=False) + see_memory_usage(f'After release optimizer sub group {sub_group_id}', force=False) # create a flat tensor aligned at the alignment boundary @instrument_w_nvtx @@ -1781,9 +1635,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if remaining: elements_to_add = alignment - remaining - pad_tensor = torch.zeros(elements_to_add, - device=tensor_list[0].device, - dtype=tensor_list[0].dtype) + pad_tensor = torch.zeros(elements_to_add, device=tensor_list[0].device, dtype=tensor_list[0].dtype) padded_tensor_list = tensor_list + [pad_tensor] num_elements = num_elements + elements_to_add @@ -1799,20 +1651,15 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): f'Parameter {fp32_param_id} of numel={param_length} is not swappable' OPTIMIZER_SWAP_OUT_STATE = 'optimizer_swap_out_state' - see_memory_usage( - f'post-step Before swapping out optimizer tensors {sub_group_id}', - force=False) + see_memory_usage(f'post-step Before swapping out optimizer tensors {sub_group_id}', force=False) self.start_timers([OPTIMIZER_SWAP_OUT_STATE]) self.optimizer_swapper.swap_out_optimizer_state( parameter=self.fp32_partitioned_groups_flat[sub_group_id], - async_swap=self.next_swappable_fp32_partitioned_groups[sub_group_id] - is not None) + async_swap=self.next_swappable_fp32_partitioned_groups[sub_group_id] is not None) self.stop_timers([OPTIMIZER_SWAP_OUT_STATE]) - see_memory_usage( - f'post-step After swapping out optimizer tensors {sub_group_id}', - force=False) + see_memory_usage(f'post-step After swapping out optimizer tensors {sub_group_id}', force=False) timer_names.add(OPTIMIZER_SWAP_OUT_STATE) # get rid of the fp32 gradients. 
Not needed anymore @@ -1881,9 +1728,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): def override_loss_scale(self, loss_scale): if loss_scale != self.external_loss_scale: - logger.info( - f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}' - ) + logger.info(f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}') self.custom_loss_scaler = True self.external_loss_scale = loss_scale @@ -1965,21 +1810,15 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): def dump_post_step_gradients(self): # Dump gradient norms for debugging for i, group in enumerate(self.fp16_groups): - print( - f'Post-Step Dump Norms for Group {i} FP16P, FP16DS, FP16FLAT, FP32FLAT') + print(f'Post-Step Dump Norms for Group {i} FP16P, FP16DS, FP16FLAT, FP32FLAT') unflat_fp16 = self.unflatten(self.fp16_groups_flat[i], self.fp16_groups[i]) - unflat_fp32 = self.unflatten(self.fp32_partitioned_groups_flat[i], - self.fp16_groups[i]) + unflat_fp32 = self.unflatten(self.fp32_partitioned_groups_flat[i], self.fp16_groups[i]) for j, p in enumerate(self.fp16_groups[i]): param_id = self.get_param_id(p) param_norm = float(p.data.float().norm(2)) ds_norm = float(p.ds_tensor.data.float().norm(2)) - unflat_norm = [ - float(t.data.float().norm(2)) - for t in [unflat_fp16[j], - unflat_fp32[j]] - ] + unflat_norm = [float(t.data.float().norm(2)) for t in [unflat_fp16[j], unflat_fp32[j]]] norm_list = [param_norm, ds_norm] + unflat_norm print(f'Post-Step Norms {i} {param_id} = {norm_list}') @@ -2023,9 +1862,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): overflow = self.local_overflow #overflow = self.has_overflow_partitioned_grads_serial() overflow_gpu = get_accelerator().ByteTensor([overflow]) - dist.all_reduce(overflow_gpu, - op=dist.ReduceOp.MAX, - group=self.dp_process_group) + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.dp_process_group) else: params = [] @@ -2100,10 +1937,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): for group in self.fp16_groups: for param_idx, param in enumerate(group): group_idx, dest_offset, num_elements = self.grad_position[self.get_param_id(param)] - fp32_grad = self.fp32_partitioned_groups_flat[group_idx].grad.narrow( - 0, - dest_offset, - num_elements) + fp32_grad = self.fp32_partitioned_groups_flat[group_idx].grad.narrow(0, dest_offset, num_elements) grad_dict[group_idx][param_idx] = fp32_grad else: for group_idx, group in self.averaged_gradients.items(): @@ -2119,8 +1953,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): my_rank = dist.get_rank(group=self.dp_process_group) partitions = [ reduce_buffer.narrow(0, - fp32_state.numel() * i, - fp32_state.numel()) for i in range(self.partition_count) + fp32_state.numel() * i, fp32_state.numel()) for i in range(self.partition_count) ] partitions[my_rank].data.copy_(fp32_state.data, non_blocking=False) @@ -2136,10 +1969,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if self.offload_optimizer: group_idx, dest_offset, num_elements = self.grad_position[self.get_param_id(param)] - fp32_grad = self.fp32_partitioned_groups_flat[group_idx].grad.narrow( - 0, - dest_offset, - num_elements).to(device=param.device) + fp32_grad = self.fp32_partitioned_groups_flat[group_idx].grad.narrow(0, dest_offset, + num_elements).to(device=param.device) else: fp32_grad = self.__param_id_to_grad_partition[param.ds_id].float() @@ -2157,14 +1988,10 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): fp32_param = self.fp32_partitioned_groups_flat[group_idx] if optim_state_key 
is None: - fp32_opt_state = fp32_param.narrow(0, - dest_offset, - num_elements).to(device=param.device) + fp32_opt_state = fp32_param.narrow(0, dest_offset, num_elements).to(device=param.device) else: fp32_opt_state = self.optimizer.state[fp32_param][optim_state_key].narrow( - 0, - dest_offset, - num_elements).to(device=param.device) + 0, dest_offset, num_elements).to(device=param.device) hp_param = self._fp32_state_allgather(param, fp32_opt_state) if self._swappable_optimizer_subgroup(group_idx): @@ -2234,10 +2061,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): for key, value in self.optimizer.state[p].items(): if torch.is_tensor(value): padded_lens = [t.numel() for t in self.fp16_partitioned_groups[i]] - lean_state[key] = self._get_lean_tensors( - value, - self.fp16_partitioned_groups[i], - self.groups_padding[i]) + lean_state[key] = self._get_lean_tensors(value, self.fp16_partitioned_groups[i], + self.groups_padding[i]) lean_flat_len = sum([t.numel() for t in lean_state[key]]) else: lean_state[key] = value @@ -2250,9 +2075,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # Return group tensor after removing paddings added for alignment to DP world size. groups_without_padding = [] for i, group in enumerate(groups_with_padding): - lean_group = self._get_lean_tensors(group, - self.fp16_partitioned_groups[i], - self.groups_padding[i]) + lean_group = self._get_lean_tensors(group, self.fp16_partitioned_groups[i], self.groups_padding[i]) groups_without_padding.append(lean_group) return groups_without_padding @@ -2294,14 +2117,11 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): torch.save(checkpoint, "saved.pth") """ if self.elastic_checkpoint: - raise NotImplementedError( - "ZeRO-3 does not yet support elastic checkpointing, please disable for now." - ) + raise NotImplementedError("ZeRO-3 does not yet support elastic checkpointing, please disable for now.") if self.swap_optimizer or self.params_in_nvme_and_cpu: raise NotImplementedError( - "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now." - ) + "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now.") return self._rigid_state_dict() @@ -2323,7 +2143,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # Restore base optimizer fp32 weights from ZeRO fp16 weights def _restore_from_bit16_weights(self): - for fp16_partitions, fp32_partition in zip(self.fp16_partitioned_groups_flat, self.fp32_partitioned_groups_flat): + for fp16_partitions, fp32_partition in zip(self.fp16_partitioned_groups_flat, + self.fp32_partitioned_groups_flat): fp32_partition.data.copy_(fp16_partitions.data) # Refresh the fp32 master params from the fp16 copies. 
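# --- illustrative aside (not part of the diff) ---------------------------------
# The hunk above ends at the fp32-master refresh: each rank copies its flat
# bit16 shard into the matching fp32 master shard. Stripped of the optimizer's
# bookkeeping, the pattern is a per-group upcasting copy (the names below are
# placeholders, not the optimizer's actual attributes):
import torch

def refresh_fp32_masters(bit16_flat_shards, fp32_flat_shards):
    for bit16_shard, fp32_shard in zip(bit16_flat_shards, fp32_flat_shards):
        # copy_ upcasts fp16/bf16 -> fp32; numels match because both sides were
        # padded to the same aligned length when they were flattened.
        fp32_shard.data.copy_(bit16_shard.data)
# --------------------------------------------------------------------------------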
@@ -2342,9 +2163,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): local_state_partitions = [] for param_index, param_slices in enumerate(param_partitions): - flattened_merged_tensor = self.flatten_dense_tensors_aligned( - param_slices, - alignment) + flattened_merged_tensor = self.flatten_dense_tensors_aligned(param_slices, alignment) new_partitions = self.get_data_parallel_partitions(flattened_merged_tensor) local_state_partitions.append(new_partitions[partition_id]) @@ -2362,15 +2181,10 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): base_optimizer_group_states = [] for i in range(len(self.optimizer.param_groups)): partition_states = {} - all_partition_group_states = [ - sd['base_optimizer_state'][i] for sd in all_state_dict - ] + all_partition_group_states = [sd['base_optimizer_state'][i] for sd in all_state_dict] for key in all_partition_group_states[0].keys(): - all_partition_states = [ - all_states[key] for all_states in all_partition_group_states - ] - partition_states[key] = self._get_flattened_partition( - all_partition_states) + all_partition_states = [all_states[key] for all_states in all_partition_group_states] + partition_states[key] = self._get_flattened_partition(all_partition_states) base_optimizer_group_states.append(partition_states) for i, group in enumerate(self.optimizer.param_groups): @@ -2404,9 +2218,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # update fp16 unflattened params for sub_group_id in range(len(self.fp16_partitioned_groups_flat)): - updated_params = self.unflatten( - self.fp16_partitioned_groups_flat[sub_group_id], - self.fp16_partitioned_groups[sub_group_id]) + updated_params = self.unflatten(self.fp16_partitioned_groups_flat[sub_group_id], + self.fp16_partitioned_groups[sub_group_id]) for partitioned_param, q in zip(self.fp16_partitioned_groups[sub_group_id], updated_params): partitioned_param.data = q.data @@ -2443,18 +2256,14 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): """ if self.elastic_checkpoint: - raise NotImplementedError( - "ZeRO-3 does not yet support elastic checkpointing, please disable for now." - ) + raise NotImplementedError("ZeRO-3 does not yet support elastic checkpointing, please disable for now.") if self.swap_optimizer or self.params_in_nvme_and_cpu: raise NotImplementedError( - "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now." 
- ) + "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now.") - self._rigid_load_state_dict( - state_dict_list[dist.get_rank(group=self.dp_process_group)], - load_optimizer_states=load_optimizer_states) + self._rigid_load_state_dict(state_dict_list[dist.get_rank(group=self.dp_process_group)], + load_optimizer_states=load_optimizer_states) if len(self.persistent_parameters) > 0: self.persistent_parameters[0].partition(self.persistent_parameters) @@ -2480,9 +2289,7 @@ def _handle_overflow(cpu_sum, x, i): if not math.isfinite(float(v)): t_i = v_i break - logger.info( - f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}" - ) + logger.info(f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}") def estimate_zero3_model_states_mem_needs(total_params, @@ -2505,8 +2312,7 @@ def estimate_zero3_model_states_mem_needs(total_params, if zero_init: cpu_mem = total_params * 18 * gpus_factor * additional_buffer_factor else: - cpu_mem = total_params * max(4 * num_gpus_per_node, - 18 * gpus_factor) * additional_buffer_factor + cpu_mem = total_params * max(4 * num_gpus_per_node, 18 * gpus_factor) * additional_buffer_factor else: gpu_mem = largest_layer_memory + int(2 * total_params / total_gpus) @@ -2514,8 +2320,7 @@ def estimate_zero3_model_states_mem_needs(total_params, if zero_init: cpu_mem = total_params * 16 * gpus_factor * additional_buffer_factor else: - cpu_mem = total_params * max(4 * num_gpus_per_node, - 16 * gpus_factor) * additional_buffer_factor + cpu_mem = total_params * max(4 * num_gpus_per_node, 16 * gpus_factor) * additional_buffer_factor else: gpu_mem = largest_layer_memory + int(18 * total_params / total_gpus) if zero_init: @@ -2528,9 +2333,7 @@ def estimate_zero3_model_states_mem_needs(total_params, def model_to_params(model): # shared params calculated only once - total_params = sum( - dict((p.data_ptr(), - p.numel()) for p in model.parameters()).values()) + total_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) largest_layer_params = 0 for m in model.modules(): @@ -2565,12 +2368,11 @@ def estimate_zero3_model_states_mem_needs_all_live(model, total_params, largest_layer_params = model_to_params(model) - estimate_zero3_model_states_mem_needs_all_cold( - total_params=total_params, - largest_layer_params=largest_layer_params, - num_gpus_per_node=num_gpus_per_node, - num_nodes=num_nodes, - additional_buffer_factor=additional_buffer_factor) + estimate_zero3_model_states_mem_needs_all_cold(total_params=total_params, + largest_layer_params=largest_layer_params, + num_gpus_per_node=num_gpus_per_node, + num_nodes=num_nodes, + additional_buffer_factor=additional_buffer_factor) def estimate_zero3_model_states_mem_needs_all_cold(total_params, @@ -2596,6 +2398,7 @@ def estimate_zero3_model_states_mem_needs_all_cold(total_params, - ``additional_buffer_factor``: estimation factor (defaults to 1.5): """ + def format_options(cpu_offload, cpu_offload_params, zero_init): enabled = [] padded_cpu_str = f'{OffloadDeviceEnum.cpu:4}' @@ -2627,11 +2430,9 @@ def estimate_zero3_model_states_mem_needs_all_cold(total_params, cpu_offload=cpu_offload, cpu_offload_params=cpu_offload_params, zero_init=zero_init, - additional_buffer_factor=additional_buffer_factor - ) + additional_buffer_factor=additional_buffer_factor) options_str = format_options(cpu_offload=cpu_offload, cpu_offload_params=cpu_offload_params, zero_init=zero_init) - print( - f" {cpu_mem/2**30:7.2f}GB | {gpu_mem/2**30:6.2f}GB | 
{options_str}") + print(f" {cpu_mem/2**30:7.2f}GB | {gpu_mem/2**30:6.2f}GB | {options_str}") diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index 9c22bd0c0..b7d534791 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -10,14 +10,8 @@ from collections import OrderedDict from deepspeed.runtime import ZeROOptimizer from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler -from deepspeed.runtime.utils import (bwc_tensor_model_parallel_rank, - get_global_norm, - empty_cache, - see_memory_usage, - inf, - is_model_parallel_parameter, - align_dense_tensors, - all_gather_dp_groups) +from deepspeed.runtime.utils import (bwc_tensor_model_parallel_rank, get_global_norm, empty_cache, see_memory_usage, + inf, is_model_parallel_parameter, align_dense_tensors, all_gather_dp_groups) from deepspeed.runtime.zero.config import ZeroStageEnum from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum @@ -30,14 +24,9 @@ from deepspeed.runtime.constants import PIPE_REPLICATED from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import UtilsBuilder -from deepspeed.checkpoint.constants import (DS_VERSION, - GROUP_PADDINGS, - PARTITION_COUNT, - SINGLE_PARTITION_OF_FP32_GROUPS, - BASE_OPTIMIZER_STATE, - CLIP_GRAD, - ZERO_STAGE, - PARAM_SLICE_MAPPINGS) +from deepspeed.checkpoint.constants import (DS_VERSION, GROUP_PADDINGS, PARTITION_COUNT, + SINGLE_PARTITION_OF_FP32_GROUPS, BASE_OPTIMIZER_STATE, CLIP_GRAD, + ZERO_STAGE, PARAM_SLICE_MAPPINGS) from deepspeed.utils import link_hp_params from deepspeed.checkpoint import enable_universal_checkpoint @@ -53,10 +42,8 @@ def input(msg): def split_half_float_double(tensors): device_type = get_accelerator().device_name() dtypes = [ - "torch.{}.HalfTensor".format(device_type), - "torch.{}.FloatTensor".format(device_type), - "torch.{}.DoubleTensor".format(device_type), - "torch.{}.BFloat16Tensor".format(device_type) + "torch.{}.HalfTensor".format(device_type), "torch.{}.FloatTensor".format(device_type), + "torch.{}.DoubleTensor".format(device_type), "torch.{}.BFloat16Tensor".format(device_type) ] buckets = [] for i, dtype in enumerate(dtypes): @@ -110,6 +97,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): For usage examples, refer to TODO: DeepSpeed Tutorial """ + def __init__(self, init_optimizer, param_names, @@ -179,8 +167,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.deepspeed_adam_offload = cpu_offload - self.device = get_accelerator().current_device_name( - ) if not self.cpu_offload else 'cpu' + self.device = get_accelerator().current_device_name() if not self.cpu_offload else 'cpu' self.dp_process_group = dp_process_group @@ -195,9 +182,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): #For MoE models this maybe different for different param group #It will be modified during MoE setup later in the init - self.real_dp_process_group = [ - dp_process_group for i in range(len(self.optimizer.param_groups)) - ] + self.real_dp_process_group = [dp_process_group for i in range(len(self.optimizer.param_groups))] self.partition_count = [dp_size for i in range(len(self.optimizer.param_groups))] self.is_gradient_accumulation_boundary = True @@ -233,10 +218,14 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.fp16_master_weights_and_gradients = fp16_master_weights_and_gradients if self.fp16_master_weights_and_gradients: - assert self.cpu_offload and type(self.optimizer) in [DeepSpeedCPUAdam], f"fp16_master_and_gradients requires 
optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32. Currently only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}. Either disable fp16_master_weights_and_gradients or enable ZeRO-2 Offload with DeepSpeedCPUAdam" + assert self.cpu_offload and type(self.optimizer) in [ + DeepSpeedCPUAdam + ], f"fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32. Currently only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}. Either disable fp16_master_weights_and_gradients or enable ZeRO-2 Offload with DeepSpeedCPUAdam" if self.reduce_scatter: - assert self.communication_data_type in (torch.float16, torch.bfloat16), f"ZeRO-2 supports only float16 or bfloat16 communication_data_type with reduce scatter enabled. Got: '{self.communication_data_type}'" + assert self.communication_data_type in ( + torch.float16, torch.bfloat16 + ), f"ZeRO-2 supports only float16 or bfloat16 communication_data_type with reduce scatter enabled. Got: '{self.communication_data_type}'" assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with ZeRO-2 with reduce scatter enabled" assert self.postscale_gradients, "pre-scale gradients is not yet supported with ZeRO-2 with reduce scatter enabled" @@ -272,7 +261,9 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # align nccl all-gather send buffers to 4-byte boundary self.nccl_start_alignment_factor = 2 # 4-byte alignment/sizeof(fp16) = 2 - assert (allgather_bucket_size % self.nccl_start_alignment_factor == 0), f"allgather_bucket_size must be a multiple of nccl_start_alignment_factor, {self.nccl_start_alignment_factor} " + assert ( + allgather_bucket_size % self.nccl_start_alignment_factor == 0 + ), f"allgather_bucket_size must be a multiple of nccl_start_alignment_factor, {self.nccl_start_alignment_factor} " self.all_reduce_print = False self.dtype = self.optimizer.param_groups[0]['params'][0].dtype @@ -289,9 +280,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # push this group to list before modify # TODO: Explore simplification that avoids the extra book-keeping by pushing the reordered group - trainable_parameters = [ - param for param in param_group['params'] if param.requires_grad - ] + trainable_parameters = [param for param in param_group['params'] if param.requires_grad] self.bit16_groups.append(trainable_parameters) # not sure why apex was cloning the weights before flattening @@ -309,9 +298,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # to the same rank, instead they will belong to 3 ranks (r_m+2, r_m+1, r_m). 
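# --- illustrative aside (not part of the diff) ---------------------------------
# The comment above motivates round_robin_gradients: without reordering, the
# last-defined parameters (which finish backward first) all map to the same
# rank, serializing gradient reduction. A toy sketch of the dealing-out idea
# only -- the real _round_robin_reorder also returns the index mapping needed
# to restore the original parameter order later:
def round_robin_reorder_sketch(tensors, num_partitions):
    buckets = [[] for _ in range(num_partitions)]
    for idx, tensor in enumerate(tensors):
        buckets[idx % num_partitions].append((idx, tensor))
    reordered, original_indices = [], []
    for bucket in buckets:
        for original_idx, tensor in bucket:
            reordered.append(tensor)
            original_indices.append(original_idx)
    return reordered, original_indices
# --------------------------------------------------------------------------------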
if self.round_robin_gradients: round_robin_tensors, round_robin_indices = self._round_robin_reorder( - self.bit16_groups[i], - dist.get_world_size(group=self.real_dp_process_group[i]) - ) + self.bit16_groups[i], dist.get_world_size(group=self.real_dp_process_group[i])) else: round_robin_tensors = self.bit16_groups[i] round_robin_indices = list(range(len(self.bit16_groups[i]))) @@ -323,15 +310,12 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.bit16_groups_flat.append( self.flatten_dense_tensors_aligned( self.round_robin_bit16_groups[i], - self.nccl_start_alignment_factor * - dist.get_world_size(group=self.real_dp_process_group[i])).to( + self.nccl_start_alignment_factor * dist.get_world_size(group=self.real_dp_process_group[i])).to( get_accelerator().current_device_name())) - see_memory_usage(f"After flattening and moving param group {i} to GPU", - force=False) + see_memory_usage(f"After flattening and moving param group {i} to GPU", force=False) # Record padding required for alignment - if partition_id == dist.get_world_size( - group=self.real_dp_process_group[i]) - 1: + if partition_id == dist.get_world_size(group=self.real_dp_process_group[i]) - 1: padding = self.bit16_groups_flat[i].numel() - sum( [t.numel() for t in self.round_robin_bit16_groups[i]]) else: @@ -339,36 +323,29 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.groups_padding.append(padding) if dist.get_rank(group=self.real_dp_process_group[i]) == 0: - see_memory_usage( - f"After Flattening and after emptying param group {i} cache", - force=False) + see_memory_usage(f"After Flattening and after emptying param group {i} cache", force=False) # set model bit16 weight to slices of flattened buffer self._update_model_bit16_weights(i) # divide the flat weights into near equal partition equal to the data parallel degree # each process will compute on a different part of the partition - data_parallel_partitions = self.get_data_parallel_partitions( - self.bit16_groups_flat[i], - i) + data_parallel_partitions = self.get_data_parallel_partitions(self.bit16_groups_flat[i], i) self.parallel_partitioned_bit16_groups.append(data_parallel_partitions) # verify that data partition start locations are 4-byte aligned for partitioned_data in data_parallel_partitions: - assert (partitioned_data.data_ptr() % - (2 * self.nccl_start_alignment_factor) == 0) + assert (partitioned_data.data_ptr() % (2 * self.nccl_start_alignment_factor) == 0) # A partition of the fp32 master weights that will be updated by this process. # Note that the params in single_partition_of_fp32_groups is cloned and detached # from the origin params of the model. if not fp16_master_weights_and_gradients: - self.single_partition_of_fp32_groups.append( - self.parallel_partitioned_bit16_groups[i][partition_id].to( - self.device).clone().float().detach()) + self.single_partition_of_fp32_groups.append(self.parallel_partitioned_bit16_groups[i][partition_id].to( + self.device).clone().float().detach()) else: - self.single_partition_of_fp32_groups.append( - self.parallel_partitioned_bit16_groups[i][partition_id].to( - self.device).clone().half().detach()) + self.single_partition_of_fp32_groups.append(self.parallel_partitioned_bit16_groups[i][partition_id].to( + self.device).clone().half().detach()) # Set local optimizer to have flat params of its own partition. # After this, the local optimizer will only contain its own partition of params. 
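# --- illustrative aside (not part of the diff) ---------------------------------
# The reformatted block above flattens each group into an NCCL-aligned buffer,
# splits it into one shard per data-parallel rank, and keeps a detached fp32
# (or fp16, with fp16_master_weights_and_gradients) master copy of this rank's
# shard. A minimal sketch of that partitioning, assuming the buffer is already
# padded to divide evenly:
import torch

def partition_flat_buffer(flat: torch.Tensor, dp_world_size: int, rank: int):
    shard_numel = flat.numel() // dp_world_size
    shards = [flat.narrow(0, r * shard_numel, shard_numel) for r in range(dp_world_size)]
    # Each rank optimizes only a master copy of its own shard.
    fp32_master = shards[rank].clone().float().detach()
    fp32_master.requires_grad = True
    return shards, fp32_master
# --------------------------------------------------------------------------------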
@@ -377,12 +354,9 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): i].requires_grad = True # keep this in case internal optimizer uses it param_group['params'] = [self.single_partition_of_fp32_groups[i]] - partition_size = len(self.bit16_groups_flat[i]) / dist.get_world_size( - group=self.real_dp_process_group[i]) + partition_size = len(self.bit16_groups_flat[i]) / dist.get_world_size(group=self.real_dp_process_group[i]) params_in_partition, params_not_in_partition, first_offset = self.get_partition_info( - self.round_robin_bit16_groups[i], - partition_size, - partition_id) + self.round_robin_bit16_groups[i], partition_size, partition_id) self.partition_size.append(partition_size) self.params_in_partition.append(params_in_partition) @@ -399,8 +373,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.reduce_bucket_size = int(reduce_bucket_size) self.allgather_bucket_size = int(allgather_bucket_size) - self.reduction_event = get_accelerator().Event(enable_timing=False, - blocking=False) + self.reduction_event = get_accelerator().Event(enable_timing=False, blocking=False) self.reduction_stream = get_accelerator().Stream() self.cpu_computation_stream = get_accelerator().Stream() self.copy_grad_stream = get_accelerator().Stream() @@ -449,18 +422,12 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.local_overflow = False self.grad_position = {} self.temp_grad_buffer_for_cpu_offload = get_accelerator().pin_memory( - torch.zeros(largest_param_numel, - device=self.device, - dtype=self.dtype)) - self.temp_grad_buffer_for_gpu_offload = torch.zeros( - largest_param_numel, - device=get_accelerator().current_device_name(), - dtype=self.dtype) + torch.zeros(largest_param_numel, device=self.device, dtype=self.dtype)) + self.temp_grad_buffer_for_gpu_offload = torch.zeros(largest_param_numel, + device=get_accelerator().current_device_name(), + dtype=self.dtype) for i, params_group in enumerate(self.bit16_groups): - self.get_grad_position(i, - self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i]) + self.get_grad_position(i, self.params_in_partition[i], self.first_offset[i], self.partition_size[i]) # mapping from parameter to partition that it belongs to self.param_to_partition_ids = {} @@ -537,8 +504,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): for lp in self.bit16_groups[i]: if lp._hp_mapping is not None: lp_name = self.param_names[lp] - param_mapping_per_group[ - lp_name] = lp._hp_mapping.get_hp_fragment_address() + param_mapping_per_group[lp_name] = lp._hp_mapping.get_hp_fragment_address() param_mapping.append(param_mapping_per_group) return param_mapping @@ -553,17 +519,16 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): partition_id = dist.get_rank(group=self.real_dp_process_group[i]) partition_size = self.bit16_groups_flat[i].numel() // dp_world_size flat_hp_partition = self.single_partition_of_fp32_groups[i] - link_hp_params( - lp_param_list=self.bit16_groups[i], - flat_hp_partition=flat_hp_partition, - gradient_dict=self.averaged_gradients, - offload_gradient_dict=self.offload_gradient_dict, - use_offload=self.cpu_offload, - param_group_index=i, - partition_start=partition_id * partition_size, - partition_size=partition_size, - partition_optimizer_state=self.optimizer.state[flat_hp_partition], - dp_group=self.real_dp_process_group[i]) + link_hp_params(lp_param_list=self.bit16_groups[i], + flat_hp_partition=flat_hp_partition, + gradient_dict=self.averaged_gradients, + offload_gradient_dict=self.offload_gradient_dict, + use_offload=self.cpu_offload, + 
param_group_index=i, + partition_start=partition_id * partition_size, + partition_size=partition_size, + partition_optimizer_state=self.optimizer.state[flat_hp_partition], + dp_group=self.real_dp_process_group[i]) def is_moe_group(self, group): return 'moe' in group and group['moe'] @@ -575,19 +540,19 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # NOTE: To run ZeRO stage 1 with MoE, we need to set self.contiguous_gradients to True or ignore the assertion if not self.partition_gradients and not self.contiguous_gradients: logger.warn( - "ZeRO Stage 1 has not been thoroughly tested with MoE. This configuration is still experimental." - ) + "ZeRO Stage 1 has not been thoroughly tested with MoE. This configuration is still experimental.") assert self.reduce_scatter, "Reduce Scatter in ZeRO Stage 2 must be set to True for MoE. Other code paths are not tested with MoE" - assert any([self.is_moe_group(group) for group in self.optimizer.param_groups]), "The model has moe layers, but None of the param groups are marked as MoE. Create a param group with 'moe' key set to True before creating optimizer" + assert any( + [self.is_moe_group(group) for group in self.optimizer.param_groups] + ), "The model has moe layers, but None of the param groups are marked as MoE. Create a param group with 'moe' key set to True before creating optimizer" self.is_moe_param_group = [] for i, group in enumerate(self.optimizer.param_groups): if self.is_moe_group(group): - assert all([is_moe_param(param) for param in group['params']]), "All params in MoE group must be MoE params" - self.real_dp_process_group[i] = self.expert_dp_process_group[ - group['name']] - self.partition_count[i] = dist.get_world_size( - group=self.expert_dp_process_group[group['name']]) + assert all([is_moe_param(param) + for param in group['params']]), "All params in MoE group must be MoE params" + self.real_dp_process_group[i] = self.expert_dp_process_group[group['name']] + self.partition_count[i] = dist.get_world_size(group=self.expert_dp_process_group[group['name']]) self.is_moe_param_group.append(True) else: self.is_moe_param_group.append(False) @@ -638,10 +603,9 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): def initialize_optimizer_states(self): for i, group in enumerate(self.bit16_groups): - single_grad_partition = torch.zeros( - int(self.partition_size[i]), - dtype=self.single_partition_of_fp32_groups[i].dtype, - device=self.device) + single_grad_partition = torch.zeros(int(self.partition_size[i]), + dtype=self.single_partition_of_fp32_groups[i].dtype, + device=self.device) self.single_partition_of_fp32_groups[i].grad = get_accelerator().pin_memory( single_grad_partition) if self.cpu_offload else single_grad_partition @@ -709,11 +673,8 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.total_grads_in_partition[i][partition_id] = 0 self.initialize_gradient_partition(i, param_group, partition_id) self.is_partition_reduced[i][partition_id] = False - self.first_param_index_in_partition[i][ - partition_id] = self.get_first_param_index( - i, - param_group, - partition_id) + self.first_param_index_in_partition[i][partition_id] = self.get_first_param_index( + i, param_group, partition_id) def independent_gradient_partition_epilogue(self): self.report_ipg_memory_usage(f"In ipg_epilogue before reduce_ipg_grads", 0) @@ -742,13 +703,12 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): device=get_accelerator().current_device_name(), return_tensor_list=True) else: - avg_new = self.get_flat_partition( - self.params_in_partition[i], - 
self.first_offset[i], - self.partition_size[i], - dtype=self.dtype, - device=get_accelerator().current_device_name(), - return_tensor_list=True) + avg_new = self.get_flat_partition(self.params_in_partition[i], + self.first_offset[i], + self.partition_size[i], + dtype=self.dtype, + device=get_accelerator().current_device_name(), + return_tensor_list=True) for accumulated_grad, new_avg_grad in zip(self.averaged_gradients[i], avg_new): accumulated_grad.add_(new_avg_grad) @@ -769,13 +729,13 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): total_partitions = dist.get_world_size(group=self.real_dp_process_group[i]) for partition_id in range(total_partitions): self.is_partition_reduced[i][partition_id] = False - self.remaining_grads_in_partition[i][ - partition_id] = self.total_grads_in_partition[i][partition_id] + self.remaining_grads_in_partition[i][partition_id] = self.total_grads_in_partition[i][partition_id] for param_id in self.is_grad_computed[i][partition_id]: self.is_grad_computed[i][partition_id][param_id] = False def initialize_gradient_partition(self, i, param_group, partition_id): + def set_key_value_list(dictionary, key, value): if key in dictionary: dictionary[key].append(value) @@ -802,25 +762,20 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): param_id = self.get_param_id(param) if (current_index >= start_index and current_index < end_index): - set_key_value_list(self.param_to_partition_ids[i], - param_id, - partition_id) + set_key_value_list(self.param_to_partition_ids[i], param_id, partition_id) increment_value(self.total_grads_in_partition[i], partition_id) self.is_grad_computed[i][partition_id][param_id] = False - self.grad_partition_insertion_offset[i][partition_id][ - param_id] = current_index - start_index + self.grad_partition_insertion_offset[i][partition_id][param_id] = current_index - start_index self.grad_start_offset[i][partition_id][param_id] = 0 - elif start_index > current_index and start_index < (current_index + - param_size): - assert (first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" + elif start_index > current_index and start_index < (current_index + param_size): + assert (first_offset == 0 + ), "This can happen either zero or only once as this must be the first tensor in the partition" first_offset = start_index - current_index - set_key_value_list(self.param_to_partition_ids[i], - param_id, - partition_id) + set_key_value_list(self.param_to_partition_ids[i], param_id, partition_id) increment_value(self.total_grads_in_partition[i], partition_id) self.is_grad_computed[i][partition_id][param_id] = False @@ -869,14 +824,12 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): ############### Independent Partition Gradient ######################## def reduce_independent_p_g_buckets_and_remove_grads(self, param, i): if self.elements_in_ipg_bucket + param.numel() > self.reduce_bucket_size: - self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", - param.numel()) + self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", param.numel()) self.reduce_ipg_grads() if self.contiguous_gradients and self.overlap_comm: # Swap ipg_index between 0 and 1 self.ipg_index = 1 - self.ipg_index - self.report_ipg_memory_usage("In ipg_remove_grads after reduce_ipg_grads", - param.numel()) + self.report_ipg_memory_usage("In ipg_remove_grads after reduce_ipg_grads", param.numel()) param_id = self.get_param_id(param) assert self.params_already_reduced[param_id] == False, \ @@ -889,10 +842,7 @@ 
class DeepSpeedZeroOptimizer(ZeROOptimizer): elif self.contiguous_gradients: # keeping the gradients contiguous to prevent memory fragmentation, and avoid flattening - new_grad_tensor = self.ipg_buffer[self.ipg_index].narrow( - 0, - self.elements_in_ipg_bucket, - param.numel()) + new_grad_tensor = self.ipg_buffer[self.ipg_index].narrow(0, self.elements_in_ipg_bucket, param.numel()) new_grad_tensor.copy_(param.grad.view(-1)) param.grad.data = new_grad_tensor.data.view_as(param.grad) @@ -969,13 +919,13 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): #Otherwise averaging is done at the entire buffer level at the end of the loop # MoE param have different groups if self.ipg_bucket_has_moe_params: - process_group = self.expert_dp_process_group[ - param.group_name] if is_moe_param( - param) else self.dp_process_group + process_group = self.expert_dp_process_group[param.group_name] if is_moe_param( + param) else self.dp_process_group param.grad.data.div_(dist.get_world_size(group=process_group)) partition_ids = self.param_to_partition_ids[i][param_id] - assert all([p_id < dist.get_world_size(group=process_group) for p_id in partition_ids]), f"world size {dist.get_world_size(group=process_group)} and p_ids: {partition_ids}" + assert all([p_id < dist.get_world_size(group=process_group) for p_id in partition_ids + ]), f"world size {dist.get_world_size(group=process_group)} and p_ids: {partition_ids}" partition_size = self.partition_size[i] # Get all partition ids + their offsets partition_ids_w_offsets = [] @@ -1025,10 +975,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # dist.barrier() #dist.barrier() dst_rank = dist.get_global_rank(real_dp_process_group[i], dst) - async_handle = dist.reduce(grad_slice, - dst=dst_rank, - group=real_dp_process_group[i], - async_op=True) + async_handle = dist.reduce(grad_slice, dst=dst_rank, group=real_dp_process_group[i], async_op=True) async_handles.append(async_handle) for handle in async_handles: @@ -1060,10 +1007,8 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): num_elements = partition_size - current_offset self.grad_position[param_id] = [ - int(group_id), - int(param_start_offset), - int(current_offset), - int(num_elements) + int(group_id), int(param_start_offset), + int(current_offset), int(num_elements) ] current_offset += num_elements @@ -1077,10 +1022,8 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): for lp_param in self.params_in_partition[param_group_index]: param_id = self.get_param_id(lp_param) [_, _, dest_offset, num_elements] = self.grad_position[param_id] - dest_tensor = self.single_partition_of_fp32_groups[ - param_group_index].grad.view(-1).narrow(0, - dest_offset, - num_elements) + dest_tensor = self.single_partition_of_fp32_groups[param_group_index].grad.view(-1).narrow( + 0, dest_offset, num_elements) self.offload_gradient_dict[param_group_index].append(dest_tensor) def async_accumulate_grad_in_cpu_via_gpu(self, param): @@ -1089,55 +1032,35 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] # copy to a preexisiting buffer to avoid memory allocation penalty - dest_buffer = self.temp_grad_buffer_for_gpu_offload.view(-1).narrow( - 0, - 0, - param.numel()) + dest_buffer = self.temp_grad_buffer_for_gpu_offload.view(-1).narrow(0, 0, param.numel()) #buffer for storing gradients for this parameter in CPU def buffer_to_accumulate_to_in_cpu(): if not self.fp16_master_weights_and_gradients: - return get_accelerator().pin_memory( - torch.zeros(param.numel(), - dtype=param.dtype, - 
device=self.device)) + return get_accelerator().pin_memory(torch.zeros(param.numel(), dtype=param.dtype, device=self.device)) else: - return self.single_partition_of_fp32_groups[i].grad.view(-1).narrow( - 0, - dest_offset, - num_elements) + return self.single_partition_of_fp32_groups[i].grad.view(-1).narrow(0, dest_offset, num_elements) #accumulate gradients into param.grad or parts of it that belongs to this partition def accumulate_gradients(): if not self.fp16_master_weights_and_gradients: - dest_buffer.copy_(self.accumulated_grads_in_cpu[param_id].view(-1), - non_blocking=True) + dest_buffer.copy_(self.accumulated_grads_in_cpu[param_id].view(-1), non_blocking=True) param.grad.data.view(-1).add_(dest_buffer) else: - dest_buffer.narrow(0, - source_offset, - num_elements).copy_( - self.accumulated_grads_in_cpu[param_id].view(-1), - non_blocking=True) - param.grad.data.view(-1).narrow( - 0, - source_offset, - num_elements).add_(dest_buffer.narrow(0, - source_offset, - num_elements)) + dest_buffer.narrow(0, source_offset, + num_elements).copy_(self.accumulated_grads_in_cpu[param_id].view(-1), + non_blocking=True) + param.grad.data.view(-1).narrow(0, source_offset, + num_elements).add_(dest_buffer.narrow(0, source_offset, num_elements)) #move accumulated gradients back to CPU def copy_gradients_to_cpu(): if not self.fp16_master_weights_and_gradients: - self.accumulated_grads_in_cpu[param_id].data.copy_( - param.grad.data.view(-1), - non_blocking=True) + self.accumulated_grads_in_cpu[param_id].data.copy_(param.grad.data.view(-1), non_blocking=True) else: - self.accumulated_grads_in_cpu[param_id].data.copy_( - param.grad.data.view(-1).narrow(0, - source_offset, - num_elements), - non_blocking=True) + self.accumulated_grads_in_cpu[param_id].data.copy_(param.grad.data.view(-1).narrow( + 0, source_offset, num_elements), + non_blocking=True) if param_id not in self.accumulated_grads_in_cpu: self.accumulated_grads_in_cpu[param_id] = buffer_to_accumulate_to_in_cpu() @@ -1177,10 +1100,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] - dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow( - 0, - dest_offset, - num_elements) + dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow(0, dest_offset, num_elements) src_tensor = param.grad.view(-1).narrow(0, source_offset, num_elements) if not self.fp16_master_weights_and_gradients: @@ -1220,16 +1140,13 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # Sum across all model parallel GPUs. total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.SUM, - group=self.dp_process_group) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=self.dp_process_group) self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) total_norm = total_norm_cuda[0].item()**(1. 
/ norm_type) - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: + if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: total_norm = -1 return total_norm @@ -1258,17 +1175,13 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): total_size += param_in_partition.numel() see_memory_usage(f"before copying {total_size} gradients into partition") - self.grads_in_partition = torch.empty( - int(total_size), - dtype=self.dtype, - device=get_accelerator().current_device_name()) + self.grads_in_partition = torch.empty(int(total_size), + dtype=self.dtype, + device=get_accelerator().current_device_name()) see_memory_usage(f"after copying {total_size} gradients into partition") # The allreduce buffer will be rewritten. Copy the gradients in partition to a new buffer - new_grad_tensor = self.grads_in_partition.view(-1).narrow( - 0, - self.grads_in_partition_offset, - param.numel()) + new_grad_tensor = self.grads_in_partition.view(-1).narrow(0, self.grads_in_partition_offset, param.numel()) new_grad_tensor.copy_(param.grad.view(-1)) param.grad.data = new_grad_tensor.data.view_as(param.grad) #print(f"Grad norm after copy to contiguous_buffer {param.grad.data.norm()}") @@ -1279,17 +1192,16 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): if self.extra_large_param_to_reduce is not None: assert len(self.params_in_ipg_bucket) == 1, "more than 1 param in ipg bucket, this shouldn't happen" _, _, param_id = self.params_in_ipg_bucket[0] - assert self.get_param_id( - self.extra_large_param_to_reduce) == param_id, "param in ipg bucket does not match extra-large param" + assert self.get_param_id(self.extra_large_param_to_reduce + ) == param_id, "param in ipg bucket does not match extra-large param" self.average_tensor(self.extra_large_param_to_reduce.grad.view(-1)) self.extra_large_param_to_reduce = None else: self.average_tensor(self.ipg_buffer[self.ipg_index]) else: - self.buffered_reduce_fallback( - None, - self.grads_in_ipg_bucket, - elements_per_buffer=self.elements_in_ipg_bucket) + self.buffered_reduce_fallback(None, + self.grads_in_ipg_bucket, + elements_per_buffer=self.elements_in_ipg_bucket) if self.overlap_comm: stream = self.reduction_stream @@ -1324,8 +1236,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): elif self.contiguous_gradients: self.copy_grads_in_partition(param) else: # zero stage 1 - partition only optimizer state - if self.contiguous_gradients and self.is_param_in_current_partition[ - param_id]: + if self.contiguous_gradients and self.is_param_in_current_partition[param_id]: self.copy_grads_in_partition(param) self.grads_in_ipg_bucket = [] @@ -1339,6 +1250,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.reduce_independent_p_g_buckets_and_remove_grads(param, i) def zero_reduced_gradients(self, partition_id, i): + def are_all_related_partitions_reduced(params_id): for partition_id in self.param_to_partition_ids[i][params_id]: if not self.is_partition_reduced[i][partition_id]: @@ -1358,29 +1270,23 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.sequential_execution(print_func, message) def get_grads_to_reduce(self, i, partition_id): + def get_reducible_portion(key): grad = self.param_dict[key].grad total_elements = grad.numel() start = self.grad_start_offset[i][partition_id][key] - num_elements = min( - total_elements - start, - self.partition_size[i] - - self.grad_partition_insertion_offset[i][partition_id][key]) + num_elements = min(total_elements - start, + self.partition_size[i] - 
self.grad_partition_insertion_offset[i][partition_id][key]) if not pg_correctness_test: if num_elements == total_elements: return grad else: - return grad.contiguous().view(-1).narrow(0, - int(start), - int(num_elements)) + return grad.contiguous().view(-1).narrow(0, int(start), int(num_elements)) else: if num_elements == total_elements: return grad.clone() else: - return grad.clone().contiguous().view(-1).narrow( - 0, - int(start), - int(num_elements)) + return grad.clone().contiguous().view(-1).narrow(0, int(start), int(num_elements)) grads_to_reduce = [] for key in self.is_grad_computed[i][partition_id]: @@ -1456,11 +1362,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)): buf.copy_(synced) - def allreduce_no_retain(self, - bucket, - numel_per_bucket=500000000, - rank=None, - log=None): + def allreduce_no_retain(self, bucket, numel_per_bucket=500000000, rank=None, log=None): small_bucket = [] numel = 0 for tensor in bucket: @@ -1475,18 +1377,11 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # allows using reduction of gradients instead of using all_reduce - def buffered_reduce_fallback(self, - rank, - grads, - elements_per_buffer=500000000, - log=None): + def buffered_reduce_fallback(self, rank, grads, elements_per_buffer=500000000, log=None): split_buckets = split_half_float_double(grads) for i, bucket in enumerate(split_buckets): - self.allreduce_no_retain(bucket, - numel_per_bucket=elements_per_buffer, - rank=rank, - log=log) + self.allreduce_no_retain(bucket, numel_per_bucket=elements_per_buffer, rank=rank, log=log) ############################################################################# ############################################################################# @@ -1531,11 +1426,11 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): if (current_index >= start_index and current_index < end_index): params_in_partition.append(tensor) - elif start_index > current_index and start_index < (current_index + - tensor_size): + elif start_index > current_index and start_index < (current_index + tensor_size): params_in_partition.append(tensor) - assert (first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" + assert (first_offset == 0 + ), "This can happen either zero or only once as this must be the first tensor in the partition" first_offset = start_index - current_index else: @@ -1589,9 +1484,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): if norm_type == inf: total_norm = max(g.data.abs().max() for g in gradients) total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.MAX, - group=self.dp_process_group) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=self.dp_process_group) # Take max across all GPUs. self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.MAX) @@ -1609,16 +1502,13 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): total_norm += param_norm.item()**2 # Sum across all model parallel GPUs. total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.SUM, - group=self.dp_process_group) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=self.dp_process_group) self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) total_norm = total_norm_cuda[0].item()**(1. 
/ norm_type) - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: + if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: total_norm = -1 return total_norm @@ -1626,13 +1516,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # creates a flat fused tensor from the tensor list starting at the first_offset # in the first tensor of the list. If there are not enough elements in the tensor # list then the flat tensor will be padded with zeros - def get_flat_partition(self, - tensor_list, - first_offset, - partition_size, - dtype, - device, - return_tensor_list=False): + def get_flat_partition(self, tensor_list, first_offset, partition_size, dtype, device, return_tensor_list=False): flat_tensor_list = [] current_size = 0 for i, tensor in enumerate(tensor_list): @@ -1655,10 +1539,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # we need a narrow view of the tensor based on the tensor offset and number of elements that # we need from this tensor if tensor_offset > 0 or num_elements < tensor.numel(): - flat_tensor_list.append(tensor.contiguous().view(-1).narrow( - 0, - int(tensor_offset), - int(num_elements))) + flat_tensor_list.append(tensor.contiguous().view(-1).narrow(0, int(tensor_offset), int(num_elements))) else: flat_tensor_list.append(tensor) @@ -1666,10 +1547,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # this means its the last partition and does not align with the dp boundary. We need to pad before flattening if current_size < partition_size: - flat_tensor_list.append( - torch.zeros(int(partition_size - current_size), - dtype=dtype, - device=device)) + flat_tensor_list.append(torch.zeros(int(partition_size - current_size), dtype=dtype, device=device)) if return_tensor_list: return flat_tensor_list @@ -1715,9 +1593,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): def override_loss_scale(self, loss_scale): if loss_scale != self.external_loss_scale: - logger.info( - f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}' - ) + logger.info(f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}') self.custom_loss_scaler = True self.external_loss_scale = loss_scale @@ -1727,14 +1603,10 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): for i, group in enumerate(self.bit16_groups): partition_id = dist.get_rank(group=self.real_dp_process_group[i]) if self.cpu_offload: - norm_groups.append( - self.complete_grad_norm_calculation_for_cpu_offload( - self.params_in_partition[i])) + norm_groups.append(self.complete_grad_norm_calculation_for_cpu_offload(self.params_in_partition[i])) single_grad_partition = self.single_partition_of_fp32_groups[i].grad else: - norm_groups.append( - self.get_grad_norm_direct(self.averaged_gradients[i], - self.params_in_partition[i])) + norm_groups.append(self.get_grad_norm_direct(self.averaged_gradients[i], self.params_in_partition[i])) if self.has_moe_layers: self._average_expert_grad_norms(norm_groups) @@ -1745,9 +1617,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): def get_bit16_param_group(self, group_no): bit16_partitions = self.parallel_partitioned_bit16_groups[group_no] partition_id = dist.get_rank(group=self.real_dp_process_group[group_no]) - return [ - bit16_partitions[dist.get_rank(group=self.real_dp_process_group[group_no])] - ] + return [bit16_partitions[dist.get_rank(group=self.real_dp_process_group[group_no])]] def _optimizer_step(self, group_no): original_param_groups = self.optimizer.param_groups @@ -1802,15 +1672,13 @@ 
class DeepSpeedZeroOptimizer(ZeROOptimizer): partition_id = dist.get_rank(group=self.real_dp_process_group[i]) if self.cpu_offload: single_grad_partition = self.single_partition_of_fp32_groups[i].grad - self.unscale_and_clip_grads([single_grad_partition], - scaled_global_grad_norm) + self.unscale_and_clip_grads([single_grad_partition], scaled_global_grad_norm) self.stop_timers([OPTIMIZER_GRADIENTS]) self.start_timers([OPTIMIZER_STEP]) self._optimizer_step(i) from deepspeed.ops.adam import DeepSpeedCPUAdam - if not (type(self.optimizer) == DeepSpeedCPUAdam - and self.dtype == torch.half): + if not (type(self.optimizer) == DeepSpeedCPUAdam and self.dtype == torch.half): bit16_partitions = self.parallel_partitioned_bit16_groups[i] fp32_partition = self.single_partition_of_fp32_groups[i] bit16_partitions[partition_id].data.copy_(fp32_partition.data) @@ -1822,12 +1690,10 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # create a flat gradients for parameters updated by this process # If we are last partition, ensure we have same size grads and partition size, if not pad with zero tensors - if partition_id == dist.get_world_size( - group=self.real_dp_process_group[i]) - 1: + if partition_id == dist.get_world_size(group=self.real_dp_process_group[i]) - 1: single_grad_partition = self.flatten_dense_tensors_aligned( self.averaged_gradients[i], - int(self.partition_size[i])).to( - self.single_partition_of_fp32_groups[i].dtype) + int(self.partition_size[i])).to(self.single_partition_of_fp32_groups[i].dtype) else: single_grad_partition = self.flatten(self.averaged_gradients[i]).to( self.single_partition_of_fp32_groups[i].dtype) @@ -1841,8 +1707,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.averaged_gradients[i] = None - self.unscale_and_clip_grads([single_grad_partition], - scaled_global_grad_norm) + self.unscale_and_clip_grads([single_grad_partition], scaled_global_grad_norm) self.stop_timers([OPTIMIZER_GRADIENTS]) # Step 3:- run the optimizer if no offloading @@ -1863,11 +1728,10 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.start_timers([OPTIMIZER_ALLGATHER]) # Gather the updated weights from everyone. # Then all partitions of the model parameters are updated and ready for next round forward. 
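The hunk below reflows the all_gather_dp_groups call that runs after the optimizer step. Conceptually, each rank has updated only its own flat bit16 partition, and the all-gather stitches the partitions back into the full flat parameter group on every rank. A single-process stand-in using plain concatenation (no torch.distributed; the tensors and sizes are illustrative):

import torch

world_size = 4
full = torch.arange(12, dtype=torch.float32)      # flat, padded parameter group
partitions = list(torch.chunk(full, world_size))  # one partition per rank

# each "rank" updates only its own partition...
for rank, part in enumerate(partitions):
    part += rank

# ...then the all-gather re-assembles the full flat tensor everywhere
gathered = torch.cat(partitions)
print(gathered)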
- all_gather_dp_groups( - partitioned_param_groups=self.parallel_partitioned_bit16_groups, - dp_process_group=self.real_dp_process_group, - start_alignment_factor=self.nccl_start_alignment_factor, - allgather_bucket_size=self.allgather_bucket_size) + all_gather_dp_groups(partitioned_param_groups=self.parallel_partitioned_bit16_groups, + dp_process_group=self.real_dp_process_group, + start_alignment_factor=self.nccl_start_alignment_factor, + allgather_bucket_size=self.allgather_bucket_size) self.stop_timers([OPTIMIZER_ALLGATHER]) @@ -1882,24 +1746,23 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): @torch.no_grad() def update_lp_params(self): - for i, (bit16_partitions, fp32_partition) in enumerate(zip(self.parallel_partitioned_bit16_groups, self.single_partition_of_fp32_groups)): + for i, (bit16_partitions, fp32_partition) in enumerate( + zip(self.parallel_partitioned_bit16_groups, self.single_partition_of_fp32_groups)): partition_id = dist.get_rank(group=self.real_dp_process_group[i]) bit16_partitions[partition_id].data.copy_(fp32_partition.data) # print_rank_0(f'update_lp_params {i=} {partition_id=}', force=True) # if i == 0: # print_rank_0(f'{fp32_partition[:10]=}', force=True) - all_gather_dp_groups( - partitioned_param_groups=self.parallel_partitioned_bit16_groups, - dp_process_group=self.real_dp_process_group, - start_alignment_factor=self.nccl_start_alignment_factor, - allgather_bucket_size=self.allgather_bucket_size) + all_gather_dp_groups(partitioned_param_groups=self.parallel_partitioned_bit16_groups, + dp_process_group=self.real_dp_process_group, + start_alignment_factor=self.nccl_start_alignment_factor, + allgather_bucket_size=self.allgather_bucket_size) def _average_expert_grad_norms(self, norm_groups): for i, norm in enumerate(norm_groups): if self.is_moe_param_group[i]: - scaled_norm = norm * 1.0 / float( - dist.get_world_size(group=self.real_dp_process_group[i])) + scaled_norm = norm * 1.0 / float(dist.get_world_size(group=self.real_dp_process_group[i])) scaled_norm_tensor = torch.tensor(scaled_norm, device=get_accelerator().device_name(), dtype=torch.float) @@ -1943,14 +1806,11 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): def has_overflow(self, partition_gradients=True): if partition_gradients: - overflow = self.local_overflow if self.cpu_offload else self.has_overflow_partitioned_grads_serial( - ) + overflow = self.local_overflow if self.cpu_offload else self.has_overflow_partitioned_grads_serial() overflow_gpu = get_accelerator().ByteTensor([overflow]) '''This will capture overflow across all data parallel and expert parallel process Since expert parallel process are a subset of data parallel process''' - dist.all_reduce(overflow_gpu, - op=dist.ReduceOp.MAX, - group=self.dp_process_group) + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.dp_process_group) else: params = [] @@ -2087,9 +1947,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): optimizer_groups_state = [] for i, group in enumerate(self.optimizer.param_groups): p = group['params'][0] - lean_optimizer_state = self._get_state_without_padding( - self.optimizer.state[p], - self.groups_padding[i]) + lean_optimizer_state = self._get_state_without_padding(self.optimizer.state[p], self.groups_padding[i]) optimizer_groups_state.append(lean_optimizer_state) return optimizer_groups_state @@ -2117,8 +1975,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): state_dict[BASE_OPTIMIZER_STATE] = self.optimizer.state_dict() # Remove paddings for DP alignment to enable loading for other alignment values - 
fp32_groups_without_padding = self._get_groups_without_padding( - self.single_partition_of_fp32_groups) + fp32_groups_without_padding = self._get_groups_without_padding(self.single_partition_of_fp32_groups) state_dict[SINGLE_PARTITION_OF_FP32_GROUPS] = fp32_groups_without_padding state_dict[ @@ -2140,17 +1997,13 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): for i in range(len(self.single_partition_of_fp32_groups)): partition_id = dist.get_rank(group=self.real_dp_process_group[i]) - merged_partitions = [ - sd[SINGLE_PARTITION_OF_FP32_GROUPS][i] for sd in all_state_dict - ] + merged_partitions = [sd[SINGLE_PARTITION_OF_FP32_GROUPS][i] for sd in all_state_dict] if self.is_moe_group(self.optimizer.param_groups[i]): - ranks = self.get_ep_ranks( - group_name=self.optimizer.param_groups[i]['name']) + ranks = self.get_ep_ranks(group_name=self.optimizer.param_groups[i]['name']) merged_partitions = [merged_partitions[i] for i in ranks] flat_merged_partitions = self.flatten_dense_tensors_aligned( merged_partitions, - self.nccl_start_alignment_factor * - dist.get_world_size(group=self.real_dp_process_group[i])) + self.nccl_start_alignment_factor * dist.get_world_size(group=self.real_dp_process_group[i])) dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions, i) merged_single_partition_of_fp32_groups.append(dp_partitions[partition_id]) @@ -2159,7 +2012,8 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # Restore base optimizer fp32 weights from ZeRO fp16 or bfloat16 weights def _restore_from_bit16_weights(self): - for group_id, (bit16_partitions, fp32_partition) in enumerate(zip(self.parallel_partitioned_bit16_groups, self.single_partition_of_fp32_groups)): + for group_id, (bit16_partitions, fp32_partition) in enumerate( + zip(self.parallel_partitioned_bit16_groups, self.single_partition_of_fp32_groups)): partition_id = dist.get_rank(group=self.real_dp_process_group[group_id]) fp32_partition.data.copy_(bit16_partitions[partition_id].data) @@ -2172,11 +2026,8 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): partition_id = dist.get_rank(group=self.real_dp_process_group[group_id]) alignment = dist.get_world_size(group=self.real_dp_process_group[group_id]) if torch.is_tensor(all_partition_states[0]): - flat_merged_partitions = self.flatten_dense_tensors_aligned( - all_partition_states, - alignment) - dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions, - group_id) + flat_merged_partitions = self.flatten_dense_tensors_aligned(all_partition_states, alignment) + dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions, group_id) return dp_partitions[partition_id] else: # Assume non-tensor states are not partitioned and equal across ranks, so return first one @@ -2211,25 +2062,15 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): base_optimizer_group_states = [] for i in range(len(self.optimizer.param_groups)): partition_states = {} - all_partition_group_states = [ - sd[BASE_OPTIMIZER_STATE][i] for sd in all_state_dict - ] + all_partition_group_states = [sd[BASE_OPTIMIZER_STATE][i] for sd in all_state_dict] if self.is_moe_group(self.optimizer.param_groups[i]): - ranks = self.get_ep_ranks( - group_name=self.optimizer.param_groups[i]['name']) - all_partition_group_states = [ - all_partition_group_states[i] for i in ranks - ] + ranks = self.get_ep_ranks(group_name=self.optimizer.param_groups[i]['name']) + all_partition_group_states = [all_partition_group_states[i] for i in ranks] for key in all_partition_group_states[0].keys(): - all_partition_states = [ - 
all_states[key] for all_states in all_partition_group_states - ] - partition_states[key] = self._partition_base_optimizer_state( - key, - all_partition_states, - i) + all_partition_states = [all_states[key] for all_states in all_partition_group_states] + partition_states[key] = self._partition_base_optimizer_state(key, all_partition_states, i) base_optimizer_group_states.append(partition_states) self._restore_base_optimizer_state(base_optimizer_group_states) @@ -2240,18 +2081,11 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): load_from_fp32_weights=False, checkpoint_folder=None): if checkpoint_folder: - self._load_universal_checkpoint(checkpoint_folder, - load_optimizer_states, - load_from_fp32_weights) + self._load_universal_checkpoint(checkpoint_folder, load_optimizer_states, load_from_fp32_weights) else: - self._load_legacy_checkpoint(state_dict_list, - load_optimizer_states, - load_from_fp32_weights) + self._load_legacy_checkpoint(state_dict_list, load_optimizer_states, load_from_fp32_weights) - def _load_universal_checkpoint(self, - checkpoint_folder, - load_optimizer_states, - load_from_fp32_weights): + def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights): self._load_hp_checkpoint_state(checkpoint_folder) @property @@ -2268,16 +2102,10 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): for lp in self.bit16_groups[i]: if lp._hp_mapping is not None: #print(f"Loading {self.param_names[lp]} {tp_rank=} {tp_world_size=}") - lp.load_hp_checkpoint_state( - os.path.join(checkpoint_dir, - self.param_names[lp]), - tp_rank, - tp_world_size) + lp.load_hp_checkpoint_state(os.path.join(checkpoint_dir, self.param_names[lp]), tp_rank, + tp_world_size) - def _load_legacy_checkpoint(self, - state_dict_list, - load_optimizer_states=True, - load_from_fp32_weights=False): + def _load_legacy_checkpoint(self, state_dict_list, load_optimizer_states=True, load_from_fp32_weights=False): r"""Loading ZeRO checkpoint Arguments: @@ -2308,8 +2136,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): dp_rank = dist.get_rank(group=self.dp_process_group) current_rank_sd = state_dict_list[dp_rank] self.loss_scaler = current_rank_sd.get('loss_scaler', self.loss_scaler) - self.dynamic_loss_scale = current_rank_sd.get('dynamic_loss_scale', - self.dynamic_loss_scale) + self.dynamic_loss_scale = current_rank_sd.get('dynamic_loss_scale', self.dynamic_loss_scale) self.overflow = current_rank_sd.get('overflow', self.overflow) self.clip_grad = current_rank_sd.get(CLIP_GRAD, self.clip_grad) @@ -2347,8 +2174,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self._restore_elastic_base_optimizer_state(state_dict_list) else: # loading an elastic checkpoint into rigid exec - self._restore_base_optimizer_state( - current_rank_sd[BASE_OPTIMIZER_STATE]) + self._restore_base_optimizer_state(current_rank_sd[BASE_OPTIMIZER_STATE]) # At this point, the optimizer's references to the model's fp32 parameters are up to date. # The optimizer's hyperparameters and internal buffers are also up to date. @@ -2371,7 +2197,8 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self._restore_from_elastic_fp32_weights(state_dict_list) else: # For non-elastic checkpoint, simply copying from saved weights of current rank is sufficient. 
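The hunk just below rewraps the loop that restores fp32 partitions from a non-elastic checkpoint. The saved partition can be shorter than the live one when DP alignment padding differs, so it is padded to the current numel before the copy. A sketch of that step (pad_to is an illustrative stand-in for the script's _get_padded_tensor helper, whose body is not part of this diff):

import torch

def pad_to(saved, numel):
    # return `saved` unchanged if it is already long enough,
    # otherwise zero-pad it on the right up to `numel` elements
    if saved.numel() >= numel:
        return saved
    padded = torch.zeros(numel, dtype=saved.dtype, device=saved.device)
    padded.narrow(0, 0, saved.numel()).copy_(saved)
    return padded

current = torch.empty(8)
saved = torch.ones(6)
current.data.copy_(pad_to(saved, current.numel()))
print(current)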
- for current, saved in zip(self.single_partition_of_fp32_groups, current_rank_sd[SINGLE_PARTITION_OF_FP32_GROUPS]): + for current, saved in zip(self.single_partition_of_fp32_groups, + current_rank_sd[SINGLE_PARTITION_OF_FP32_GROUPS]): src_tensor = _get_padded_tensor(saved, current.numel()) current.data.copy_(src_tensor.data) else: @@ -2391,9 +2218,7 @@ def _handle_overflow(cpu_sum, x, i): if not math.isfinite(float(v)): t_i = v_i break - logger.info( - f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}" - ) + logger.info(f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}") def estimate_zero2_model_states_mem_needs(total_params, @@ -2416,9 +2241,7 @@ def estimate_zero2_model_states_mem_needs(total_params, def model_to_params(model): # shared params calculated only once - total_params = sum( - dict((p.data_ptr(), - p.numel()) for p in model.parameters()).values()) + total_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) return total_params @@ -2446,11 +2269,10 @@ def estimate_zero2_model_states_mem_needs_all_live(model, total_params = model_to_params(model) - estimate_zero2_model_states_mem_needs_all_cold( - total_params=total_params, - num_gpus_per_node=num_gpus_per_node, - num_nodes=num_nodes, - additional_buffer_factor=additional_buffer_factor) + estimate_zero2_model_states_mem_needs_all_cold(total_params=total_params, + num_gpus_per_node=num_gpus_per_node, + num_nodes=num_nodes, + additional_buffer_factor=additional_buffer_factor) def estimate_zero2_model_states_mem_needs_all_cold(total_params, @@ -2474,6 +2296,7 @@ def estimate_zero2_model_states_mem_needs_all_cold(total_params, - ``additional_buffer_factor``: estimation factor (defaults to 1.5): """ + def format_options(cpu_offload): enabled = [] device = f'{OffloadDeviceEnum.cpu:4}' if cpu_offload else "none" @@ -2482,19 +2305,16 @@ def estimate_zero2_model_states_mem_needs_all_cold(total_params, nodes_str = "nodes" if num_nodes > 1 else "node" gpus_str = "GPUs" if num_gpus_per_node > 1 else "GPU" - print( - "Estimated memory needed for params, optim states and gradients for a:\n" - f"HW: Setup with {num_nodes} {nodes_str}, {num_gpus_per_node} {gpus_str} per node.\n" - f"SW: Model with {int(total_params/1e6)}M total params.") + print("Estimated memory needed for params, optim states and gradients for a:\n" + f"HW: Setup with {num_nodes} {nodes_str}, {num_gpus_per_node} {gpus_str} per node.\n" + f"SW: Model with {int(total_params/1e6)}M total params.") print(" per CPU | per GPU | Options") for cpu_offload in [True, False]: - cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs( - total_params=total_params, - num_gpus_per_node=num_gpus_per_node, - num_nodes=num_nodes, - cpu_offload=cpu_offload, - additional_buffer_factor=additional_buffer_factor - ) + cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs(total_params=total_params, + num_gpus_per_node=num_gpus_per_node, + num_nodes=num_nodes, + cpu_offload=cpu_offload, + additional_buffer_factor=additional_buffer_factor) options_str = format_options(cpu_offload=cpu_offload) print(f" {cpu_mem/2**30:7.2f}GB | {gpu_mem/2**30:6.2f}GB | {options_str}") diff --git a/deepspeed/runtime/zero/tiling.py b/deepspeed/runtime/zero/tiling.py index d66caea85..8c447d781 100644 --- a/deepspeed/runtime/zero/tiling.py +++ b/deepspeed/runtime/zero/tiling.py @@ -27,6 +27,7 @@ def split_tensor_along_last_dim(tensor, partitions, contiguous_split_chunks=Fals class TiledLinear(torch.nn.Module): + def __init__(self, 
in_features, out_features, @@ -114,10 +115,7 @@ class TiledLinear(torch.nn.Module): local_bias = bias if in_id == (in_splits - 1) else False local_in_dim = self.in_parts[in_id + 1] - self.in_parts[in_id] - local = linear_cls(local_in_dim, - local_out_dim, - bias=local_bias, - **kwargs) + local = linear_cls(local_in_dim, local_out_dim, bias=local_bias, **kwargs) self.linears[out_id].append(local) # Optionally initialize with a known tensor @@ -127,13 +125,12 @@ class TiledLinear(torch.nn.Module): def forward(self, input_): if self.in_splits > 1 and not self.input_is_already_split: input_parts = partition(input_.shape[-1], self.in_splits) - split_sizes = [ - input_parts[p + 1] - input_parts[p] for p in range(self.in_splits) - ] + split_sizes = [input_parts[p + 1] - input_parts[p] for p in range(self.in_splits)] inputs = self._split_global_input(input_, split_sizes) elif self.in_splits > 1: inputs = input_ - assert len(inputs) == self.in_splits, f"Col splits {self.in_splits} does not match input splits {len(inputs)}" + assert len( + inputs) == self.in_splits, f"Col splits {self.in_splits} does not match input splits {len(inputs)}" else: # no splits inputs = [input_] @@ -260,6 +257,7 @@ class TiledLinearReturnBias(TiledLinear): """Wrapper for a Linear class that returns its own bias parameter, such as used by Megatron-LM. """ + def _reduce_local_output(self, in_id, out_id, current_out, new_out): """Reduces output tensors, but not the returned bias. """ if current_out is not None: @@ -273,10 +271,7 @@ class TiledLinearReturnBias(TiledLinear): tensor, bias = new_out assert tensor is not None - tensor = super()._reduce_local_output(in_id=in_id, - out_id=out_id, - current_out=old_tensor, - new_out=tensor) + tensor = super()._reduce_local_output(in_id=in_id, out_id=out_id, current_out=old_tensor, new_out=tensor) if bias is None: bias = old_bias diff --git a/deepspeed/runtime/zero/utils.py b/deepspeed/runtime/zero/utils.py index eafddad3f..1d68e5226 100755 --- a/deepspeed/runtime/zero/utils.py +++ b/deepspeed/runtime/zero/utils.py @@ -15,9 +15,7 @@ from deepspeed.accelerator import get_accelerator def _initialize_parameter_parallel_groups(parameter_parallel_size=None): data_parallel_size = int(dist.get_world_size()) parameter_parallel_size = parameter_parallel_size or data_parallel_size - logger.info("data_parallel_size: %s, parameter_parallel_size: %s", - data_parallel_size, - parameter_parallel_size) + logger.info("data_parallel_size: %s, parameter_parallel_size: %s", data_parallel_size, parameter_parallel_size) assert data_parallel_size % parameter_parallel_size == 0, \ 'world size should be divisible by parameter parallel size' rank = dist.get_rank() @@ -34,12 +32,7 @@ class ZeRORuntimeException(Exception): pass -ZERO_SUPPORTED_OPTIMIZERS = [ - torch.optim.Adam, - torch.optim.AdamW, - FusedAdam, - DeepSpeedCPUAdam -] +ZERO_SUPPORTED_OPTIMIZERS = [torch.optim.Adam, torch.optim.AdamW, FusedAdam, DeepSpeedCPUAdam] # Add apex FusedAdam to supported list if apex is installed try: @@ -52,9 +45,7 @@ except ImportError: def is_zero_supported_optimizer(optimizer): if dist.get_rank() == 0: - logger.info( - f'Checking ZeRO support for optimizer={optimizer.__class__.__name__} type={type(optimizer)}' - ) + logger.info(f'Checking ZeRO support for optimizer={optimizer.__class__.__name__} type={type(optimizer)}') return type(optimizer) in ZERO_SUPPORTED_OPTIMIZERS diff --git a/deepspeed/utils/comms_logging.py b/deepspeed/utils/comms_logging.py index d6b041064..1592450b3 100644 --- 
a/deepspeed/utils/comms_logging.py +++ b/deepspeed/utils/comms_logging.py @@ -56,6 +56,7 @@ def calc_bw_log(comm_op, size, duration): class CommsLogger: + def __init__(self): from deepspeed.comm.constants import COMMS_LOGGER_VERBOSE_DEFAULT, COMMS_LOGGER_DEBUG_DEFAULT, COMMS_LOGGER_PROF_OPS_DEFAULT, COMMS_LOGGER_PROF_ALL_DEFAULT, COMMS_LOGGER_ENABLED_DEFAULT self.comms_dict = {} @@ -111,8 +112,7 @@ class CommsLogger: # TODO: Add to tensorboard if self.verbose: n = dist.get_world_size() - log_str = f"rank={dist.get_rank()} | comm op: " + record_name + " | time (ms): {:.2f}".format( - latency) + log_str = f"rank={dist.get_rank()} | comm op: " + record_name + " | time (ms): {:.2f}".format(latency) log_str += " | msg size: " + convert_size(msg_size) log_str += " | algbw (Gbps): {:.2f} ".format(algbw) log_str += " | busbw (Gbps): {:.2f} ".format(busbw) diff --git a/deepspeed/utils/debug.py b/deepspeed/utils/debug.py index f3f3ea56d..58ff6dae1 100644 --- a/deepspeed/utils/debug.py +++ b/deepspeed/utils/debug.py @@ -128,6 +128,7 @@ def log_rank_file(rank, *msgs): def print_backward_tensors(tensor): + def _print_bwd_tensors(grad_fn): print(f"Backward tensors in {grad_fn}") for funcs in grad_fn.next_functions: @@ -135,9 +136,7 @@ def print_backward_tensors(tensor): try: tensor = getattr(funcs[0], 'variable') print(funcs[0]) - print( - f"Tensor - id: {id(tensor)}, shape: {tensor.shape}, data: {tensor}, grad: {tensor.grad}" - ) + print(f"Tensor - id: {id(tensor)}, shape: {tensor.shape}, data: {tensor}, grad: {tensor.grad}") except AttributeError as e: _print_bwd_tensors(funcs[0]) diff --git a/deepspeed/utils/groups.py b/deepspeed/utils/groups.py index a4d403a7d..a2fe86cd8 100644 --- a/deepspeed/utils/groups.py +++ b/deepspeed/utils/groups.py @@ -51,8 +51,7 @@ def initialize(ep_size=1, mpu=None): def _ensure_divisibility(numerator, denominator): """Ensure that numerator is divisible by the denominator.""" - assert numerator % denominator == 0, '{} is not divisible by {}'.format( - numerator, denominator) + assert numerator % denominator == 0, '{} is not divisible by {}'.format(numerator, denominator) # Not currently used. Helper function to create a model (tensor) parallel group. @@ -78,8 +77,7 @@ def _create_model_parallel(model_parallel_size_): with a total of 16 GPUs, rank 0 to 7 belong to the first box and ranks 8 to 15 belong to the second box. """ - log_dist(f'Creating model parallel group with size {model_parallel_size_}', - ranks=[0]) + log_dist(f'Creating model parallel group with size {model_parallel_size_}', ranks=[0]) # Get world size and rank. Ensure some consistencies. 
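The _create_expert_and_data_parallel hunks that follow only shorten log_dist calls, but the rank layout those loops build is easy to verify without torch.distributed. A plain-Python sketch for an assumed world_size of 8 and expert_parallel_size_ of 2:

world_size = 8
expert_parallel_size_ = 2

# expert-data-parallel groups: ranks that hold the same expert shard
edp_groups = [list(range(i, world_size, expert_parallel_size_))
              for i in range(expert_parallel_size_)]

# expert-parallel groups: consecutive ranks that together hold all experts
ep_groups = [list(range(i * expert_parallel_size_, (i + 1) * expert_parallel_size_))
             for i in range(world_size // expert_parallel_size_)]

print(edp_groups)  # [[0, 2, 4, 6], [1, 3, 5, 7]]
print(ep_groups)   # [[0, 1], [2, 3], [4, 5], [6, 7]]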
assert dist.is_initialized() world_size = dist.get_world_size() @@ -121,9 +119,7 @@ def _create_expert_and_data_parallel(expert_parallel_size_): """ assert dist.is_initialized() - log_dist( - f'Creating expert and data parallel groups with size {expert_parallel_size_}', - ranks=[0]) + log_dist(f'Creating expert and data parallel groups with size {expert_parallel_size_}', ranks=[0]) world_size = dist.get_world_size() rank = dist.get_rank() @@ -139,9 +135,7 @@ def _create_expert_and_data_parallel(expert_parallel_size_): for i in range(expert_parallel_size_): ranks = range(i, world_size, expert_parallel_size_) group = dist.new_group(ranks) - log_dist( - f'Creating expert data parallel process group named {group_name} with ranks: {list(ranks)}', - [0]) + log_dist(f'Creating expert data parallel process group named {group_name} with ranks: {list(ranks)}', [0]) if i == (rank % expert_parallel_size_): _EXPERT_DATA_PARALLEL_GROUP[group_name] = group @@ -153,9 +147,7 @@ def _create_expert_and_data_parallel(expert_parallel_size_): for i in range(world_size // expert_parallel_size_): ranks = range(i * expert_parallel_size_, (i + 1) * expert_parallel_size_) group = dist.new_group(ranks) - log_dist( - f'creating expert parallel process group named {group_name} with ranks: {list(ranks)}', - [0]) + log_dist(f'creating expert parallel process group named {group_name} with ranks: {list(ranks)}', [0]) if i == (rank // expert_parallel_size_): _EXPERT_PARALLEL_GROUP[group_name] = group diff --git a/deepspeed/utils/init_on_device.py b/deepspeed/utils/init_on_device.py index 1b51efe60..8e83f7e31 100644 --- a/deepspeed/utils/init_on_device.py +++ b/deepspeed/utils/init_on_device.py @@ -35,13 +35,10 @@ class OnDevice(object): if device == "meta": if pkg_version.parse('1.10') > pkg_version.parse(torch.__version__): - raise NotImplementedError( - "Meta tensor support is not available, please upgrade to torch 1.10+" - ) + raise NotImplementedError("Meta tensor support is not available, please upgrade to torch 1.10+") + + def fp_tensor_constructor(self, fn: Callable, target_fp_dtype: torch.dtype) -> Callable: - def fp_tensor_constructor(self, - fn: Callable, - target_fp_dtype: torch.dtype) -> Callable: def wrapped_fn(*args, **kwargs) -> Tensor: if kwargs.get("device", None) is None: kwargs['device'] = self.device @@ -53,6 +50,7 @@ class OnDevice(object): return wrapped_fn def get_new_tensor_fn_for_dtype(self, dtype: torch.dtype) -> Callable: + def new_tensor(cls, *args) -> Tensor: tensor = OnDevice._orig_torch_empty(0, device=self.device).new_empty(*args) if tensor.is_floating_point(): diff --git a/deepspeed/utils/logging.py b/deepspeed/utils/logging.py index 95b371840..668c1260e 100644 --- a/deepspeed/utils/logging.py +++ b/deepspeed/utils/logging.py @@ -15,6 +15,7 @@ log_levels = { class LoggerFactory: + @staticmethod def create_logger(name=None, level=logging.INFO): """create a logger @@ -30,9 +31,8 @@ class LoggerFactory: if name is None: raise ValueError("name for logger cannot be None") - formatter = logging.Formatter( - "[%(asctime)s] [%(levelname)s] " - "[%(filename)s:%(lineno)d:%(funcName)s] %(message)s") + formatter = logging.Formatter("[%(asctime)s] [%(levelname)s] " + "[%(filename)s:%(lineno)d:%(funcName)s] %(message)s") logger_ = logging.getLogger(name) logger_.setLevel(level) diff --git a/deepspeed/utils/mixed_precision_linkage.py b/deepspeed/utils/mixed_precision_linkage.py index ac785ed9d..f0f5f5173 100644 --- a/deepspeed/utils/mixed_precision_linkage.py +++ 
b/deepspeed/utils/mixed_precision_linkage.py @@ -5,32 +5,14 @@ import types from deepspeed.utils import get_full_hp_param, get_full_hp_grad, get_hp_fragment_mapping -def link_hp_params(lp_param_list, - flat_hp_partition, - gradient_dict, - offload_gradient_dict, - use_offload, - param_group_index, - partition_start, - partition_size, - partition_optimizer_state, - dp_group): - local_lp_param_and_offset = _init_lp_to_hp_mapping(lp_param_list, - partition_start, - partition_size, - dp_group) +def link_hp_params(lp_param_list, flat_hp_partition, gradient_dict, offload_gradient_dict, use_offload, + param_group_index, partition_start, partition_size, partition_optimizer_state, dp_group): + local_lp_param_and_offset = _init_lp_to_hp_mapping(lp_param_list, partition_start, partition_size, dp_group) for lp_param, lp_start in local_lp_param_and_offset: - lp_param._hp_mapping = get_hp_fragment_mapping(lp_param, - lp_start, - flat_hp_partition, - gradient_dict, - offload_gradient_dict, - use_offload, - param_group_index, - partition_start, - partition_size, - partition_optimizer_state) + lp_param._hp_mapping = get_hp_fragment_mapping(lp_param, lp_start, flat_hp_partition, gradient_dict, + offload_gradient_dict, use_offload, param_group_index, + partition_start, partition_size, partition_optimizer_state) def _init_lp_to_hp_mapping(lp_param_list, partition_start, partition_size, dp_group): diff --git a/deepspeed/utils/nvtx.py b/deepspeed/utils/nvtx.py index 881000311..0fd62dc7b 100644 --- a/deepspeed/utils/nvtx.py +++ b/deepspeed/utils/nvtx.py @@ -6,6 +6,7 @@ from deepspeed.accelerator import get_accelerator def instrument_w_nvtx(func): """decorator that causes an NVTX range to be recorded for the duration of the function call.""" + def wrapped_fn(*args, **kwargs): get_accelerator().range_push(func.__qualname__) ret_val = func(*args, **kwargs) diff --git a/deepspeed/utils/tensor_fragment.py b/deepspeed/utils/tensor_fragment.py index 820473574..6487b04df 100644 --- a/deepspeed/utils/tensor_fragment.py +++ b/deepspeed/utils/tensor_fragment.py @@ -48,10 +48,7 @@ def get_full_hp_param(self, optim_state_key=None): reduce_buffer = torch.zeros_like(self, dtype=torch.float32).flatten() if self._hp_mapping is not None: lp_frag_address = self._hp_mapping.lp_fragment_address - reduce_fragment = torch.narrow(reduce_buffer, - 0, - lp_frag_address.start, - lp_frag_address.numel) + reduce_fragment = torch.narrow(reduce_buffer, 0, lp_frag_address.start, lp_frag_address.numel) if optim_state_key is None: hp_fragment = self._hp_mapping.hp_fragment else: @@ -72,21 +69,14 @@ def get_full_hp_grad(self): else: gradient_dict = hp_mapping.gradient_dict - if hp_mapping.param_group_index not in gradient_dict or gradient_dict[ - hp_mapping.param_group_index] is None: - raise ValueError( - "Gradients are only available immediately after backward and before engine step" - ) + if hp_mapping.param_group_index not in gradient_dict or gradient_dict[hp_mapping.param_group_index] is None: + raise ValueError("Gradients are only available immediately after backward and before engine step") - lp_grad_fragment = gradient_dict[hp_mapping.param_group_index][ - self._index_in_param_group] + lp_grad_fragment = gradient_dict[hp_mapping.param_group_index][self._index_in_param_group] hp_grad_fragment = lp_grad_fragment.to(torch.float32).flatten() lp_frag_address = self._hp_mapping.lp_fragment_address - reduce_fragment = torch.narrow(reduce_buffer, - 0, - lp_frag_address.start, - lp_frag_address.numel) + reduce_fragment = 
torch.narrow(reduce_buffer, 0, lp_frag_address.start, lp_frag_address.numel) if self.view(-1).shape == hp_grad_fragment.shape: reduce_buffer.data.copy_(hp_grad_fragment.data) @@ -150,16 +140,8 @@ def safe_get_full_grad(param): return None -def get_hp_fragment_mapping(lp_param, - lp_start, - flat_hp_partition, - gradient_dict, - offload_gradient_dict, - use_offload, - param_group_index, - partition_start, - partition_size, - optimizer_state_dict): +def get_hp_fragment_mapping(lp_param, lp_start, flat_hp_partition, gradient_dict, offload_gradient_dict, use_offload, + param_group_index, partition_start, partition_size, optimizer_state_dict): lp_end = lp_param.numel() + lp_start hp_start = partition_start hp_end = partition_start + partition_size @@ -170,25 +152,16 @@ def get_hp_fragment_mapping(lp_param, f'fragment start {fragment_start} should be < fragment_end {fragment_end}' fragment_numel = fragment_end - fragment_start - hp_frag_address = fragment_address(start=fragment_start - hp_start, - numel=fragment_numel) - hp_fragment_tensor = flat_hp_partition.narrow(0, - hp_frag_address.start, - hp_frag_address.numel) + hp_frag_address = fragment_address(start=fragment_start - hp_start, numel=fragment_numel) + hp_fragment_tensor = flat_hp_partition.narrow(0, hp_frag_address.start, hp_frag_address.numel) optim_fragment = { - key: value.narrow(0, - hp_frag_address.start, - hp_frag_address.numel) - for key, - value in optimizer_state_dict.items() + key: value.narrow(0, hp_frag_address.start, hp_frag_address.numel) + for key, value in optimizer_state_dict.items() if torch.is_tensor(value) and value.shape == flat_hp_partition.shape } - lp_frag_address = fragment_address(start=fragment_start - lp_start, - numel=fragment_numel) - lp_fragment_tensor = lp_param.flatten().narrow(0, - lp_frag_address.start, - lp_frag_address.numel) + lp_frag_address = fragment_address(start=fragment_start - lp_start, numel=fragment_numel) + lp_fragment_tensor = lp_param.flatten().narrow(0, lp_frag_address.start, lp_frag_address.numel) return tensor_fragment(lp_fragment=lp_fragment_tensor, lp_fragment_address=lp_frag_address, diff --git a/deepspeed/utils/timer.py b/deepspeed/utils/timer.py index 16865c5e5..4212a821d 100755 --- a/deepspeed/utils/timer.py +++ b/deepspeed/utils/timer.py @@ -18,9 +18,8 @@ except ImportError: class CudaEventTimer(object): - def __init__(self, - start_event: get_accelerator().Event, - end_event: get_accelerator().Event): + + def __init__(self, start_event: get_accelerator().Event, end_event: get_accelerator().Event): self.start_event = start_event self.end_event = end_event @@ -32,8 +31,10 @@ class CudaEventTimer(object): class SynchronizedWallClockTimer: """Group of timers. 
Borrowed from Nvidia Megatron code""" + class Timer: """Timer.""" + def __init__(self, name): self.name_ = name self.started_ = False @@ -102,14 +103,12 @@ class SynchronizedWallClockTimer: @staticmethod def memory_usage(): - alloc = "mem_allocated: {:.4f} GB".format(get_accelerator().memory_allocated() / - (1024 * 1024 * 1024)) - max_alloc = "max_mem_allocated: {:.4f} GB".format( - get_accelerator().max_memory_allocated() / (1024 * 1024 * 1024)) - cache = "cache_allocated: {:.4f} GB".format(get_accelerator().memory_cached() / - (1024 * 1024 * 1024)) - max_cache = "max_cache_allocated: {:.4f} GB".format( - get_accelerator().max_memory_cached() / (1024 * 1024 * 1024)) + alloc = "mem_allocated: {:.4f} GB".format(get_accelerator().memory_allocated() / (1024 * 1024 * 1024)) + max_alloc = "max_mem_allocated: {:.4f} GB".format(get_accelerator().max_memory_allocated() / + (1024 * 1024 * 1024)) + cache = "cache_allocated: {:.4f} GB".format(get_accelerator().memory_cached() / (1024 * 1024 * 1024)) + max_cache = "max_cache_allocated: {:.4f} GB".format(get_accelerator().max_memory_cached() / + (1024 * 1024 * 1024)) return " | {} | {} | {} | {}".format(alloc, max_alloc, cache, max_cache) def log(self, names, normalizer=1.0, reset=True, memory_breakdown=False, ranks=None): @@ -135,6 +134,7 @@ class SynchronizedWallClockTimer: class ThroughputTimer: + def __init__( self, batch_size, @@ -203,23 +203,19 @@ class ThroughputTimer: self.global_step_count, self.avg_samples_per_sec(), self.batch_size / self.step_elapsed_time, - round(get_accelerator().memory_allocated() / 1024**3, - 2), - round(get_accelerator().max_memory_allocated() / 1024**3, - 2), + round(get_accelerator().memory_allocated() / 1024**3, 2), + round(get_accelerator().max_memory_allocated() / 1024**3, 2), )) if self.monitor_memory: virt_mem = psutil.virtual_memory() swap = psutil.swap_memory() - self.logging( - "epoch={}/micro_step={}/global_step={}, vm %: {}, swap %: {}" - .format( - self.epoch_count, - self.micro_step_count, - self.global_step_count, - virt_mem.percent, - swap.percent, - )) + self.logging("epoch={}/micro_step={}/global_step={}, vm %: {}, swap %: {}".format( + self.epoch_count, + self.micro_step_count, + self.global_step_count, + virt_mem.percent, + swap.percent, + )) self.step_elapsed_time = 0 def avg_samples_per_sec(self): diff --git a/deepspeed/utils/zero_to_fp32.py b/deepspeed/utils/zero_to_fp32.py index f00e256bb..e84ebbd17 100755 --- a/deepspeed/utils/zero_to_fp32.py +++ b/deepspeed/utils/zero_to_fp32.py @@ -19,14 +19,8 @@ from collections import OrderedDict # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with # DeepSpeed data structures it has to be available in the current python environment. 
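The zero_to_fp32.py hunks below compact the stage-2 reconstruction loop, whose core is carving consecutive parameters back out of one merged flat fp32 vector with narrow().view(). A minimal sketch of that pattern with made-up shapes (not the script's real checkpoint layout or padding handling):

import torch
from collections import OrderedDict

param_shapes = OrderedDict([("weight", torch.Size([4, 3])), ("bias", torch.Size([4]))])
flat = torch.arange(16, dtype=torch.float32)  # pretend merged fp32 partition

state_dict, offset = OrderedDict(), 0
for name, shape in param_shapes.items():
    numel = shape.numel()
    # carve the next `numel` elements out of the flat vector and reshape them
    state_dict[name] = flat.narrow(0, offset, numel).view(shape)
    offset += numel

print({k: v.shape for k, v in state_dict.items()})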
from deepspeed.utils import logger -from deepspeed.checkpoint.constants import (DS_VERSION, - OPTIMIZER_STATE_DICT, - SINGLE_PARTITION_OF_FP32_GROUPS, - FP32_FLAT_GROUPS, - ZERO_STAGE, - PARTITION_COUNT, - PARAM_SHAPES, - BUFFER_NAMES) +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES) debug = 0 @@ -65,13 +59,10 @@ def get_model_state_file(checkpoint_dir, zero_stage): def get_optim_files(checkpoint_dir): # XXX: need to test that this simple glob rule works for multi-node setup too - optim_files = sorted(glob.glob(os.path.join(checkpoint_dir, - "*_optim_states.pt")), - key=natural_keys) + optim_files = sorted(glob.glob(os.path.join(checkpoint_dir, "*_optim_states.pt")), key=natural_keys) if len(optim_files) == 0: - raise FileNotFoundError( - f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'") + raise FileNotFoundError(f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'") return optim_files @@ -86,11 +77,7 @@ def parse_model_state(file): print("Found buffers:", buffer_names) # recover just the buffers while restoring them to fp32 if they were saved in fp16 - buffers = { - k: v.float() - for k, - v in state_dict["module"].items() if k in buffer_names - } + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} param_shapes = state_dict[PARAM_SHAPES] ds_version = state_dict.get(DS_VERSION, None) @@ -132,10 +119,7 @@ def parse_optim_states(files, ds_checkpoint_dir): raise ValueError(f"unknown zero stage {zero_stage}") if zero_stage == 2: - fp32_flat_groups = [ - state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] - for i in range(len(state_dicts)) - ] + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] elif zero_stage == 3: # if there is more than one param group, there will be multiple flattened tensors - one # flattened tensor per group - for simplicity merge them into a single tensor @@ -144,8 +128,7 @@ def parse_optim_states(files, ds_checkpoint_dir): # will require matching the sub-lists of param_shapes for each param group flattened tensor fp32_flat_groups = [ - torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], - 0) for i in range(len(state_dicts)) + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) ] return zero_stage, world_size, fp32_flat_groups @@ -163,29 +146,19 @@ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): optim_files = get_optim_files(ds_checkpoint_dir) zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) - print( - f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") model_file = get_model_state_file(ds_checkpoint_dir, zero_stage) buffers, param_shapes, ds_version = parse_model_state(model_file) print(f'Parsing checkpoint created by deepspeed=={ds_version}') if zero_stage == 2: - return _get_fp32_state_dict_from_zero2_checkpoint(world_size, - param_shapes, - fp32_flat_groups, - buffers) + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, param_shapes, fp32_flat_groups, buffers) elif zero_stage == 3: - return _get_fp32_state_dict_from_zero3_checkpoint(world_size, - param_shapes, - fp32_flat_groups, - buffers) + return 
_get_fp32_state_dict_from_zero3_checkpoint(world_size, param_shapes, fp32_flat_groups, buffers) -def _get_fp32_state_dict_from_zero2_checkpoint(world_size, - param_shapes, - fp32_flat_groups, - buffers): +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, param_shapes, fp32_flat_groups, buffers): # Reconstruction protocol: # @@ -194,8 +167,7 @@ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, if debug: for i in range(world_size): for j in range(len(fp32_flat_groups[0])): - print( - f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") # XXX: memory usage doubles here (zero2) num_param_groups = len(fp32_flat_groups[0]) @@ -204,15 +176,12 @@ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, merged_partitions = [sd[i] for sd in fp32_flat_groups] full_single_fp32_vector = torch.cat(merged_partitions, 0) merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) - avail_numel = sum([ - full_single_fp32_vector.numel() - for full_single_fp32_vector in merged_single_partition_of_fp32_groups - ]) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) if debug: wanted_params = sum([len(shapes) for shapes in param_shapes]) - wanted_numel = sum( - [sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) # not asserting if there is a mismatch due to possible padding print(f"Have {avail_numel} numels to process.") print(f"Need {wanted_numel} numels in {wanted_params} params.") @@ -239,13 +208,8 @@ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, total_params += 1 if debug: - print( - f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} " - ) - state_dict[name] = full_single_fp32_vector.narrow( - 0, - offset, - unpartitioned_numel).view(shape) + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) offset += unpartitioned_numel # Z2 started to align to 2*world_size to improve nccl performance. 
Therefore both offset and @@ -268,12 +232,9 @@ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, # Sanity check if offset != avail_numel: - raise ValueError( - f"consumed {offset} numels out of {avail_numel} - something is wrong") + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") - print( - f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements" - ) + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") return state_dict @@ -285,10 +246,7 @@ def zero3_partitioned_param_info(unpartitioned_numel, world_size): return partitioned_numel, padding_numel -def _get_fp32_state_dict_from_zero3_checkpoint(world_size, - param_shapes, - fp32_flat_groups, - buffers): +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, param_shapes, fp32_flat_groups, buffers): # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each # param, re-consolidating each param, while dealing with padding if any @@ -335,25 +293,17 @@ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, # XXX: memory usage doubles here state_dict[name] = torch.cat( - tuple(fp32_flat_groups[i].narrow(0, - offset, - partitioned_numel) - for i in range(world_size)), - 0).narrow(0, - 0, - unpartitioned_numel).view(shape) + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) offset += partitioned_numel offset *= world_size # Sanity check if offset != avail_numel: - raise ValueError( - f"consumed {offset} numels out of {avail_numel} - something is wrong") + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") - print( - f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements" - ) + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") return state_dict @@ -465,16 +415,13 @@ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "checkpoint_dir", - type=str, - help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") parser.add_argument( "output_file", type=str, - help= - "path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)" - ) + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") parser.add_argument("-d", "--debug", action='store_true', help="enable debug") args = parser.parse_args() diff --git a/op_builder/__init__.py b/op_builder/__init__.py index b41f074d9..08337c752 100644 --- a/op_builder/__init__.py +++ b/op_builder/__init__.py @@ -45,9 +45,7 @@ for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(this_module.__fil if module_name != 'all_ops' and module_name != 'builder': module = importlib.import_module(f".{module_name}", package=op_builder_dir) for member_name in module.__dir__(): - if member_name.endswith( - 'Builder' - ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder": + if member_name.endswith('Builder') and member_name != "OpBuilder" and member_name != "CUDAOpBuilder": # assign builder name to variable with same name # the following is equivalent to i.e. 
TransformerBuilder = "TransformerBuilder" this_module.__dict__[member_name] = builder_closure(member_name) diff --git a/op_builder/async_io.py b/op_builder/async_io.py index 3075e9179..ab666d89b 100644 --- a/op_builder/async_io.py +++ b/op_builder/async_io.py @@ -19,14 +19,10 @@ class AsyncIOBuilder(OpBuilder): def sources(self): return [ - 'csrc/aio/py_lib/deepspeed_py_copy.cpp', - 'csrc/aio/py_lib/py_ds_aio.cpp', - 'csrc/aio/py_lib/deepspeed_py_aio.cpp', - 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', - 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', - 'csrc/aio/common/deepspeed_aio_utils.cpp', - 'csrc/aio/common/deepspeed_aio_common.cpp', - 'csrc/aio/common/deepspeed_aio_types.cpp', + 'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/py_ds_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', + 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp', + 'csrc/aio/common/deepspeed_aio_common.cpp', 'csrc/aio/common/deepspeed_aio_types.cpp', 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' ] @@ -56,15 +52,9 @@ class AsyncIOBuilder(OpBuilder): def check_for_libaio_pkg(self): libs = dict( - dpkg=["-l", - "libaio-dev", - "apt"], - pacman=["-Q", - "libaio", - "pacman"], - rpm=["-q", - "libaio-devel", - "yum"], + dpkg=["-l", "libaio-dev", "apt"], + pacman=["-Q", "libaio", "pacman"], + rpm=["-q", "libaio-devel", "yum"], ) found = False @@ -73,15 +63,11 @@ class AsyncIOBuilder(OpBuilder): path = distutils.spawn.find_executable(pkgmgr) if path is not None: cmd = f"{pkgmgr} {flag} {lib}" - result = subprocess.Popen(cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) + result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) if result.wait() == 0: found = True else: - self.warning( - f"{self.NAME}: please install the {lib} package with {tool}") + self.warning(f"{self.NAME}: please install the {lib} package with {tool}") break return found @@ -93,9 +79,7 @@ class AsyncIOBuilder(OpBuilder): # respectively to specify the directories for libaio.h and libaio.so. aio_compatible = self.has_function('io_submit', ('aio', )) if verbose and not aio_compatible: - self.warning( - f"{self.NAME} requires the dev libaio .so object and headers but these were not found." - ) + self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.") # Check for the libaio package via known package managers # to print suggestions on which package to install. diff --git a/op_builder/builder.py b/op_builder/builder.py index 198e8471a..bf67eb3fd 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -27,9 +27,7 @@ DEFAULT_COMPUTE_CAPABILITIES = "6.0;6.1;7.0" try: import torch except ImportError: - print( - f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops." 
- ) + print(f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops.") else: TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) @@ -43,9 +41,7 @@ def installed_cuda_version(name=""): cuda_home = torch.utils.cpp_extension.CUDA_HOME assert cuda_home is not None, "CUDA_HOME does not exist, unable to compile CUDA op(s)" # Ensure there is not a cuda version mismatch between torch and nvcc compiler - output = subprocess.check_output([cuda_home + "/bin/nvcc", - "-V"], - universal_newlines=True) + output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], universal_newlines=True) output_split = output.split() release_idx = output_split.index("release") release = output_split[release_idx + 1].replace(',', '').split(".") @@ -57,8 +53,7 @@ def installed_cuda_version(name=""): def get_default_compute_capabilities(): compute_caps = DEFAULT_COMPUTE_CAPABILITIES import torch.utils.cpp_extension - if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version( - )[0] >= 11: + if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version()[0] >= 11: if installed_cuda_version()[0] == 11 and installed_cuda_version()[1] == 0: # Special treatment of CUDA 11.0 because compute_86 is not supported. compute_caps += ";8.0" @@ -75,15 +70,7 @@ cuda_minor_mismatch_ok = { "10.1", "10.2", ], - 11: ["11.0", - "11.1", - "11.2", - "11.3", - "11.4", - "11.5", - "11.6", - "11.7", - "11.8"], + 11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"], } @@ -95,17 +82,15 @@ def assert_no_cuda_mismatch(name=""): torch_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) # This is a show-stopping error, should probably not proceed past this if sys_cuda_version != torch_cuda_version: - if (cuda_major in cuda_minor_mismatch_ok - and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major] + if (cuda_major in cuda_minor_mismatch_ok and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major] and torch_cuda_version in cuda_minor_mismatch_ok[cuda_major]): print(f"Installed CUDA version {sys_cuda_version} does not match the " f"version torch was compiled with {torch.version.cuda} " "but since the APIs are compatible, accepting this combination") return True - raise Exception( - f">- DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the " - f"version torch was compiled with {torch.version.cuda}, unable to compile " - "cuda/cpp extensions without a matching cuda version.") + raise Exception(f">- DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the " + f"version torch was compiled with {torch.version.cuda}, unable to compile " + "cuda/cpp extensions without a matching cuda version.") return True @@ -142,12 +127,11 @@ class OpBuilder(ABC): install_torch_version = torch_info['version'] current_torch_version = ".".join(torch.__version__.split('.')[:2]) if install_torch_version != current_torch_version: - raise RuntimeError( - "PyTorch version mismatch! DeepSpeed ops were compiled and installed " - "with a different version than what is being used at runtime. " - f"Please re-install DeepSpeed or switch torch versions. " - f"Install torch version={install_torch_version}, " - f"Runtime torch version={current_torch_version}") + raise RuntimeError("PyTorch version mismatch! DeepSpeed ops were compiled and installed " + "with a different version than what is being used at runtime. 
" + f"Please re-install DeepSpeed or switch torch versions. " + f"Install torch version={install_torch_version}, " + f"Runtime torch version={current_torch_version}") @staticmethod def validate_torch_op_version(torch_info): @@ -155,22 +139,20 @@ class OpBuilder(ABC): current_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) install_cuda_version = torch_info['cuda_version'] if install_cuda_version != current_cuda_version: - raise RuntimeError( - "CUDA version mismatch! DeepSpeed ops were compiled and installed " - "with a different version than what is being used at runtime. " - f"Please re-install DeepSpeed or switch torch versions. " - f"Install CUDA version={install_cuda_version}, " - f"Runtime CUDA version={current_cuda_version}") + raise RuntimeError("CUDA version mismatch! DeepSpeed ops were compiled and installed " + "with a different version than what is being used at runtime. " + f"Please re-install DeepSpeed or switch torch versions. " + f"Install CUDA version={install_cuda_version}, " + f"Runtime CUDA version={current_cuda_version}") else: current_hip_version = ".".join(torch.version.hip.split('.')[:2]) install_hip_version = torch_info['hip_version'] if install_hip_version != current_hip_version: - raise RuntimeError( - "HIP version mismatch! DeepSpeed ops were compiled and installed " - "with a different version than what is being used at runtime. " - f"Please re-install DeepSpeed or switch torch versions. " - f"Install HIP version={install_hip_version}, " - f"Runtime HIP version={current_hip_version}") + raise RuntimeError("HIP version mismatch! DeepSpeed ops were compiled and installed " + "with a different version than what is being used at runtime. " + f"Please re-install DeepSpeed or switch torch versions. " + f"Install HIP version={install_hip_version}, " + f"Runtime HIP version={current_hip_version}") @staticmethod def is_rocm_pytorch(): @@ -184,8 +166,7 @@ class OpBuilder(ABC): pass else: if TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 5): - _is_rocm_pytorch = hasattr(torch.version, - 'hip') and torch.version.hip is not None + _is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None if _is_rocm_pytorch: from torch.utils.cpp_extension import ROCM_HOME _is_rocm_pytorch = ROCM_HOME is not None @@ -246,10 +227,7 @@ class OpBuilder(ABC): valid = False check_cmd = 'dpkg -l' for lib in libraries: - result = subprocess.Popen(f'dpkg -l {lib}', - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) + result = subprocess.Popen(f'dpkg -l {lib}', stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) valid = valid or result.wait() == 0 return valid @@ -280,9 +258,7 @@ class OpBuilder(ABC): tempdir = tempfile.mkdtemp() # Define a simple C program that calls the function in question - prog = "void %s(void); int main(int argc, char** argv) { %s(); return 0; }" % ( - funcname, - funcname) + prog = "void %s(void); int main(int argc, char** argv) { %s(); return 0; }" % (funcname, funcname) # Write the test program to a file. filename = os.path.join(tempdir, 'test.c') @@ -303,16 +279,13 @@ class OpBuilder(ABC): # Attempt to compile the C program into an object file. cflags = shlex.split(os.environ.get('CFLAGS', "")) - objs = compiler.compile([filename], - output_dir=output_dir, - extra_preargs=self.strip_empty_entries(cflags)) + objs = compiler.compile([filename], output_dir=output_dir, extra_preargs=self.strip_empty_entries(cflags)) # Attempt to link the object file into an executable. 
# Be sure to tack on any libraries that have been specified. ldflags = shlex.split(os.environ.get('LDFLAGS', "")) compiler.link_executable(objs, - os.path.join(tempdir, - 'a.out'), + os.path.join(tempdir, 'a.out'), extra_preargs=self.strip_empty_entries(ldflags), libraries=libraries) @@ -356,9 +329,8 @@ class OpBuilder(ABC): try: cpu_info = get_cpu_info() except Exception as e: - self.warning( - f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), " - "falling back to `lscpu` to get this information.") + self.warning(f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), " + "falling back to `lscpu` to get this information.") cpu_info = self._backup_cpuinfo() if cpu_info is None: return "-march=native" @@ -373,20 +345,16 @@ class OpBuilder(ABC): if torch.cuda.is_available(): return '-D__ENABLE_CUDA__' except: - print( - f"{WARNING} {self.name} torch.cuda is missing, only cpu ops can be compiled!" - ) + print(f"{WARNING} {self.name} torch.cuda is missing, only cpu ops can be compiled!") return '-D__DISABLE_CUDA__' return '-D__DISABLE_CUDA__' def _backup_cpuinfo(self): # Construct cpu_info dict from lscpu that is similar to what py-cpuinfo provides if not self.command_exists('lscpu'): - self.warning( - f"{self.name} attempted to query 'lscpu' after failing to use py-cpuinfo " - "to detect the CPU architecture. 'lscpu' does not appear to exist on " - "your system, will fall back to use -march=native and non-vectorized execution." - ) + self.warning(f"{self.name} attempted to query 'lscpu' after failing to use py-cpuinfo " + "to detect the CPU architecture. 'lscpu' does not appear to exist on " + "your system, will fall back to use -march=native and non-vectorized execution.") return None result = subprocess.check_output('lscpu', shell=True) result = result.decode('utf-8').strip().lower() @@ -418,9 +386,8 @@ class OpBuilder(ABC): try: cpu_info = get_cpu_info() except Exception as e: - self.warning( - f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), " - "falling back to `lscpu` to get this information.") + self.warning(f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), " + "falling back to `lscpu` to get this information.") cpu_info = self._backup_cpuinfo() if cpu_info is None: return '-D__SCALAR__' @@ -443,13 +410,9 @@ class OpBuilder(ABC): valid = valid or result.wait() == 0 if not valid and len(cmds) > 1: - print( - f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!" - ) + print(f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!") elif not valid and len(cmds) == 1: - print( - f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!" 
- ) + print(f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!") return valid def warning(self, msg): @@ -464,12 +427,11 @@ class OpBuilder(ABC): def builder(self): from torch.utils.cpp_extension import CppExtension - return CppExtension( - name=self.absolute_name(), - sources=self.strip_empty_entries(self.sources()), - include_dirs=self.strip_empty_entries(self.include_paths()), - extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())}, - extra_link_args=self.strip_empty_entries(self.extra_ldflags())) + return CppExtension(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=self.strip_empty_entries(self.include_paths()), + extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())}, + extra_link_args=self.strip_empty_entries(self.extra_ldflags())) def load(self, verbose=True): from deepspeed.git_version_info import installed_ops, torch_info @@ -492,9 +454,7 @@ class OpBuilder(ABC): try: import ninja # noqa: F401 except ImportError: - raise RuntimeError( - f"Unable to JIT load the {self.name} op due to ninja not being installed." - ) + raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch(): self.build_for_cpu = not assert_no_cuda_mismatch(self.name) @@ -504,9 +464,7 @@ class OpBuilder(ABC): start_build = time.time() sources = [self.deepspeed_src_path(path) for path in self.sources()] - extra_include_paths = [ - self.deepspeed_src_path(path) for path in self.include_paths() - ] + extra_include_paths = [self.deepspeed_src_path(path) for path in self.include_paths()] # Torch will try and apply whatever CCs are in the arch list at compile time, # we have already set the intended targets ourselves we know that will be @@ -517,14 +475,13 @@ class OpBuilder(ABC): torch_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST") os.environ["TORCH_CUDA_ARCH_LIST"] = "" - op_module = load( - name=self.name, - sources=self.strip_empty_entries(sources), - extra_include_paths=self.strip_empty_entries(extra_include_paths), - extra_cflags=self.strip_empty_entries(self.cxx_args()), - extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()), - extra_ldflags=self.strip_empty_entries(self.extra_ldflags()), - verbose=verbose) + op_module = load(name=self.name, + sources=self.strip_empty_entries(sources), + extra_include_paths=self.strip_empty_entries(extra_include_paths), + extra_cflags=self.strip_empty_entries(self.cxx_args()), + extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()), + extra_ldflags=self.strip_empty_entries(self.extra_ldflags()), + verbose=verbose) build_duration = time.time() - start_build if verbose: @@ -538,6 +495,7 @@ class OpBuilder(ABC): class CUDAOpBuilder(OpBuilder): + def compute_capability_args(self, cross_compile_archs=None): """ Returns nvcc compute capability compile flags. 
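compute_capability_args, whose docstring starts just above, expands a capability list into nvcc -gencode flags. A simplified sketch of that expansion (standard nvcc flag spelling; the builder's handling of TORCH_CUDA_ARCH_LIST overrides and +PTX variants is not reproduced here):

def gencode_args(cross_compile_archs="6.0;6.1;7.0"):
    args = []
    for cc in cross_compile_archs.split(';'):
        num = cc.replace('.', '')  # "8.0" -> "80"
        args.append(f'-gencode=arch=compute_{num},code=sm_{num}')
    return args

print(gencode_args("6.0;7.0;8.0"))
# ['-gencode=arch=compute_60,code=sm_60', '-gencode=arch=compute_70,code=sm_70',
#  '-gencode=arch=compute_80,code=sm_80']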
@@ -584,8 +542,7 @@ class CUDAOpBuilder(OpBuilder): ccs = self.filter_ccs(ccs) if len(ccs) == 0: raise RuntimeError( - f"Unable to load {self.name} op due to no compute capabilities remaining after filtering" - ) + f"Unable to load {self.name} op due to no compute capabilities remaining after filtering") args = [] for cc in ccs: @@ -630,12 +587,11 @@ class CUDAOpBuilder(OpBuilder): {'cxx': self.strip_empty_entries(self.cxx_args()), \ 'nvcc': self.strip_empty_entries(self.nvcc_args())} - cuda_ext = ExtensionBuilder( - name=self.absolute_name(), - sources=self.strip_empty_entries(self.sources()), - include_dirs=self.strip_empty_entries(self.include_paths()), - libraries=self.strip_empty_entries(self.libraries_args()), - extra_compile_args=compile_args) + cuda_ext = ExtensionBuilder(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=self.strip_empty_entries(self.include_paths()), + libraries=self.strip_empty_entries(self.libraries_args()), + extra_compile_args=compile_args) if self.is_rocm_pytorch(): # hip converts paths to absolute, this converts back to relative @@ -654,8 +610,7 @@ class CUDAOpBuilder(OpBuilder): project_directory=os.getcwd(), output_directory=os.getcwd(), header_include_dirs=self.include_paths(), - includes=[os.path.join(os.getcwd(), - '*')], + includes=[os.path.join(os.getcwd(), '*')], extra_files=[os.path.abspath(s) for s in self.sources()], show_detailed=True, is_pytorch_extension=True, @@ -675,9 +630,7 @@ class CUDAOpBuilder(OpBuilder): if self.is_rocm_pytorch(): ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() args += [ - '-std=c++14', - '-U__HIP_NO_HALF_OPERATORS__', - '-U__HIP_NO_HALF_CONVERSIONS__', + '-std=c++14', '-U__HIP_NO_HALF_OPERATORS__', '-U__HIP_NO_HALF_CONVERSIONS__', '-U__HIP_NO_HALF2_OPERATORS__', '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR @@ -685,13 +638,9 @@ class CUDAOpBuilder(OpBuilder): else: cuda_major, _ = installed_cuda_version() args += [ - '-allow-unsupported-compiler' if sys.platform == "win32" else '', - '--use_fast_math', - '-std=c++17' - if sys.platform == "win32" and cuda_major > 10 else '-std=c++14', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '-U__CUDA_NO_HALF2_OPERATORS__' + '-allow-unsupported-compiler' if sys.platform == "win32" else '', '--use_fast_math', + '-std=c++17' if sys.platform == "win32" and cuda_major > 10 else '-std=c++14', + '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__' ] if os.environ.get('DS_DEBUG_CUDA_BUILD', '0') == '1': args.append('--ptxas-options=-v') @@ -709,6 +658,7 @@ class CUDAOpBuilder(OpBuilder): class TorchCPUOpBuilder(CUDAOpBuilder): + def extra_ldflags(self): if self.build_for_cpu: return ['-fopenmp'] diff --git a/op_builder/cpu_adagrad.py b/op_builder/cpu_adagrad.py index bf7c98052..a3dc000ca 100644 --- a/op_builder/cpu_adagrad.py +++ b/op_builder/cpu_adagrad.py @@ -38,13 +38,8 @@ class CPUAdagradBuilder(TorchCPUOpBuilder): CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] else: CUDA_INCLUDE = [ - os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include"), - os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include", - "rocrand"), - os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include", - "hiprand"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "rocrand"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", 
"hiprand"), ] return ['csrc/includes'] + CUDA_INCLUDE diff --git a/op_builder/cpu_adam.py b/op_builder/cpu_adam.py index bdf1166d1..c1e639162 100644 --- a/op_builder/cpu_adam.py +++ b/op_builder/cpu_adam.py @@ -39,13 +39,8 @@ class CPUAdamBuilder(TorchCPUOpBuilder): CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] else: CUDA_INCLUDE = [ - os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include"), - os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include", - "rocrand"), - os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include", - "hiprand"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "rocrand"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "hiprand"), ] return ['csrc/includes'] + CUDA_INCLUDE diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py index 2883d417e..7e8bea274 100644 --- a/op_builder/fused_adam.py +++ b/op_builder/fused_adam.py @@ -29,9 +29,7 @@ class FusedAdamBuilder(CUDAOpBuilder): def nvcc_args(self): nvcc_flags = ['-O3'] + self.version_dependent_macros() if not self.is_rocm_pytorch(): - nvcc_flags.extend([ - '-allow-unsupported-compiler' if sys.platform == "win32" else '', - '-lineinfo', - '--use_fast_math' - ] + self.compute_capability_args()) + nvcc_flags.extend( + ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] + + self.compute_capability_args()) return nvcc_flags diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py index d5f88d0b1..33d2ae126 100644 --- a/op_builder/fused_lamb.py +++ b/op_builder/fused_lamb.py @@ -30,14 +30,9 @@ class FusedLambBuilder(CUDAOpBuilder): nvcc_flags = ['-O3'] + self.version_dependent_macros() if self.is_rocm_pytorch(): ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() - nvcc_flags += [ - '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, - '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR - ] + nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR] else: - nvcc_flags.extend([ - '-allow-unsupported-compiler' if sys.platform == "win32" else '', - '-lineinfo', - '--use_fast_math' - ] + self.compute_capability_args()) + nvcc_flags.extend( + ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] + + self.compute_capability_args()) return nvcc_flags diff --git a/op_builder/random_ltd.py b/op_builder/random_ltd.py index 79c86c134..877c5aea3 100644 --- a/op_builder/random_ltd.py +++ b/op_builder/random_ltd.py @@ -23,18 +23,13 @@ class RandomLTDBuilder(CUDAOpBuilder): def sources(self): return [ - 'csrc/random_ltd/pt_binding.cpp', - 'csrc/random_ltd/gather_scatter.cu', - 'csrc/random_ltd/slice_attn_masks.cu', - 'csrc/random_ltd/token_sort.cu' + 'csrc/random_ltd/pt_binding.cpp', 'csrc/random_ltd/gather_scatter.cu', + 'csrc/random_ltd/slice_attn_masks.cu', 'csrc/random_ltd/token_sort.cu' ] def include_paths(self): includes = ['csrc/includes'] if self.is_rocm_pytorch(): from torch.utils.cpp_extension import ROCM_HOME - includes += [ - '{}/hiprand/include'.format(ROCM_HOME), - '{}/rocrand/include'.format(ROCM_HOME) - ] + includes += ['{}/hiprand/include'.format(ROCM_HOME), '{}/rocrand/include'.format(ROCM_HOME)] return includes diff --git a/op_builder/sparse_attn.py b/op_builder/sparse_attn.py index 414bc212e..4b4b80197 100644 --- a/op_builder/sparse_attn.py +++ b/op_builder/sparse_attn.py @@ -47,8 +47,7 @@ class SparseAttnBuilder(OpBuilder): self.warning(f"{self.NAME} cuda 
is not available from torch") else: major, minor = torch.version.cuda.split('.')[:2] - cuda_compatible = (int(major) == 10 - and int(minor) >= 1) or (int(major) >= 11) + cuda_compatible = (int(major) == 10 and int(minor) >= 1) or (int(major) >= 11) if not cuda_compatible: self.warning(f"{self.NAME} requires CUDA version 10.1+") @@ -56,17 +55,14 @@ class SparseAttnBuilder(OpBuilder): TORCH_MINOR = int(torch.__version__.split('.')[1]) torch_compatible = TORCH_MAJOR == 1 and TORCH_MINOR >= 5 if not torch_compatible: - self.warning( - f'{self.NAME} requires a torch version >= 1.5 but detected {TORCH_MAJOR}.{TORCH_MINOR}' - ) + self.warning(f'{self.NAME} requires a torch version >= 1.5 but detected {TORCH_MAJOR}.{TORCH_MINOR}') try: import triton except ImportError: # auto-install of triton is broken on some systems, reverting to manual install for now # see this issue: https://github.com/microsoft/DeepSpeed/issues/1710 - self.warning( - f"please install triton==1.0.0 if you want to use sparse attention") + self.warning(f"please install triton==1.0.0 if you want to use sparse attention") return False if pkg_version: @@ -77,9 +73,7 @@ class SparseAttnBuilder(OpBuilder): triton_mismatch = installed_triton != "1.0.0" if triton_mismatch: - self.warning( - f"using untested triton version ({installed_triton}), only 1.0.0 is known to be compatible" - ) + self.warning(f"using untested triton version ({installed_triton}), only 1.0.0 is known to be compatible") return False return super().is_compatible(verbose) and torch_compatible and cuda_compatible diff --git a/op_builder/spatial_inference.py b/op_builder/spatial_inference.py index 18d19d40e..96f6843a1 100644 --- a/op_builder/spatial_inference.py +++ b/op_builder/spatial_inference.py @@ -19,8 +19,7 @@ class SpatialInferenceBuilder(CUDAOpBuilder): try: import torch except ImportError: - self.warning( - "Please install torch if trying to pre-compile inference kernels") + self.warning("Please install torch if trying to pre-compile inference kernels") return False cuda_okay = True @@ -30,8 +29,7 @@ class SpatialInferenceBuilder(CUDAOpBuilder): cuda_capability = torch.cuda.get_device_properties(0).major if cuda_capability >= 8: if torch_cuda_major < 11 or sys_cuda_major < 11: - self.warning( - "On Ampere and higher architectures please use CUDA 11+") + self.warning("On Ampere and higher architectures please use CUDA 11+") cuda_okay = False return super().is_compatible(verbose) and cuda_okay diff --git a/op_builder/transformer.py b/op_builder/transformer.py index 239f29552..f619122a0 100644 --- a/op_builder/transformer.py +++ b/op_builder/transformer.py @@ -23,22 +23,15 @@ class TransformerBuilder(CUDAOpBuilder): def sources(self): return [ - 'csrc/transformer/ds_transformer_cuda.cpp', - 'csrc/transformer/cublas_wrappers.cu', - 'csrc/transformer/transform_kernels.cu', - 'csrc/transformer/gelu_kernels.cu', - 'csrc/transformer/dropout_kernels.cu', - 'csrc/transformer/normalize_kernels.cu', - 'csrc/transformer/softmax_kernels.cu', - 'csrc/transformer/general_kernels.cu' + 'csrc/transformer/ds_transformer_cuda.cpp', 'csrc/transformer/cublas_wrappers.cu', + 'csrc/transformer/transform_kernels.cu', 'csrc/transformer/gelu_kernels.cu', + 'csrc/transformer/dropout_kernels.cu', 'csrc/transformer/normalize_kernels.cu', + 'csrc/transformer/softmax_kernels.cu', 'csrc/transformer/general_kernels.cu' ] def include_paths(self): includes = ['csrc/includes'] if self.is_rocm_pytorch(): from torch.utils.cpp_extension import ROCM_HOME - includes += [ - 
'{}/hiprand/include'.format(ROCM_HOME), - '{}/rocrand/include'.format(ROCM_HOME) - ] + includes += ['{}/hiprand/include'.format(ROCM_HOME), '{}/rocrand/include'.format(ROCM_HOME)] return includes diff --git a/op_builder/transformer_inference.py b/op_builder/transformer_inference.py index 9bb9bbb95..a7e7c883c 100755 --- a/op_builder/transformer_inference.py +++ b/op_builder/transformer_inference.py @@ -18,8 +18,7 @@ class InferenceBuilder(CUDAOpBuilder): try: import torch except ImportError: - self.warning( - "Please install torch if trying to pre-compile inference kernels") + self.warning("Please install torch if trying to pre-compile inference kernels") return False cuda_okay = True @@ -28,14 +27,11 @@ class InferenceBuilder(CUDAOpBuilder): torch_cuda_major = int(torch.version.cuda.split('.')[0]) cuda_capability = torch.cuda.get_device_properties(0).major if cuda_capability < 6: - self.warning( - "NVIDIA Inference is only supported on Pascal and newer architectures" - ) + self.warning("NVIDIA Inference is only supported on Pascal and newer architectures") cuda_okay = False if cuda_capability >= 8: if torch_cuda_major < 11 or sys_cuda_major < 11: - self.warning( - "On Ampere and higher architectures please use CUDA 11+") + self.warning("On Ampere and higher architectures please use CUDA 11+") cuda_okay = False return super().is_compatible(verbose) and cuda_okay diff --git a/scripts/check-license.py b/scripts/check-license.py index 519827d7d..59c01f6da 100755 --- a/scripts/check-license.py +++ b/scripts/check-license.py @@ -16,14 +16,7 @@ def err(s: str) -> None: success = True failures = [] for f in sys.argv[1:]: - res = subprocess.run( - ["git", - "grep", - "--quiet", - "-e", - r"Copyright .* DeepSpeed Team", - f], - capture_output=True) + res = subprocess.run(["git", "grep", "--quiet", "-e", r"Copyright .* DeepSpeed Team", f], capture_output=True) if res.returncode == 1: success = False failures.append(f) diff --git a/scripts/check-torchcuda.py b/scripts/check-torchcuda.py index 773db41c9..55c6c598b 100755 --- a/scripts/check-torchcuda.py +++ b/scripts/check-torchcuda.py @@ -21,19 +21,7 @@ def err(s: str) -> None: # - unlike plain grep, which is slower and has different flags on MacOS versus # Linux, git grep is always the same. res = subprocess.run( - [ - "git", - "grep", - "-Hn", - "--no-index", - "-e", - r"torch\.cuda", - "--and", - "--not", - "-e", - "#ignore-cuda", - *sys.argv[1:] - ], + ["git", "grep", "-Hn", "--no-index", "-e", r"torch\.cuda", "--and", "--not", "-e", "#ignore-cuda", *sys.argv[1:]], capture_output=True, ) if res.returncode == 0: @@ -47,12 +35,7 @@ elif res.returncode == 2: sys.exit(2) res = subprocess.run( - ["git", - "grep", - "-Hn", - "--no-index", - r"\.cuda()", - *sys.argv[1:]], + ["git", "grep", "-Hn", "--no-index", r"\.cuda()", *sys.argv[1:]], capture_output=True, ) if res.returncode == 0: diff --git a/scripts/check-torchdist.py b/scripts/check-torchdist.py index d655b7b90..470510205 100755 --- a/scripts/check-torchdist.py +++ b/scripts/check-torchdist.py @@ -21,12 +21,7 @@ def err(s: str) -> None: # - unlike plain grep, which is slower and has different flags on MacOS versus # Linux, git grep is always the same. 
 res = subprocess.run(
-    ["git",
-     "grep",
-     "-Hn",
-     "--no-index",
-     r"torch\.distributed",
-     *sys.argv[1:]],
+    ["git", "grep", "-Hn", "--no-index", r"torch\.distributed", *sys.argv[1:]],
     capture_output=True,
 )
 if res.returncode == 0:
diff --git a/setup.py b/setup.py
index ba570805e..1e39b26c6 100755
--- a/setup.py
+++ b/setup.py
@@ -55,7 +55,7 @@ def fetch_requirements(path):
 install_requires = fetch_requirements('requirements/requirements.txt')
 
 extras_require = {
-    '1bit': [], # add cupy based on cuda/rocm version
+    '1bit': [],  # add cupy based on cuda/rocm version
     '1bit_mpi': fetch_requirements('requirements/requirements-1bit-mpi.txt'),
     'readthedocs': fetch_requirements('requirements/requirements-readthedocs.txt'),
     'dev': fetch_requirements('requirements/requirements-dev.txt'),
@@ -92,8 +92,7 @@ cmdclass = {}
 # For any pre-installed ops force disable ninja
 if torch_available:
     from accelerator import get_accelerator
-    cmdclass['build_ext'] = get_accelerator().build_extension().with_options(
-        use_ninja=False)
+    cmdclass['build_ext'] = get_accelerator().build_extension().with_options(use_ninja=False)
 
 if torch_available:
     TORCH_MAJOR = torch.__version__.split('.')[0]
@@ -104,10 +103,9 @@ else:
 
 if torch_available and not torch.cuda.is_available():
     # Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486
-    print(
-        "[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only "
-        "you can ignore this message. Adding compute capability for Pascal, Volta, and Turing "
-        "(compute capabilities 6.0, 6.1, 6.2)")
+    print("[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only "
+          "you can ignore this message. Adding compute capability for Pascal, Volta, and Turing "
+          "(compute capabilities 6.0, 6.1, 6.2)")
     if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
         os.environ["TORCH_CUDA_ARCH_LIST"] = get_default_compute_capabilities()
 
@@ -281,24 +279,15 @@ setup(name='deepspeed',
       },
      install_requires=install_requires,
      extras_require=extras_require,
-      packages=find_packages(include=['deepspeed',
-                                      'deepspeed.*']),
+      packages=find_packages(include=['deepspeed', 'deepspeed.*']),
      include_package_data=True,
      scripts=[
-          'bin/deepspeed',
-          'bin/deepspeed.pt',
-          'bin/ds',
-          'bin/ds_ssh',
-          'bin/ds_report',
-          'bin/ds_bench',
-          'bin/dsr',
+          'bin/deepspeed', 'bin/deepspeed.pt', 'bin/ds', 'bin/ds_ssh', 'bin/ds_report', 'bin/ds_bench', 'bin/dsr',
          'bin/ds_elastic'
      ],
      classifiers=[
-          'Programming Language :: Python :: 3.6',
-          'Programming Language :: Python :: 3.7',
-          'Programming Language :: Python :: 3.8',
-          'Programming Language :: Python :: 3.9',
+          'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7',
+          'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9',
          'Programming Language :: Python :: 3.10'
      ],
      license='MIT',
diff --git a/tests/accelerator/test_ds_init.py b/tests/accelerator/test_ds_init.py
index 6c4e90e2a..9f0bed93d 100644
--- a/tests/accelerator/test_ds_init.py
+++ b/tests/accelerator/test_ds_init.py
@@ -6,6 +6,7 @@ from deepspeed.accelerator import get_accelerator
 
 
 class OneLayerNet(torch.nn.Module):
+
     def __init__(self, D_in, D_out):
         """
         In the constructor we instantiate two nn.Linear modules and assign them as
diff --git a/tests/benchmarks/flatten_bench.py b/tests/benchmarks/flatten_bench.py
index 1082554f8..e6cb51746 100755
--- a/tests/benchmarks/flatten_bench.py
+++ b/tests/benchmarks/flatten_bench.py
@@ -26,12 +26,9 @@ unflatten = util_ops.unflatten
 
 torch.manual_seed(0)
 # emulate a small typical model weights
 x = [
-    torch.rand((512,
-                512)).to(get_accelerator().device_name()),
-    torch.rand((512,
-                1024)).to(get_accelerator().device_name()),
-    torch.rand((512,
-                30000)).to(get_accelerator().device_name())
+    torch.rand((512, 512)).to(get_accelerator().device_name()),
+    torch.rand((512, 1024)).to(get_accelerator().device_name()),
+    torch.rand((512, 30000)).to(get_accelerator().device_name())
 ]
 t = x * 30
diff --git a/tests/benchmarks/unflatten_bench.py b/tests/benchmarks/unflatten_bench.py
index a4a1b63b3..f87e411f3 100755
--- a/tests/benchmarks/unflatten_bench.py
+++ b/tests/benchmarks/unflatten_bench.py
@@ -26,12 +26,9 @@ unflatten = util_ops.unflatten
 
 torch.manual_seed(0)
 # emulate a small typical model weights
 x = [
-    torch.rand((512,
-                512)).to(get_accelerator().device_name()),
-    torch.rand((512,
-                1024)).to(get_accelerator().device_name()),
-    torch.rand((512,
-                30000)).to(get_accelerator().device_name())
+    torch.rand((512, 512)).to(get_accelerator().device_name()),
+    torch.rand((512, 1024)).to(get_accelerator().device_name()),
+    torch.rand((512, 30000)).to(get_accelerator().device_name())
 ]
 unflat_t = x * 30
diff --git a/tests/conftest.py b/tests/conftest.py
index 2ceaa034b..f493c657e 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -42,16 +42,14 @@ def check_environment(pytestconfig):
     expected_cuda_version = pytestconfig.getoption("cuda_ver")
     if expected_torch_version is None:
         warnings.warn(
-            "Running test without verifying torch version, please provide an expected torch version with --torch_ver"
-        )
+            "Running test without verifying torch version, please provide an expected torch version with --torch_ver")
     elif not validate_version(expected_torch_version, torch.__version__):
         pytest.exit(
             f"expected torch version {expected_torch_version} did not match found torch version {torch.__version__}",
             returncode=2)
     if expected_cuda_version is None:
         warnings.warn(
-            "Running test without verifying cuda version, please provide an expected cuda version with --cuda_ver"
-        )
+            "Running test without verifying cuda version, please provide an expected cuda version with --cuda_ver")
     elif not validate_version(expected_cuda_version, torch.version.cuda):
         pytest.exit(
             f"expected cuda version {expected_cuda_version} did not match found cuda version {torch.version.cuda},
diff --git a/tests/lightning/test_simple.py b/tests/lightning/test_simple.py
index 482aa1113..b7efb8fd8 100644
--- a/tests/lightning/test_simple.py
+++ b/tests/lightning/test_simple.py
@@ -7,6 +7,7 @@ from torch.utils.data import DataLoader, Dataset
 
 
 class RandomDataset(Dataset):
+
     def __init__(self, size, length):
         self.len = length
         self.data = torch.randn(length, size)
@@ -19,6 +20,7 @@ class RandomDataset(Dataset):
 
 
 class BoringModel(LightningModule):
+
     def __init__(self):
         super().__init__()
         self.layer = torch.nn.Linear(32, 2)
@@ -53,9 +55,5 @@ def test_lightning_model():
     """Test that DeepSpeed works with a simple LightningModule and LightningDataModule."""
     model = BoringModel()
 
-    trainer = Trainer(strategy=DeepSpeedStrategy(),
-                      max_epochs=1,
-                      precision=16,
-                      accelerator="gpu",
-                      devices=1)
+    trainer = Trainer(strategy=DeepSpeedStrategy(), max_epochs=1, precision=16, accelerator="gpu", devices=1)
     trainer.fit(model)
diff --git a/tests/model/BingBertSquad/BingBertSquad_run_func_test.py b/tests/model/BingBertSquad/BingBertSquad_run_func_test.py
index 828771cd3..2035c22ce 100755
--- a/tests/model/BingBertSquad/BingBertSquad_run_func_test.py
+++
b/tests/model/BingBertSquad/BingBertSquad_run_func_test.py @@ -28,6 +28,7 @@ def grep_loss_from_file(file_name): class BingBertSquadFuncTestCase(BaseTestCase): + def __init__(self, methodName="DeepSpeed function test on BingBertSquad model"): super(BingBertSquadFuncTestCase, self).__init__(methodName) @@ -112,8 +113,7 @@ class BingBertSquadFuncTestCase(BaseTestCase): prefix = "BingBertSquad_func" test_config['other_args'] += f" --max_steps {test_config['max_steps']}" - test_config[ - 'other_args'] += f" --max_steps_per_epoch {test_config['max_epoch_steps']}" + test_config['other_args'] += f" --max_steps_per_epoch {test_config['max_epoch_steps']}" # baseline run... test_config["deepspeed"] = False diff --git a/tests/model/BingBertSquad/BingBertSquad_test_common.py b/tests/model/BingBertSquad/BingBertSquad_test_common.py index b6069d76e..f5ba4baf6 100755 --- a/tests/model/BingBertSquad/BingBertSquad_test_common.py +++ b/tests/model/BingBertSquad/BingBertSquad_test_common.py @@ -8,6 +8,7 @@ import time class BaseTestCase(unittest.TestCase): + def __init__(self, methodName="DeepSpeed performance test"): super(BaseTestCase, self).__init__(methodName) self.test_dir = "./test" @@ -23,10 +24,7 @@ class BaseTestCase(unittest.TestCase): other_args = "_" + other_args if test_config["deepspeed"]: - file_name = "_gpu{0}_{1}_ds{2}-{3}.log".format(test_config["gpus"], - other_args, - zero_args, - self.timestr) + file_name = "_gpu{0}_{1}_ds{2}-{3}.log".format(test_config["gpus"], other_args, zero_args, self.timestr) save_dir = self.test_dir else: file_name = "_gpu{0}_{1}.log".format(test_config["gpus"], other_args) @@ -46,22 +44,12 @@ class BaseTestCase(unittest.TestCase): time.sleep(20) def run_BingBertSquad_test(self, test_config, output): - ds_flag = " -d --deepspeed_config " + test_config["json"] if test_config[ - "deepspeed"] else " " - other_args = " " + test_config[ - "other_args"] if "other_args" in test_config else " " + ds_flag = " -d --deepspeed_config " + test_config["json"] if test_config["deepspeed"] else " " + other_args = " " + test_config["other_args"] if "other_args" in test_config else " " - cmd = "./run_BingBertSquad_sanity.sh -e 1 -g {0} {1} {2}".format( - test_config["gpus"], - other_args, - ds_flag) + cmd = "./run_BingBertSquad_sanity.sh -e 1 -g {0} {1} {2}".format(test_config["gpus"], other_args, ds_flag) self.ensure_directory_exists(output) with open(output, "w") as f: print(cmd) - subprocess.run(cmd, - shell=True, - check=False, - executable='/bin/bash', - stdout=f, - stderr=f) + subprocess.run(cmd, shell=True, check=False, executable='/bin/bash', stdout=f, stderr=f) diff --git a/tests/model/BingBertSquad/test_e2e_squad.py b/tests/model/BingBertSquad/test_e2e_squad.py index 7dfd718bc..53e82b662 100644 --- a/tests/model/BingBertSquad/test_e2e_squad.py +++ b/tests/model/BingBertSquad/test_e2e_squad.py @@ -63,16 +63,7 @@ def test_e2e_squad_deepspeed_base(tmpdir): output_dir = os.path.join(tmpdir, "output") pred_file = os.path.join(output_dir, pred_file_name) - proc = sp.Popen([ - "bash", - script_file_name, - num_gpus, - model_file, - squad_dir, - output_dir, - config_file - ], - cwd=base_dir) + proc = sp.Popen(["bash", script_file_name, num_gpus, model_file, squad_dir, output_dir, config_file], cwd=base_dir) try: proc.communicate(timeout=timeout_sec) @@ -82,9 +73,7 @@ def test_e2e_squad_deepspeed_base(tmpdir): print("evaluation result: ", json.dumps(eval_result)) - assert isclose(eval_result["exact_match"], - expected_exact_match, - abs_tol=1e-2) + assert 
isclose(eval_result["exact_match"], expected_exact_match, abs_tol=1e-2) assert isclose(eval_result["f1"], expected_f1, abs_tol=1e-2) else: @@ -110,16 +99,7 @@ def test_e2e_squad_deepspeed_zero(tmpdir): output_dir = os.path.join(tmpdir, "output") pred_file = os.path.join(output_dir, pred_file_name) - proc = sp.Popen([ - "bash", - script_file_name, - num_gpus, - model_file, - squad_dir, - output_dir, - config_file - ], - cwd=base_dir) + proc = sp.Popen(["bash", script_file_name, num_gpus, model_file, squad_dir, output_dir, config_file], cwd=base_dir) try: proc.communicate(timeout=timeout_sec) @@ -129,9 +109,7 @@ def test_e2e_squad_deepspeed_zero(tmpdir): print("evaluation result: ", json.dumps(eval_result)) - assert isclose(eval_result["exact_match"], - expected_exact_match, - abs_tol=1e-2) + assert isclose(eval_result["exact_match"], expected_exact_match, abs_tol=1e-2) assert isclose(eval_result["f1"], expected_f1, abs_tol=1e-2) else: diff --git a/tests/model/Megatron_GPT2/run_checkpoint_test.py b/tests/model/Megatron_GPT2/run_checkpoint_test.py index 628547ef2..c80e0fd20 100755 --- a/tests/model/Megatron_GPT2/run_checkpoint_test.py +++ b/tests/model/Megatron_GPT2/run_checkpoint_test.py @@ -39,6 +39,7 @@ def grep_loss_from_file(file_name): class GPT2CheckpointTestCase(BaseTestCase): + def __init__(self, methodName="DeepSpeed function test on GPT2 model"): super(GPT2CheckpointTestCase, self).__init__(methodName) @@ -480,8 +481,7 @@ class GPT2CheckpointTestCase(BaseTestCase): #-----------------Loading Checkpoint-----------------# # building checkpoint arguments - test_config[ - "other_args"] = f"\"--load {checkpoint_folder} {cpu_optimizer_flag} \"" + test_config["other_args"] = f"\"--load {checkpoint_folder} {cpu_optimizer_flag} \"" # set checkpoint load iteration try: @@ -543,24 +543,20 @@ def checkpoint_suite(): # Shrink DP suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero2')) - suite.addTest( - GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero2_offload')) + suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero2_offload')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero2')) - suite.addTest( - GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero2_offload')) + suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero2_offload')) # Expand DP suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero2')) - suite.addTest( - GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero2_offload')) + suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero2_offload')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero2')) - suite.addTest( - GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero2_offload')) + suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero2_offload')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_without_zero')) diff --git a/tests/model/Megatron_GPT2/run_func_test.py b/tests/model/Megatron_GPT2/run_func_test.py index 78a685e0f..bc70b64e6 100755 --- a/tests/model/Megatron_GPT2/run_func_test.py +++ 
b/tests/model/Megatron_GPT2/run_func_test.py @@ -34,6 +34,7 @@ def grep_loss_from_file(file_name): class GPT2FuncTestCase(BaseTestCase): + def __init__(self, methodName="DeepSpeed function test on GPT2 model"): super(GPT2FuncTestCase, self).__init__(methodName) @@ -454,9 +455,7 @@ class GPT2FuncTestCase(BaseTestCase): baseline_deepspeed_config = True test_config["other_args"] = f"\"{cpu_optimizer_flag}\"" - base_file = self.gen_output_name(test_config, - baseline_prefix, - baseline_config=baseline_deepspeed_config) + base_file = self.gen_output_name(test_config, baseline_prefix, baseline_config=baseline_deepspeed_config) # skip baseline run if it exists. if not self.has_loss_data(base_file): @@ -468,8 +467,7 @@ class GPT2FuncTestCase(BaseTestCase): # DeepSpeed run... test_config["deepspeed"] = True cpu_optimizer_flag = self.gen_cpu_optimizer_flag(test_config, False) - test_config[ - "other_args"] = f"\"--deepspeed-activation-checkpointing {cpu_optimizer_flag}\"" + test_config["other_args"] = f"\"--deepspeed-activation-checkpointing {cpu_optimizer_flag}\"" test_config["json"] = deepspeed_config print("{0}: DeepSpeed run.".format(self.id())) @@ -502,9 +500,7 @@ class GPT2FuncTestCase(BaseTestCase): test_config["other_args"] = f"\"{cpu_optimizer_flag}\"" # baseline run... - base_file = self.gen_output_name(test_config, - baseline_prefix, - baseline_config=baseline_deepspeed_config) + base_file = self.gen_output_name(test_config, baseline_prefix, baseline_config=baseline_deepspeed_config) # skip baseline run if it exists. if not self.has_loss_data(base_file): diff --git a/tests/model/Megatron_GPT2/run_perf_baseline.py b/tests/model/Megatron_GPT2/run_perf_baseline.py index 0c7233d5d..5b553fee5 100755 --- a/tests/model/Megatron_GPT2/run_perf_baseline.py +++ b/tests/model/Megatron_GPT2/run_perf_baseline.py @@ -8,6 +8,7 @@ from test_common import BaseTestCase class GPT2PerfBaselineTestCase(BaseTestCase): + def __init__(self, methodName="DeepSpeed performance test on GPT2 model"): super(GPT2PerfBaselineTestCase, self).__init__(methodName) @@ -88,9 +89,7 @@ class GPT2PerfBaselineTestCase(BaseTestCase): if exec_time == 0.0: print("{0}: no latency found in file {1}".format(self.id(), test_file)) else: - print("{0}: execution time per iteration is {1}ms.".format( - self.id(), - exec_time)) + print("{0}: execution time per iteration is {1}ms.".format(self.id(), exec_time)) def grep_latency_from_file(self, file_name): latency = 0.0 @@ -99,9 +98,7 @@ class GPT2PerfBaselineTestCase(BaseTestCase): with open(file_name, 'r') as f: lines = f.readlines() line_filter = "elapsed time per iteration" - match_number = re.compile( - r'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)' - ) + match_number = re.compile(r'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)') for line in lines: if line_filter in line: diff --git a/tests/model/Megatron_GPT2/run_perf_test.py b/tests/model/Megatron_GPT2/run_perf_test.py index 623f945a4..2bd9657c5 100755 --- a/tests/model/Megatron_GPT2/run_perf_test.py +++ b/tests/model/Megatron_GPT2/run_perf_test.py @@ -8,6 +8,7 @@ from test_common import BaseTestCase class GPT2PerfTestCase(BaseTestCase): + def __init__(self, methodName="DeepSpeed performance test on GPT2 model"): super(GPT2PerfTestCase, self).__init__(methodName) @@ -92,9 +93,7 @@ class GPT2PerfTestCase(BaseTestCase): if exec_time == 0.0: print("{0}: no latency found in file {1}".format(self.id(), test_file)) else: - print("{0}: execution time per iteration is 
{1}ms.".format( - self.id(), - exec_time)) + print("{0}: execution time per iteration is {1}ms.".format(self.id(), exec_time)) def grep_latency_from_file(self, file_name): latency = 0.0 @@ -103,9 +102,7 @@ class GPT2PerfTestCase(BaseTestCase): with open(file_name, 'r') as f: lines = f.readlines() line_filter = "elapsed time per iteration" - match_number = re.compile( - r'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)' - ) + match_number = re.compile(r'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)') for line in lines: if line_filter in line: diff --git a/tests/model/Megatron_GPT2/test_common.py b/tests/model/Megatron_GPT2/test_common.py index 6f9bec89e..e2006e16e 100755 --- a/tests/model/Megatron_GPT2/test_common.py +++ b/tests/model/Megatron_GPT2/test_common.py @@ -8,6 +8,7 @@ import time class BaseTestCase(unittest.TestCase): + def __init__(self, methodName="DeepSpeed performance test"): super(BaseTestCase, self).__init__(methodName) self.test_dir = "./test" @@ -24,30 +25,14 @@ class BaseTestCase(unittest.TestCase): if test_config["deepspeed"] and not baseline_config: file_name = "_mp{0}_gpu{1}_node{2}_bs{3}_step{4}_layer{5}_hidden{6}_seq{7}_head{8}{9}_ds{10}-{11}.log".format( - test_config["mp"], - test_config["gpus"], - test_config["nodes"], - test_config["bs"], - test_config["steps"], - test_config["layers"], - test_config["hidden_size"], - test_config["seq_length"], - test_config["heads"], - other_args, - zero_args, - self.timestr) + test_config["mp"], test_config["gpus"], test_config["nodes"], test_config["bs"], test_config["steps"], + test_config["layers"], test_config["hidden_size"], test_config["seq_length"], test_config["heads"], + other_args, zero_args, self.timestr) save_dir = self.test_dir else: file_name = "_mp{0}_gpu{1}_node{2}_bs{3}_step{4}_layer{5}_hidden{6}_seq{7}_head{8}{9}.log".format( - test_config["mp"], - test_config["gpus"], - test_config["nodes"], - test_config["bs"], - test_config["steps"], - test_config["layers"], - test_config["hidden_size"], - test_config["seq_length"], - test_config["heads"], + test_config["mp"], test_config["gpus"], test_config["nodes"], test_config["bs"], test_config["steps"], + test_config["layers"], test_config["hidden_size"], test_config["seq_length"], test_config["heads"], other_args) save_dir = self.baseline_dir @@ -66,31 +51,15 @@ class BaseTestCase(unittest.TestCase): def run_gpt2_test(self, test_config, output): ds_flag = "-d " + test_config["json"] if test_config["deepspeed"] else "" - ckpt_num = test_config[ - "ckpt_num_layers"] if "ckpt_num_layers" in test_config else 1 - other_args = "-o " + test_config[ - "other_args"] if "other_args" in test_config else "" + ckpt_num = test_config["ckpt_num_layers"] if "ckpt_num_layers" in test_config else 1 + other_args = "-o " + test_config["other_args"] if "other_args" in test_config else "" cmd = "./ds_gpt2_test.sh -m {0} -g {1} -n {2} -b {3} -s {4} -l {5} -h {6} -q {7} -e {8} -c {9} {10} {11}".format( - test_config["mp"], - test_config["gpus"], - test_config["nodes"], - test_config["bs"], - test_config["steps"], - test_config["layers"], - test_config["hidden_size"], - test_config["seq_length"], - test_config["heads"], - ckpt_num, - other_args, - ds_flag) + test_config["mp"], test_config["gpus"], test_config["nodes"], test_config["bs"], test_config["steps"], + test_config["layers"], test_config["hidden_size"], test_config["seq_length"], test_config["heads"], + ckpt_num, other_args, ds_flag) self.ensure_directory_exists(output) 
with open(output, "w") as f: print(cmd) - subprocess.run(cmd, - shell=True, - check=False, - executable='/bin/bash', - stdout=f, - stderr=f) + subprocess.run(cmd, shell=True, check=False, executable='/bin/bash', stdout=f, stderr=f) diff --git a/tests/onebit/test_mpi_backend.py b/tests/onebit/test_mpi_backend.py index bb8915f2c..414cc032b 100644 --- a/tests/onebit/test_mpi_backend.py +++ b/tests/onebit/test_mpi_backend.py @@ -35,8 +35,7 @@ def torch_sim(a): a_list = torch.chunk(a_compressed, chunks=dist.get_world_size()) server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list] a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) - a_server_compressed = torch.cat( - [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) + a_server_compressed = torch.cat([server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) rank = dist.get_rank() server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] get_accelerator().synchronize() diff --git a/tests/onebit/test_nccl_backend.py b/tests/onebit/test_nccl_backend.py index e544865b7..6d5bbcc54 100644 --- a/tests/onebit/test_nccl_backend.py +++ b/tests/onebit/test_nccl_backend.py @@ -40,8 +40,7 @@ def torch_sim(a): a_list = torch.chunk(a_compressed, chunks=dist.get_world_size()) server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list] a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) - a_server_compressed = torch.cat( - [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) + a_server_compressed = torch.cat([server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) rank = dist.get_rank() server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] get_accelerator().synchronize() @@ -86,7 +85,6 @@ if test_correctness: else: check_mag_mask = mpi_server[diff_server_mask] > magnitude_threshold if torch.sum(check_mag_mask) == 0: - print( - 'Successfully passed the test for NCCL Backend at Rank {}'.format(rank)) + print('Successfully passed the test for NCCL Backend at Rank {}'.format(rank)) else: print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) diff --git a/tests/onebit/test_nccl_perf.py b/tests/onebit/test_nccl_perf.py index aab93efac..6a027137d 100644 --- a/tests/onebit/test_nccl_perf.py +++ b/tests/onebit/test_nccl_perf.py @@ -83,9 +83,7 @@ if rank == 0: minlat = round(min(time_list) * convert) maxlat = round(max(time_list) * convert) meanlat = round(mean(time_list) * convert, places) -print("min, max, and mean = {} ms, {} ms, {} ms".format(minlat, - maxlat, - meanlat)) if rank == 0 else None +print("min, max, and mean = {} ms, {} ms, {} ms".format(minlat, maxlat, meanlat)) if rank == 0 else None #print("tensor shape", a.shape) duration = meanlat / 1e3 tput = ((tensor_size * 4) / duration) diff --git a/tests/perf/adam_test1.py b/tests/perf/adam_test1.py index 13d486d4d..16976ac5d 100755 --- a/tests/perf/adam_test1.py +++ b/tests/perf/adam_test1.py @@ -8,10 +8,7 @@ from deepspeed.accelerator import get_accelerator device = 'cpu' model_size = 1 * 1024**3 param = torch.nn.Parameter(torch.ones(model_size, device=device)) -param_fp16 = torch.nn.Parameter( - torch.ones(model_size, - dtype=torch.half, - device=get_accelerator().device_name(0))) +param_fp16 = torch.nn.Parameter(torch.ones(model_size, dtype=torch.half, device=get_accelerator().device_name(0))) optimizer = DeepSpeedCPUAdam([param]) #torch.set_num_threads(128) diff --git a/tests/small_model_debugging/stage3_test.py 
b/tests/small_model_debugging/stage3_test.py index ca85c00be..4f1cdb92d 100644 --- a/tests/small_model_debugging/stage3_test.py +++ b/tests/small_model_debugging/stage3_test.py @@ -10,6 +10,7 @@ import deepspeed class VerboseLinear(torch.nn.Linear): + def __init__(self, **kwargs): print(f'Begin VerboseLinear.__init__') super().__init__(**kwargs) @@ -17,21 +18,19 @@ class VerboseLinear(torch.nn.Linear): class LinearStack(torch.nn.Module): + def __init__(self, input_dim=2, hidden_dim=4, output_dim=4, num_layers=2): super().__init__() self.input_dim = input_dim self.output_dim = output_dim self.hidden_dim = hidden_dim - self.input_layer = VerboseLinear(in_features=self.input_dim, - out_features=self.hidden_dim) + self.input_layer = VerboseLinear(in_features=self.input_dim, out_features=self.hidden_dim) self.layers = torch.nn.ModuleList([ - torch.nn.Linear(in_features=self.hidden_dim, - out_features=self.hidden_dim, - bias=False) for x in range(num_layers) + torch.nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim, bias=False) + for x in range(num_layers) ]) - self.output_layer = torch.nn.Linear(in_features=self.hidden_dim, - out_features=self.output_dim) + self.output_layer = torch.nn.Linear(in_features=self.hidden_dim, out_features=self.output_dim) self.identity = torch.nn.Identity() def forward(self, x): diff --git a/tests/small_model_debugging/test.py b/tests/small_model_debugging/test.py index a97792df5..554aafd1a 100644 --- a/tests/small_model_debugging/test.py +++ b/tests/small_model_debugging/test.py @@ -28,10 +28,7 @@ def see_memory_usage(message): ) -tens = torch.rand(1024, - 16384, - dtype=torch.half, - device=torch.device(get_accelerator().device_name())) +tens = torch.rand(1024, 16384, dtype=torch.half, device=torch.device(get_accelerator().device_name())) tens_back = tens.detach().clone() #linear_bk = torch.nn.functional.linear @@ -45,9 +42,7 @@ y = model(tens) see_memory_usage("After forward") -model.weight.data = torch.zeros(1, - dtype=torch.half, - device=torch.device(get_accelerator().device_name())) +model.weight.data = torch.zeros(1, dtype=torch.half, device=torch.device(get_accelerator().device_name())) see_memory_usage("After weight zero") diff --git a/tests/small_model_debugging/test_model.py b/tests/small_model_debugging/test_model.py index 792d683ce..86337a355 100755 --- a/tests/small_model_debugging/test_model.py +++ b/tests/small_model_debugging/test_model.py @@ -10,6 +10,7 @@ import deepspeed.comm as dist class SimpleModel(torch.nn.Module): + def __init__(self, hidden_dim, empty_grad=False): super(SimpleModel, self).__init__() self.linear = torch.nn.Linear(hidden_dim, hidden_dim) @@ -33,14 +34,10 @@ def create_config_from_dict(tmpdir, config_dict): def get_data_loader(model, total_samples, hidden_dim, device): batch_size = model.train_micro_batch_size_per_gpu() train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=torch.half) - train_label = torch.empty(total_samples, - dtype=torch.long, - device=device).random_(hidden_dim) + train_label = torch.empty(total_samples, dtype=torch.long, device=device).random_(hidden_dim) train_dataset = torch.utils.data.TensorDataset(train_data, train_label) sampler = DistributedSampler(train_dataset) - train_loader = torch.utils.data.DataLoader(train_dataset, - batch_size=batch_size, - sampler=sampler) + train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=sampler) return train_loader @@ -91,10 +88,10 @@ hidden_dim = 4 model = SimpleModel(hidden_dim, 
empty_grad=False) -model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters(), - dist_init_required=True) +model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters(), + dist_init_required=True) def print_params(tag, model): @@ -103,10 +100,7 @@ def print_params(tag, model): print0("{} {}:{}".format(tag, n, p)) -data_loader = get_data_loader(model=model, - total_samples=1000, - hidden_dim=hidden_dim, - device=model.device) +data_loader = get_data_loader(model=model, total_samples=1000, hidden_dim=hidden_dim, device=model.device) #print_params('pre-train', model) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) diff --git a/tests/unit/alexnet_model.py b/tests/unit/alexnet_model.py index bdbaf0292..c1b0f7173 100644 --- a/tests/unit/alexnet_model.py +++ b/tests/unit/alexnet_model.py @@ -12,41 +12,23 @@ from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec class AlexNet(nn.Module): + def __init__(self, num_classes=10): super(AlexNet, self).__init__() self.features = nn.Sequential( - nn.Conv2d(3, - 64, - kernel_size=11, - stride=4, - padding=5), + nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=5), nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=2, - stride=2), - nn.Conv2d(64, - 192, - kernel_size=5, - padding=2), + nn.MaxPool2d(kernel_size=2, stride=2), + nn.Conv2d(64, 192, kernel_size=5, padding=2), nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=2, - stride=2), - nn.Conv2d(192, - 384, - kernel_size=3, - padding=1), + nn.MaxPool2d(kernel_size=2, stride=2), + nn.Conv2d(192, 384, kernel_size=3, padding=1), nn.ReLU(inplace=True), - nn.Conv2d(384, - 256, - kernel_size=3, - padding=1), + nn.Conv2d(384, 256, kernel_size=3, padding=1), nn.ReLU(inplace=True), - nn.Conv2d(256, - 256, - kernel_size=3, - padding=1), + nn.Conv2d(256, 256, kernel_size=3, padding=1), nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=2, - stride=2), + nn.MaxPool2d(kernel_size=2, stride=2), ) self.classifier = nn.Linear(256, num_classes) self.loss_fn = nn.CrossEntropyLoss() @@ -59,12 +41,14 @@ class AlexNet(nn.Module): class AlexNetPipe(AlexNet): + def to_layers(self): layers = [*self.features, lambda x: x.view(x.size(0), -1), self.classifier] return layers class AlexNetPipeSpec(PipelineModule): + def __init__(self, num_classes=10, **kwargs): self.num_classes = num_classes specs = [ @@ -81,9 +65,8 @@ class AlexNetPipeSpec(PipelineModule): LayerSpec(nn.Conv2d, 256, 256, kernel_size=3, padding=1), F.relu, LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2), - lambda x: x.view(x.size(0), -1), - LayerSpec(nn.Linear, 256, self.num_classes), # classifier + LayerSpec(nn.Linear, 256, self.num_classes), # classifier ] super().__init__(layers=specs, loss_fn=nn.CrossEntropyLoss(), **kwargs) @@ -99,12 +82,7 @@ def cifar_trainset(fp16=False): transform_list = [ transforms.ToTensor(), - transforms.Normalize((0.5, - 0.5, - 0.5), - (0.5, - 0.5, - 0.5)), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), ] if fp16: transform_list.append(torchvision.transforms.Lambda(cast_to_half)) @@ -117,23 +95,14 @@ def cifar_trainset(fp16=False): dist.barrier() if local_rank != 0: dist.barrier() - trainset = torchvision.datasets.CIFAR10(root='/blob/cifar10-data', - train=True, - download=True, - transform=transform) + trainset = torchvision.datasets.CIFAR10(root='/blob/cifar10-data', train=True, download=True, transform=transform) if local_rank == 0: dist.barrier() return trainset -def train_cifar(model, - config, - 
num_steps=400, - average_dp_losses=True, - fp16=True, - seed=123): - with get_accelerator().random().fork_rng( - devices=[get_accelerator().current_device_name()]): +def train_cifar(model, config, num_steps=400, average_dp_losses=True, fp16=True, seed=123): + with get_accelerator().random().fork_rng(devices=[get_accelerator().current_device_name()]): ds_utils.set_random_seed(seed) # disable dropout @@ -142,11 +111,10 @@ def train_cifar(model, trainset = cifar_trainset(fp16=fp16) config['local_rank'] = dist.get_rank() - engine, _, _, _ = deepspeed.initialize( - config=config, - model=model, - model_parameters=[p for p in model.parameters()], - training_data=trainset) + engine, _, _, _ = deepspeed.initialize(config=config, + model=model, + model_parameters=[p for p in model.parameters()], + training_data=trainset) losses = [] for step in range(num_steps): diff --git a/tests/unit/autotuning/test_autotuning.py b/tests/unit/autotuning/test_autotuning.py index 90b9c5b3a..ac4c13aa1 100644 --- a/tests/unit/autotuning/test_autotuning.py +++ b/tests/unit/autotuning/test_autotuning.py @@ -14,13 +14,11 @@ TUNE_OPTION = 'tune' def test_command_line(): '''Validate handling of command line arguments''' for opt in [RUN_OPTION, TUNE_OPTION]: - dsrun.parse_args( - args=f"--num_nodes 1 --num_gpus 1 --autotuning {opt} foo.py".split()) + dsrun.parse_args(args=f"--num_nodes 1 --num_gpus 1 --autotuning {opt} foo.py".split()) for error_opts in [ "--autotuning --num_nodes 1 --num_gpus 1 foo.py".split(), - "--autotuning test --num_nodes 1 -- num_gpus 1 foo.py".split(), - "--autotuning".split() + "--autotuning test --num_nodes 1 -- num_gpus 1 foo.py".split(), "--autotuning".split() ]: with pytest.raises(SystemExit): dsrun.parse_args(args=error_opts) @@ -65,18 +63,9 @@ def test_resource_manager_arg_mappings(arg_mappings): ] ) # yapf: disable def test_autotuner_resources(tmpdir, active_resources): - config_dict = { - "autotuning": { - "enabled": True, - "exps_dir": os.path.join(tmpdir, - 'exps_dir'), - "arg_mappings": {} - } - } + config_dict = {"autotuning": {"enabled": True, "exps_dir": os.path.join(tmpdir, 'exps_dir'), "arg_mappings": {}}} config_path = create_config_from_dict(tmpdir, config_dict) - args = dsrun.parse_args( - args=f'--autotuning {TUNE_OPTION} foo.py --deepspeed_config {config_path}'.split( - )) + args = dsrun.parse_args(args=f'--autotuning {TUNE_OPTION} foo.py --deepspeed_config {config_path}'.split()) tuner = Autotuner(args=args, active_resources=active_resources) expected_num_nodes = len(list(active_resources.keys())) diff --git a/tests/unit/checkpoint/common.py b/tests/unit/checkpoint/common.py index 5b89d6811..5da890d5c 100644 --- a/tests/unit/checkpoint/common.py +++ b/tests/unit/checkpoint/common.py @@ -22,10 +22,7 @@ def compare_deepspeed_states(saved_model, loaded_model): assert saved_model.global_steps == loaded_model.global_steps -def compare_model_states(saved_model, - loaded_model, - compare_optimizer=True, - load_module_only=False): +def compare_model_states(saved_model, loaded_model, compare_optimizer=True, load_module_only=False): if not load_module_only: compare_deepspeed_states(saved_model, loaded_model) @@ -38,7 +35,8 @@ def compare_model_states(saved_model, p0 = p0.half() assert id(p0) != id(p1), f'Comparing fp16 model state tensor against itself : {id(p0)} <====> {id(p1)}' try: - assert torch.allclose(p0, p1, atol=1e-07), f"FP16 model state {p0} is not equal to {p1}, names:{np0}, {np1}" + assert torch.allclose(p0, p1, + atol=1e-07), f"FP16 model state {p0} is not equal to {p1}, 
names:{np0}, {np1}" except RuntimeError as err: print(f"FP16 model state {p0} is not equal to {p1}, names:{np0}, {np1}") raise err @@ -46,14 +44,14 @@ def compare_model_states(saved_model, if not compare_optimizer: return - if DeepSpeedZeroOptimizer_Stage3 is not None and isinstance( - saved_model.optimizer, - DeepSpeedZeroOptimizer_Stage3): - for p0, p1 in zip(saved_model.optimizer.fp32_partitioned_groups_flat, loaded_model.optimizer.fp32_partitioned_groups_flat): + if DeepSpeedZeroOptimizer_Stage3 is not None and isinstance(saved_model.optimizer, DeepSpeedZeroOptimizer_Stage3): + for p0, p1 in zip(saved_model.optimizer.fp32_partitioned_groups_flat, + loaded_model.optimizer.fp32_partitioned_groups_flat): assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" elif isinstance(saved_model.optimizer, DeepSpeedZeroOptimizer): - for p0, p1 in zip(saved_model.optimizer.single_partition_of_fp32_groups, loaded_model.optimizer.single_partition_of_fp32_groups): + for p0, p1 in zip(saved_model.optimizer.single_partition_of_fp32_groups, + loaded_model.optimizer.single_partition_of_fp32_groups): assert id(p0) != id(p1), f'Comparing fp32 model state tensor against itself: {id(p0)} <====> {id(p1)}' assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" @@ -89,8 +87,7 @@ def compare_optimizer_states(saved_model, loaded_model, hidden_dim, fp16=True): saved_optimizer = saved_model.optimizer.optimizer if fp16 else saved_model.optimizer loaded_optimizer = loaded_model.optimizer.optimizer if fp16 else loaded_model.optimizer - for state0, state1 in zip(saved_optimizer.state.values(), - loaded_optimizer.state.values()): + for state0, state1 in zip(saved_optimizer.state.values(), loaded_optimizer.state.values()): compare_state_dicts(state0, state1) @@ -141,15 +138,12 @@ def checkpoint_correctness_verification(config_dict, load_lr_scheduler_states=False, fp16=True, train_batch=False, - base_optimizers=[None, - None], + base_optimizers=[None, None], empty_tag=False, seq_dataloader=False, load_module_only=False): dtype = torch.half if fp16 else torch.float32 - ds_model = create_deepspeed_model(config_dict=config_dict, - model=models[0], - base_optimizer=base_optimizers[0]) + ds_model = create_deepspeed_model(config_dict=config_dict, model=models[0], base_optimizer=base_optimizers[0]) if seq_dataloader: data_loader = sequence_dataloader(model=ds_model, @@ -196,11 +190,8 @@ def checkpoint_correctness_verification(config_dict, stored = sum(v for _, v in storages.items()) assert needed == stored, f"MoE expert checkpoint uses more storage than required: {f}" - loaded_model = create_deepspeed_model(config_dict=config_dict, - model=models[1], - base_optimizer=base_optimizers[1]) - assert list(trained_model.parameters())[0].dtype == list( - loaded_model.parameters())[0].dtype + loaded_model = create_deepspeed_model(config_dict=config_dict, model=models[1], base_optimizer=base_optimizers[1]) + assert list(trained_model.parameters())[0].dtype == list(loaded_model.parameters())[0].dtype loaded_model.load_checkpoint(save_folder, tag=save_tag, diff --git a/tests/unit/checkpoint/test_latest_checkpoint.py b/tests/unit/checkpoint/test_latest_checkpoint.py index 955edfdec..8f72b60d5 100644 --- a/tests/unit/checkpoint/test_latest_checkpoint.py +++ b/tests/unit/checkpoint/test_latest_checkpoint.py @@ -46,8 +46,6 @@ class TestLatestCheckpoint(DistributedTest): } hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _,_ = deepspeed.initialize(config=config_dict, 
- model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) # should be no-op, since latest doesn't exist model.load_checkpoint(tmpdir) diff --git a/tests/unit/checkpoint/test_lr_scheduler.py b/tests/unit/checkpoint/test_lr_scheduler.py index f6a8f5ebd..9a04d28d6 100644 --- a/tests/unit/checkpoint/test_lr_scheduler.py +++ b/tests/unit/checkpoint/test_lr_scheduler.py @@ -11,19 +11,8 @@ from unit.checkpoint.common import checkpoint_correctness_verification import pytest -@pytest.mark.parametrize('zero_stage, use_cpu_offload', - [(0, - False), - (1, - False), - (2, - False), - (2, - True), - (3, - False), - (3, - True)]) +@pytest.mark.parametrize('zero_stage, use_cpu_offload', [(0, False), (1, False), (2, False), (2, True), (3, False), + (3, True)]) class TestLRSchedulerCheckpoint(DistributedTest): world_size = 2 @@ -38,8 +27,7 @@ class TestLRSchedulerCheckpoint(DistributedTest): "type": 'Adam', "params": { "lr": 0.00015, - "betas": [0.8, - 0.999], + "betas": [0.8, 0.999], "eps": 1e-8, "weight_decay": 3e-7 } diff --git a/tests/unit/checkpoint/test_moe_checkpoint.py b/tests/unit/checkpoint/test_moe_checkpoint.py index edce2959a..72d4e11cd 100644 --- a/tests/unit/checkpoint/test_moe_checkpoint.py +++ b/tests/unit/checkpoint/test_moe_checkpoint.py @@ -19,20 +19,10 @@ class TestMoECheckpoint(DistributedTest): if not required_torch_version(): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - config_dict = { - "train_batch_size": 8, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } + config_dict = {"train_batch_size": 8, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 16 - models = [ - SimpleMoEModel(hidden_dim=hidden_dim, - num_experts=ep_size, - ep_size=ep_size) for _ in range(2) - ] + models = [SimpleMoEModel(hidden_dim=hidden_dim, num_experts=ep_size, ep_size=ep_size) for _ in range(2)] optimizers = [torch.optim.AdamW(params=model.parameters()) for model in models] checkpoint_correctness_verification(config_dict, models=models, @@ -45,15 +35,7 @@ class TestMoECheckpoint(DistributedTest): base_optimizers=optimizers, seq_dataloader=True) - @pytest.mark.parametrize("ep_size, load_optim_states", - [(4, - True), - (4, - False), - (2, - True), - (2, - False)]) + @pytest.mark.parametrize("ep_size, load_optim_states", [(4, True), (4, False), (2, True), (2, False)]) def test_checkpoint_moe_and_zero(self, tmpdir, ep_size, load_optim_states): if not required_torch_version(): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") @@ -65,8 +47,7 @@ class TestMoECheckpoint(DistributedTest): "type": 'Adam', "params": { "lr": 0.00015, - "betas": [0.8, - 0.999], + "betas": [0.8, 0.999], "eps": 1e-8, "weight_decay": 3e-7 } @@ -81,21 +62,11 @@ class TestMoECheckpoint(DistributedTest): } hidden_dim = 16 - models = [ - SimpleMoEModel(hidden_dim=hidden_dim, - num_experts=ep_size, - ep_size=ep_size) for _ in range(2) - ] + models = [SimpleMoEModel(hidden_dim=hidden_dim, num_experts=ep_size, ep_size=ep_size) for _ in range(2)] # param group must have a random unique name (for now) # TODO: clean-up this requirement, the unique name should not be required here - param_groups = [{ - 'params': [p for p in model.parameters()], - 'name': 'random-unique-name' - } for model in models] - params = [ - split_params_into_different_moe_groups_for_optimizer(group) - for group in param_groups - ] + param_groups = [{'params': [p for p in 
model.parameters()], 'name': 'random-unique-name'} for model in models] + params = [split_params_into_different_moe_groups_for_optimizer(group) for group in param_groups] optimizers = [torch.optim.AdamW(params=param) for param in params] checkpoint_correctness_verification(config_dict, models=models, diff --git a/tests/unit/checkpoint/test_other_optimizer.py b/tests/unit/checkpoint/test_other_optimizer.py index d09157a2c..758746992 100644 --- a/tests/unit/checkpoint/test_other_optimizer.py +++ b/tests/unit/checkpoint/test_other_optimizer.py @@ -14,8 +14,7 @@ import pytest class TestOtherOptimizerCheckpoint(DistributedTest): world_size = 2 - @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], - reason="lamb is not compatible") + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible") def test_checkpoint_unfused_optimizer(self, tmpdir): config_dict = { "train_batch_size": 2, @@ -74,8 +73,7 @@ class TestOtherOptimizerCheckpoint(DistributedTest): "type": "Adam", "params": { "lr": 0.00015, - "betas": [0.8, - 0.999], + "betas": [0.8, 0.999], "eps": 1e-8, "weight_decay": 3e-7 } @@ -111,8 +109,7 @@ class TestOtherOptimizerCheckpoint(DistributedTest): "type": "Adam", "params": { "lr": 0.00015, - "betas": [0.8, - 0.999], + "betas": [0.8, 0.999], "eps": 1e-8, "weight_decay": 3e-7 } diff --git a/tests/unit/checkpoint/test_sparse.py b/tests/unit/checkpoint/test_sparse.py index 4f07acebc..80db03586 100644 --- a/tests/unit/checkpoint/test_sparse.py +++ b/tests/unit/checkpoint/test_sparse.py @@ -11,33 +11,21 @@ import pytest class TestSparseCheckpoint(DistributedTest): world_size = 2 - @pytest.mark.parametrize(["to_save_model_has_embedding", - "to_save_model_sparse"], - [ - [False, - False], - [True, - False], - [True, - True], - ]) - @pytest.mark.parametrize(["destination_has_embedding", - "destination_sparse"], - [ - [False, - False], - [True, - False], - [True, - True], - ]) - def test_non_strict_load_sparse(self, - tmpdir, - to_save_model_has_embedding, - to_save_model_sparse, - destination_has_embedding, - destination_sparse): + @pytest.mark.parametrize(["to_save_model_has_embedding", "to_save_model_sparse"], [ + [False, False], + [True, False], + [True, True], + ]) + @pytest.mark.parametrize(["destination_has_embedding", "destination_sparse"], [ + [False, False], + [True, False], + [True, True], + ]) + def test_non_strict_load_sparse(self, tmpdir, to_save_model_has_embedding, to_save_model_sparse, + destination_has_embedding, destination_sparse): + class ModelNoEmbedding(torch.nn.Module): + def __init__(self): super().__init__() self.linear = torch.nn.Linear(3, 1) @@ -46,6 +34,7 @@ class TestSparseCheckpoint(DistributedTest): return self.linear(x) class ModelEmbedding(torch.nn.Module): + def __init__(self): super().__init__() self.emb = torch.nn.Embedding(10, 3) @@ -63,22 +52,24 @@ class TestSparseCheckpoint(DistributedTest): else: model_destination = ModelNoEmbedding() - engine_to_save, _, _, _ = deepspeed.initialize( - model=model_to_save, config={"train_batch_size": 2, "sparse_gradients": to_save_model_sparse} - ) - engine_destination, _, _, _ = deepspeed.initialize( - model=model_destination, config={"train_batch_size": 2, "sparse_gradients": destination_sparse} - ) + engine_to_save, _, _, _ = deepspeed.initialize(model=model_to_save, + config={ + "train_batch_size": 2, + "sparse_gradients": to_save_model_sparse + }) + engine_destination, _, _, _ = deepspeed.initialize(model=model_destination, + config={ + 
"train_batch_size": 2, + "sparse_gradients": destination_sparse + }) save_folder = os.path.join(tmpdir, 'saved_checkpoint') save_tag = '1' engine_to_save.save_checkpoint(save_folder, tag=save_tag) - is_sparse_destination = isinstance(model_destination, - ModelEmbedding) and destination_sparse - if isinstance(model_destination, - ModelEmbedding) and model_destination.emb.sparse: + is_sparse_destination = isinstance(model_destination, ModelEmbedding) and destination_sparse + if isinstance(model_destination, ModelEmbedding) and model_destination.emb.sparse: assert "emb.weight" in engine_destination.sparse_tensor_module_names engine_destination.load_checkpoint(save_folder, tag=save_tag, @@ -86,9 +77,7 @@ class TestSparseCheckpoint(DistributedTest): load_optimizer_states=False, load_lr_scheduler_states=False, load_module_only=False) - if isinstance(model_destination, - ModelEmbedding) and isinstance(model_to_save, - ModelEmbedding): + if isinstance(model_destination, ModelEmbedding) and isinstance(model_to_save, ModelEmbedding): assert engine_destination.sparse_tensor_module_names == engine_to_save.sparse_tensor_module_names elif isinstance(model_destination, ModelEmbedding): assert not is_sparse_destination or "emb.weight" in engine_destination.sparse_tensor_module_names diff --git a/tests/unit/checkpoint/test_tag_validation.py b/tests/unit/checkpoint/test_tag_validation.py index d94896223..76866ffaf 100644 --- a/tests/unit/checkpoint/test_tag_validation.py +++ b/tests/unit/checkpoint/test_tag_validation.py @@ -29,9 +29,7 @@ class TestCheckpointValidationTag(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _,_ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) if valid_mode == "FAIL": with pytest.raises(AssertionError): model.save_checkpoint(save_dir=tmpdir, tag=f"tag-{dist.get_rank()}") @@ -58,6 +56,4 @@ class TestCheckpointValidationTag(DistributedTest): model = SimpleModel(hidden_dim) with pytest.raises(deepspeed.DeepSpeedConfigError): - model, _, _,_ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) diff --git a/tests/unit/checkpoint/test_zero_optimizer.py b/tests/unit/checkpoint/test_zero_optimizer.py index 7de8e9bff..5d1696232 100644 --- a/tests/unit/checkpoint/test_zero_optimizer.py +++ b/tests/unit/checkpoint/test_zero_optimizer.py @@ -15,27 +15,11 @@ import pytest class TestZeROCheckpoint(DistributedTest): world_size = 2 - @pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', - [(1, - False, - 'Adam'), - (2, - False, - 'Adam'), - (2, - True, - 'deepspeed_adam'), - (3, - False, - 'Adam'), - (3, - True, - 'deepspeed_adam')]) - def test_load_optimizer_state(self, - tmpdir, - zero_stage, - use_cpu_offload, - adam_optimizer): + @pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', [(1, False, 'Adam'), (2, False, 'Adam'), + (2, True, 'deepspeed_adam'), + (3, False, 'Adam'), + (3, True, 'deepspeed_adam')]) + def test_load_optimizer_state(self, tmpdir, zero_stage, use_cpu_offload, adam_optimizer): if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") @@ -46,8 +30,7 @@ class TestZeROCheckpoint(DistributedTest): "type": 'Adam', "params": { "lr": 0.00015, 
- "betas": [0.8, - 0.999], + "betas": [0.8, 0.999], "eps": 1e-8, "weight_decay": 3e-7 } @@ -70,33 +53,13 @@ class TestZeROCheckpoint(DistributedTest): else: models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - checkpoint_correctness_verification(config_dict, - models, - hidden_dim, - tmpdir, - load_optimizer_states=True) + checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_optimizer_states=True) - @pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', - [(1, - False, - "Adam"), - (2, - False, - "Adam"), - (2, - True, - 'deepspeed_adam'), - (3, - False, - 'Adam'), - (3, - True, - 'deepspeed_adam')]) - def test_not_load_optimizer_state(self, - tmpdir, - zero_stage, - use_cpu_offload, - adam_optimizer): + @pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', [(1, False, "Adam"), (2, False, "Adam"), + (2, True, 'deepspeed_adam'), + (3, False, 'Adam'), + (3, True, 'deepspeed_adam')]) + def test_not_load_optimizer_state(self, tmpdir, zero_stage, use_cpu_offload, adam_optimizer): if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") @@ -107,8 +70,7 @@ class TestZeROCheckpoint(DistributedTest): "type": 'Adam', "params": { "lr": 0.00015, - "betas": [0.8, - 0.999], + "betas": [0.8, 0.999], "eps": 1e-8, "weight_decay": 3e-7 } @@ -131,11 +93,7 @@ class TestZeROCheckpoint(DistributedTest): else: models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - checkpoint_correctness_verification(config_dict, - models, - hidden_dim, - tmpdir, - load_optimizer_states=False) + checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_optimizer_states=False) @pytest.mark.parametrize('zero_stage', [1, 2]) def test_hybrid_optimizer_state(self, tmpdir, zero_stage): @@ -186,11 +144,7 @@ class TestZeROCheckpoint(DistributedTest): else: models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - checkpoint_correctness_verification(config_dict, - models, - hidden_dim, - tmpdir, - load_module_only=True) + checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_module_only=True) class ws4_model_checkpoint(DistributedFixture): @@ -214,22 +168,15 @@ class ws4_model_checkpoint(DistributedFixture): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=ds_config, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=8, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=ds_config, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=8, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) model.step() if load_optim: - torch.save(model.optimizer.optimizer.state_dict(), - os.path.join(class_tmpdir, - 'opt-state-dict')) + torch.save(model.optimizer.optimizer.state_dict(), os.path.join(class_tmpdir, 'opt-state-dict')) model.save_checkpoint(class_tmpdir) @@ -239,11 +186,7 @@ class ws4_model_checkpoint(DistributedFixture): class TestZeROElasticCheckpoint(DistributedTest): world_size = 2 - def test_elastic_checkpoint_fixed_dp(self, - tmpdir, - elastic_save, - elastic_load, - load_optim): + def test_elastic_checkpoint_fixed_dp(self, tmpdir, elastic_save, elastic_load, load_optim): ds_config = { "train_batch_size": 2, 
"optimizer": { @@ -263,54 +206,39 @@ class TestZeROElasticCheckpoint(DistributedTest): # torch 1.2.* stores raw tensor id numbers in checkpoint state which leads to # false positive mismatches in checkpoint state comparisons. # Newer torch versions store tensor ids as 0, 1, 2, ... - expected_mismatch_keys = [] if required_minimum_torch_version(1, - 4) else ['params'] + expected_mismatch_keys = [] if required_minimum_torch_version(1, 4) else ['params'] models = [SimpleModel(hidden_dim) for _ in range(2)] model, _, _, _ = deepspeed.initialize(config=ds_config, - model=models[0], - model_parameters=models[0].parameters()) - data_loader = random_dataloader(model=model, - total_samples=8, - hidden_dim=hidden_dim, - device=model.device) + model=models[0], + model_parameters=models[0].parameters()) + data_loader = random_dataloader(model=model, total_samples=8, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) model.step() if load_optim: - torch.save(model.optimizer.optimizer.state_dict(), - os.path.join(tmpdir, - 'opt-state-dict')) + torch.save(model.optimizer.optimizer.state_dict(), os.path.join(tmpdir, 'opt-state-dict')) model.save_checkpoint(tmpdir) ds_config["zero_optimization"]["elastic_checkpoint"] = elastic_load model, _, _, _ = deepspeed.initialize(config=ds_config, - model=models[1], - model_parameters=models[1].parameters()) + model=models[1], + model_parameters=models[1].parameters()) model.load_checkpoint(tmpdir, load_optimizer_states=load_optim) if load_optim: saved_sd = torch.load(os.path.join(tmpdir, 'opt-state-dict')) curr_sd = model.optimizer.optimizer.state_dict() for curr_param_group, saved_param_group in zip(curr_sd['param_groups'], saved_sd['param_groups']): - compare_state_dicts(curr_param_group, - saved_param_group, - expected_mismatch_keys) + compare_state_dicts(curr_param_group, saved_param_group, expected_mismatch_keys) - data_loader = random_dataloader(model=model, - total_samples=8, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=8, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) model.step() - def test_elastic_checkpoint_change_dp(self, - ws4_model_checkpoint, - class_tmpdir, - elastic_save, - elastic_load, + def test_elastic_checkpoint_change_dp(self, ws4_model_checkpoint, class_tmpdir, elastic_save, elastic_load, load_optim): ds_config = { "train_batch_size": 4, @@ -330,9 +258,7 @@ class TestZeROElasticCheckpoint(DistributedTest): model = SimpleModel(hidden_dim) # Load checkpoint with dp world size = 2 - model, _, _, _ = deepspeed.initialize(config=ds_config, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=ds_config, model=model, model_parameters=model.parameters()) if load_optim: with pytest.raises(deepspeed.runtime.zero.utils.ZeRORuntimeException): model.load_checkpoint(class_tmpdir, load_optimizer_states=load_optim) @@ -361,9 +287,7 @@ class TestZeROSaveLoadEdgeCase(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - ds_model = create_deepspeed_model(config_dict=config_dict, - model=model, - base_optimizer=None) + ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None) ds_model.save_checkpoint(tmpdir) ds_model.load_checkpoint(tmpdir, load_optimizer_states=False, @@ -390,9 +314,7 @@ class 
TestZeROSaveLoadEdgeCase(DistributedTest): # 1. pretrain a model and save it dtype = torch.half - ds_model = create_deepspeed_model(config_dict=config_dict, - model=model, - base_optimizer=None) + ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None) data_loader = random_dataloader(model=ds_model, total_samples=1, hidden_dim=hidden_dim, @@ -405,9 +327,7 @@ class TestZeROSaveLoadEdgeCase(DistributedTest): ds_model.save_checkpoint(tmpdir) # 2. load and immediately save a model with a fresh ds engine - ds_model = create_deepspeed_model(config_dict=config_dict, - model=model, - base_optimizer=None) + ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None) ds_model.load_checkpoint(tmpdir, load_optimizer_states=False, load_lr_scheduler_states=False, @@ -438,9 +358,7 @@ class TestZeROSaveLoadEdgeCase(DistributedTest): # This test reproduces a bug where one tries to retrieve a 16bit model before grad_accum # cycle was completed. # So we config grad_accum=2 and step only once and save_16bit_model - ds_model = create_deepspeed_model(config_dict=config_dict, - model=model, - base_optimizer=None) + ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None) data_loader = random_dataloader(model=ds_model, total_samples=2, diff --git a/tests/unit/comm/test_dist.py b/tests/unit/comm/test_dist.py index 6005c926f..54b504ba4 100644 --- a/tests/unit/comm/test_dist.py +++ b/tests/unit/comm/test_dist.py @@ -31,6 +31,7 @@ def greeting(request): class TestDistArgs(DistributedTest): world_size = 2 """ Classes that use DistributedTest class must define a test* method """ + @pytest.mark.parametrize("shape", ["icosahedron"]) def test(self, number, color, shape, greeting): """Ensure that we can parse args to DistributedTest methods. 
""" @@ -118,8 +119,7 @@ class TestDistInit(DistributedTest): init_distributed = False def test_already_init(self, dist_init_required): - torch.distributed.init_process_group( - get_accelerator().communication_backend_name()) + torch.distributed.init_process_group(get_accelerator().communication_backend_name()) deepspeed.init_distributed(get_accelerator().communication_backend_name(), dist_init_required=dist_init_required) @@ -130,9 +130,8 @@ class TestDistInit(DistributedTest): else: # torch.dist is not done and for some reason the user says they don't want it done with pytest.raises(Exception): - deepspeed.init_distributed( - get_accelerator().communication_backend_name(), - dist_init_required=dist_init_required) + deepspeed.init_distributed(get_accelerator().communication_backend_name(), + dist_init_required=dist_init_required) class TestDistInitNoEnv(DistributedTest): @@ -141,14 +140,12 @@ class TestDistInitNoEnv(DistributedTest): set_dist_env = False def test(self): - torch.distributed.init_process_group( - backend=get_accelerator().communication_backend_name(), - init_method=f"tcp://127.0.0.1:{get_master_port()}", - world_size=1, - rank=0) + torch.distributed.init_process_group(backend=get_accelerator().communication_backend_name(), + init_method=f"tcp://127.0.0.1:{get_master_port()}", + world_size=1, + rank=0) assert torch.distributed.is_initialized() - deepspeed.init_distributed(get_accelerator().communication_backend_name(), - auto_mpi_discovery=True) + deepspeed.init_distributed(get_accelerator().communication_backend_name(), auto_mpi_discovery=True) @pytest.mark.parametrize("dist_init_required", [True, False]) @@ -156,45 +153,26 @@ class TestDistInitWithModel(DistributedTest): init_distributed = False def test_already_init(self, dist_init_required): - torch.distributed.init_process_group( - get_accelerator().communication_backend_name()) + torch.distributed.init_process_group(get_accelerator().communication_backend_name()) model = SimpleModel(4) - config_dict = { - "train_micro_batch_size_per_gpu": 1, - "optimizer": { - "type": "Adam", - "params": {} - } - } - engine, *_ = deepspeed.initialize( - model=model, - config=config_dict, - model_parameters=model.parameters(), - dist_init_required=dist_init_required - ) + config_dict = {"train_micro_batch_size_per_gpu": 1, "optimizer": {"type": "Adam", "params": {}}} + engine, *_ = deepspeed.initialize(model=model, + config=config_dict, + model_parameters=model.parameters(), + dist_init_required=dist_init_required) def test_no_init(self, dist_init_required): model = SimpleModel(4) - config_dict = { - "train_micro_batch_size_per_gpu": 1, - "optimizer": { - "type": "Adam", - "params": {} - } - } + config_dict = {"train_micro_batch_size_per_gpu": 1, "optimizer": {"type": "Adam", "params": {}}} if dist_init_required: - engine, *_ = deepspeed.initialize( - model=model, - config=config_dict, - model_parameters=model.parameters(), - dist_init_required=dist_init_required - ) + engine, *_ = deepspeed.initialize(model=model, + config=config_dict, + model_parameters=model.parameters(), + dist_init_required=dist_init_required) else: # torch.dist is not done and for some reason the user says they don't want it done with pytest.raises(Exception): - engine, *_ = deepspeed.initialize( - model=model, - config=config_dict, - model_parameters=model.parameters(), - dist_init_required=dist_init_required - ) + engine, *_ = deepspeed.initialize(model=model, + config=config_dict, + model_parameters=model.parameters(), + dist_init_required=dist_init_required) 
diff --git a/tests/unit/common.py b/tests/unit/common.py index acc778a88..0a1c02ce7 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -49,12 +49,10 @@ def set_accelerator_visible(): # CUDA_VISIBLE_DEVICES is not set, discover it using accelerator specific command instead import subprocess if get_accelerator().device_name() == 'cuda': - is_rocm_pytorch = hasattr(torch.version, - 'hip') and torch.version.hip is not None + is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None if is_rocm_pytorch: rocm_smi = subprocess.check_output(['rocm-smi', '--showid']) - gpu_ids = filter(lambda s: 'GPU' in s, - rocm_smi.decode('utf-8').strip().split('\n')) + gpu_ids = filter(lambda s: 'GPU' in s, rocm_smi.decode('utf-8').strip().split('\n')) num_gpus = len(list(gpu_ids)) else: nvidia_smi = subprocess.check_output(['nvidia-smi', '--list-gpus']) @@ -124,8 +122,7 @@ class DistributedExec(ABC): return fixture_kwargs def _launch_procs(self, num_procs): - if get_accelerator().is_available( - ) and get_accelerator().device_count() < num_procs: + if get_accelerator().is_available() and get_accelerator().device_count() < num_procs: pytest.skip( f"Skipping test because not enough GPUs are available: {num_procs} required, {get_accelerator().device_count()} available" ) @@ -165,11 +162,9 @@ class DistributedExec(ABC): p.terminate() pytest.fail(f'Worker {rank} hung.', pytrace=False) if p.exitcode < 0: - pytest.fail(f'Worker {rank} killed by signal {-p.exitcode}', - pytrace=False) + pytest.fail(f'Worker {rank} killed by signal {-p.exitcode}', pytrace=False) if p.exitcode > 0: - pytest.fail(f'Worker {rank} exited with code {p.exitcode}', - pytrace=False) + pytest.fail(f'Worker {rank} exited with code {p.exitcode}', pytrace=False) if not skip_msg.empty(): # This assumed all skip messages are the same, it may be useful to @@ -273,9 +268,7 @@ class DistributedFixture(DistributedExec): def __init__(self): assert isinstance(self.world_size, int), "Only one world size is allowed for distributed fixtures" self.__name__ = type(self).__name__ - _pytestfixturefunction = FixtureFunctionMarker(scope="function", - params=None, - name=self.__name__) + _pytestfixturefunction = FixtureFunctionMarker(scope="function", params=None, name=self.__name__) class DistributedTest(DistributedExec): diff --git a/tests/unit/compression/test_compression.py b/tests/unit/compression/test_compression.py index 6bfca255b..a0cd35d05 100644 --- a/tests/unit/compression/test_compression.py +++ b/tests/unit/compression/test_compression.py @@ -15,9 +15,8 @@ from unit.common import DistributedTest TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) -pytestmark = pytest.mark.skipif( - TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 5), - reason='Megatron-LM package requires Pytorch version 1.5 or above') +pytestmark = pytest.mark.skipif(TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 5), + reason='Megatron-LM package requires Pytorch version 1.5 or above') def reset_random(seed=1234): @@ -73,6 +72,7 @@ class Conv1D(torch.nn.Module): nf (`int`): The number of output features. nx (`int`): The number of input features. 
""" + def __init__(self, nf, nx): super().__init__() self.nf = nf @@ -95,6 +95,7 @@ def create_conv1d_model(): class TestCompression(DistributedTest): + def setup_method(self, method): reset_random() @@ -132,8 +133,7 @@ class TestCompression(DistributedTest): "target_bits": 8, "quantization_period": 50 }, - "modules": ["attention.self", - "intermediate"] + "modules": ["attention.self", "intermediate"] }, "wq2": { "params": { @@ -205,9 +205,7 @@ class TestCompression(DistributedTest): "dense_ratio": 0.5 }, "modules": ["attention.output.dense"], - "related_modules": [["self.query", - "self.key", - "self.value"]] + "related_modules": [["self.query", "self.key", "self.value"]] } } } @@ -220,12 +218,9 @@ class TestCompression(DistributedTest): model = create_bert_model() compressed_model = init_compression(model, self.get_ds_config()) - assert isinstance(compressed_model.layer[0].attention.self.query, - LinearLayer_Compress) - assert isinstance(compressed_model.layer[0].attention.self.key, - LinearLayer_Compress) - assert isinstance(compressed_model.layer[0].attention.self.value, - LinearLayer_Compress) + assert isinstance(compressed_model.layer[0].attention.self.query, LinearLayer_Compress) + assert isinstance(compressed_model.layer[0].attention.self.key, LinearLayer_Compress) + assert isinstance(compressed_model.layer[0].attention.self.value, LinearLayer_Compress) def test_mpu_compress(self, tmpdir): TORCH_MAJOR = int(torch.__version__.split(".")[0]) @@ -242,21 +237,14 @@ class TestCompression(DistributedTest): model = get_gpt2_model(args_defaults) compressed_model = init_compression(model, self.get_ds_config(), mpu=mpu) - assert isinstance( - compressed_model.module.language_model.transformer.layers[0].attention. - query_key_value, - ColumnParallelLinear_Compress) - assert isinstance( - compressed_model.module.language_model.transformer.layers[0].attention.dense, - RowParallelLinear_Compress) - assert isinstance( - compressed_model.module.language_model.transformer.layers[0].mlp. - dense_h_to_4h, - ColumnParallelLinear_Compress) - assert isinstance( - compressed_model.module.language_model.transformer.layers[0].mlp. 
- dense_4h_to_h, - RowParallelLinear_Compress) + assert isinstance(compressed_model.module.language_model.transformer.layers[0].attention.query_key_value, + ColumnParallelLinear_Compress) + assert isinstance(compressed_model.module.language_model.transformer.layers[0].attention.dense, + RowParallelLinear_Compress) + assert isinstance(compressed_model.module.language_model.transformer.layers[0].mlp.dense_h_to_4h, + ColumnParallelLinear_Compress) + assert isinstance(compressed_model.module.language_model.transformer.layers[0].mlp.dense_4h_to_h, + RowParallelLinear_Compress) def test_conv1d_convertion(self, tmpdir): model = create_conv1d_model() diff --git a/tests/unit/elasticity/test_elastic.py b/tests/unit/elasticity/test_elastic.py index e29b2a22e..dd7ef8f62 100644 --- a/tests/unit/elasticity/test_elastic.py +++ b/tests/unit/elasticity/test_elastic.py @@ -14,10 +14,7 @@ def ds_config(): "elasticity": { "enabled": True, "max_train_batch_size": 10000, - "micro_batch_sizes": [8, - 12, - 16, - 17], + "micro_batch_sizes": [8, 12, 16, 17], "min_gpus": 32, "max_gpus": 1500, "min_time": 20, @@ -28,9 +25,8 @@ def ds_config(): def test_basic_10k(ds_config): - final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version=ds_version) + final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version) for gpu_num in valid_gpus: assert final_batch_size % gpu_num == 0, f"Batch {final_batch_size} is not divisible by GPU count {gpu_num}" @@ -49,61 +45,51 @@ def test_basic_10k(ds_config): def test_old_version(ds_config): with pytest.raises(deepspeed.elasticity.config.ElasticityError): - final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version="0.2") + final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version="0.2") def test_disabled(ds_config): ds_config['elasticity']['enabled'] = False with pytest.raises(deepspeed.elasticity.config.ElasticityError): - final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version=ds_version) + final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version) def test_valid_world_size(ds_config): final_batch_size, valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version=ds_version, - world_size=64) + ds_config=ds_config, target_deepspeed_version=ds_version, world_size=64) assert mbsize == 17 def test_invalid_world_size(ds_config): with pytest.raises(deepspeed.elasticity.config.ElasticityIncompatibleWorldSize): final_batch_size, valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version=ds_version, - world_size=128) + ds_config=ds_config, target_deepspeed_version=ds_version, world_size=128) def test_future_elastic_version(ds_config): ds_config['elasticity']['version'] = '0.3' with pytest.raises(deepspeed.elasticity.config.ElasticityError): - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version) def test_missing_max_batch(ds_config): del ds_config['elasticity']['max_train_batch_size'] with 
pytest.raises(deepspeed.elasticity.config.ElasticityError): - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version) def test_missing_micro_batch(ds_config): del ds_config['elasticity']['micro_batch_sizes'] with pytest.raises(deepspeed.elasticity.config.ElasticityError): - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version) def test_empty_config(): ds_config = {"elasticity": {"enabled": True}} with pytest.raises(deepspeed.elasticity.config.ElasticityError): - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version) def test_model_parallel_v1_invalid(ds_config): @@ -112,8 +98,7 @@ def test_model_parallel_v1_invalid(ds_config): ds_config["elasticity"]["version"] = 0.1 with pytest.raises(deepspeed.elasticity.config.ElasticityError): - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version) def test_model_parallel_v2_invalid(ds_config): @@ -133,37 +118,17 @@ def test_model_parallel_v2_valid(ds_config): ds_config["elasticity"]["version"] = 0.2 os.environ["WORLD_SIZE"] = str(16) - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version) os.environ.pop("WORLD_SIZE") -@pytest.mark.parametrize('key, value', - [('micro_batch_sizes', - [1, - 4, - -1, - 2, - -10]), - ('min_gpus', - -1), - ('max_gpus', - -1), - ('micro_batch_sizes', - 5), - ('micro_batch_sizes', - ['a', - None, - 0.5]), - ('micro_batch_sizes', - [2, - 0.5, - 4])]) +@pytest.mark.parametrize('key, value', [('micro_batch_sizes', [1, 4, -1, 2, -10]), ('min_gpus', -1), ('max_gpus', -1), + ('micro_batch_sizes', 5), ('micro_batch_sizes', ['a', None, 0.5]), + ('micro_batch_sizes', [2, 0.5, 4])]) def test_invalid_config_values(key, value, ds_config): ds_config['elasticity'][key] = value with pytest.raises(deepspeed.elasticity.config.ElasticityError): - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version) def test_proper_mbsz(ds_config): @@ -171,9 +136,7 @@ def test_proper_mbsz(ds_config): ds_config["elasticity"]["micro_batch_sizes"] = [1, 2, 3, 7] ds_config["elasticity"]["min_gpus"] = 1 final_batch_size, valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version=ds_version, - world_size=7) + ds_config=ds_config, target_deepspeed_version=ds_version, world_size=7) assert mbsize == 3 @@ -194,10 +157,7 @@ class TestNonElasticBatchParams(DistributedTest): "elasticity": { "enabled": True, "max_train_batch_size": 4, - "micro_batch_sizes": [1, - 2, - 3, - 4], + "micro_batch_sizes": [1, 2, 3, 4], "min_gpus": 1, "max_gpus": 4, "min_time": 20, @@ -209,9 +169,7 @@ class TestNonElasticBatchParams(DistributedTest): model = SimpleModel(hidden_dim, empty_grad=False) with 
pytest.raises(deepspeed.elasticity.config.ElasticityError): - model, _, _,_ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) class TestNonElasticBatchParamsWithOverride(DistributedTest): @@ -231,10 +189,7 @@ class TestNonElasticBatchParamsWithOverride(DistributedTest): "elasticity": { "enabled": True, "max_train_batch_size": 4, - "micro_batch_sizes": [1, - 2, - 3, - 4], + "micro_batch_sizes": [1, 2, 3, 4], "min_gpus": 1, "max_gpus": 4, "min_time": 20, @@ -245,9 +200,7 @@ class TestNonElasticBatchParamsWithOverride(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim, empty_grad=False) - model, _, _,_ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) class TestElasticConfigChanged(DistributedTest): @@ -267,10 +220,7 @@ class TestElasticConfigChanged(DistributedTest): "elasticity": { "enabled": True, "max_train_batch_size": 4, - "micro_batch_sizes": [1, - 2, - 3, - 4], + "micro_batch_sizes": [1, 2, 3, 4], "min_gpus": 1, "max_gpus": 4, "min_time": 20, @@ -287,6 +237,4 @@ class TestElasticConfigChanged(DistributedTest): model = SimpleModel(hidden_dim, empty_grad=False) with pytest.raises(deepspeed.elasticity.config.ElasticityError): - model, _, _,_ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) diff --git a/tests/unit/inference/test_checkpoint_sharding.py b/tests/unit/inference/test_checkpoint_sharding.py index a8ff13122..f8fdcf3c1 100644 --- a/tests/unit/inference/test_checkpoint_sharding.py +++ b/tests/unit/inference/test_checkpoint_sharding.py @@ -10,6 +10,7 @@ from transformers import AutoConfig, AutoModelForCausalLM def check_dtype(model, expected_dtype): + def find_dtype(module): for child in module.children(): if isinstance(child, DeepSpeedTransformerInference): @@ -21,17 +22,11 @@ def check_dtype(model, expected_dtype): found_dtype = find_dtype(model) assert found_dtype, "Did not find DeepSpeedTransformerInference in model" - assert ( - found_dtype == expected_dtype - ), f"Expected transformer dtype {expected_dtype}, but found {found_dtype}" + assert (found_dtype == expected_dtype), f"Expected transformer dtype {expected_dtype}, but found {found_dtype}" -@pytest.fixture(params=[ - "bigscience/bloom-560m", - "EleutherAI/gpt-j-6B", - "EleutherAI/gpt-neo-125M", - "facebook/opt-125m" -]) +@pytest.fixture( + params=["bigscience/bloom-560m", "EleutherAI/gpt-j-6B", "EleutherAI/gpt-neo-125M", "facebook/opt-125m"]) def model_name(request): return request.param @@ -55,13 +50,11 @@ class save_shard(DistributedFixture): "tensor_parallel": { "tp_size": world_size }, - "save_mp_checkpoint_path": os.path.join(str(class_tmpdir), - model_name), + "save_mp_checkpoint_path": os.path.join(str(class_tmpdir), model_name), } # Load model and save sharded checkpoint - model = AutoModelForCausalLM.from_pretrained(model_name, - torch_dtype=torch.float16) + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) model = deepspeed.init_inference(model, config=inf_config) @@ -78,17 +71,14 @@ class TestCheckpointShard(DistributedTest): "tensor_parallel": { "tp_size": world_size }, - "checkpoint": 
os.path.join(class_tmpdir, - model_name, - "ds_inference_config.json"), + "checkpoint": os.path.join(class_tmpdir, model_name, "ds_inference_config.json"), } # Load model on meta tensors model_config = AutoConfig.from_pretrained(model_name) # Note that we use half precision to load initially, even for int8 with deepspeed.OnDevice(dtype=torch.float16, device="meta"): - model = AutoModelForCausalLM.from_config(model_config, - torch_dtype=torch.bfloat16) + model = AutoModelForCausalLM.from_config(model_config, torch_dtype=torch.bfloat16) model = model.eval() model = deepspeed.init_inference(model, config=inf_config) check_dtype(model, dtype) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 371ecda71..93efd9206 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -50,26 +50,17 @@ _gpt_models = [ "bigscience/bloom-560m", ] _opt_models = [ - "facebook/opt-125m", # 125m, 1.7B, ..., 175B variants have the same model architecture. - "facebook/opt-350m", # 350m applies layer norm after attnention layer which is different than other variants. + "facebook/opt-125m", # 125m, 1.7B, ..., 175B variants have the same model architecture. + "facebook/opt-350m", # 350m applies layer norm after attention layer which is different than other variants. ] _all_models = HfApi().list_models() test_models = set(_bert_models + _roberta_models + _gpt_models + _opt_models) test_tasks = [ - "fill-mask", - "question-answering", - "text-classification", - "token-classification", - "text-generation", - "text2text-generation", - "summarization", - "translation" + "fill-mask", "question-answering", "text-classification", "token-classification", "text-generation", + "text2text-generation", "summarization", "translation" ] -pytest.all_models = { - task: [m.modelId for m in _all_models if m.pipeline_tag == task] - for task in test_tasks -} +pytest.all_models = {task: [m.modelId for m in _all_models if m.pipeline_tag == task] for task in test_tasks} _model_w_tasks = itertools.product(*[test_models, test_tasks]) @@ -116,8 +107,7 @@ def invalid_model_task_config(model_w_task, dtype, enable_cuda_graph): msg = f"Not a valid model / task combination: {model} / {task}" elif enable_cuda_graph and (torch_info["cuda_version"] == "0.0"): msg = "CUDA not detected, cannot use CUDA Graph" - elif enable_cuda_graph and pkg_version.parse( - torch.__version__) < pkg_version.parse("1.10"): + elif enable_cuda_graph and pkg_version.parse(torch.__version__) < pkg_version.parse("1.10"): msg = "CUDA Graph is only available in torch versions >= 1.10" elif "gpt-j-6B" in model: if dtype != torch.half: @@ -144,16 +134,7 @@ statement for each combination of model /task @pytest.fixture def query(model_w_task): model, task = model_w_task - angle_bracket_mask_models = [ - "roberta", - "camembert", - "esm", - "ibert", - "luke", - "mpnet", - "yoso", - "mpnet" - ] + angle_bracket_mask_models = ["roberta", "camembert", "esm", "ibert", "luke", "mpnet", "yoso", "mpnet"] if task == "fill-mask": if any(map(lambda x: x in model, angle_bracket_mask_models)): @@ -208,18 +189,15 @@ def token_classification_assert(x, y): def text_generation_assert(x, y): - return set(res["generated_text"] for res in x) == set(res["generated_text"] - for res in y) + return set(res["generated_text"] for res in x) == set(res["generated_text"] for res in y) def text2text_generation_assert(x, y): - return set(res["generated_text"] for res in x) == set(res["generated_text"] - for res in y) + return
set(res["generated_text"] for res in x) == set(res["generated_text"] for res in y) def translation_assert(x, y): - return set(res["translation_text"] for res in x) == set(res["translation_text"] - for res in y) + return set(res["translation_text"] for res in x) == set(res["translation_text"] for res in y) def summarization_assert(x, y): @@ -246,6 +224,7 @@ def assert_fn(model_w_task): def check_injection(model): + def verify_injection(module): for child in module.children(): if isinstance(child, nn.ModuleList): @@ -331,19 +310,11 @@ class TestModelTask(DistributedTest): @pytest.mark.seq_inference -@pytest.mark.parametrize("model_w_task", - [("EleutherAI/gpt-neo-1.3B", - "text-generation"), - ("EleutherAI/gpt-neox-20b", - "text-generation"), - ("bigscience/bloom-3b", - "text-generation"), - ("EleutherAI/gpt-j-6B", - "text-generation")], - ids=["gpt-neo", - "gpt-neox", - "bloom", - "gpt-j"]) +@pytest.mark.parametrize("model_w_task", [("EleutherAI/gpt-neo-1.3B", "text-generation"), + ("EleutherAI/gpt-neox-20b", "text-generation"), + ("bigscience/bloom-3b", "text-generation"), + ("EleutherAI/gpt-j-6B", "text-generation")], + ids=["gpt-neo", "gpt-neox", "bloom", "gpt-j"]) class TestMPSize(DistributedTest): world_size = 4 @@ -385,21 +356,14 @@ class TestMPSize(DistributedTest): @pytest.mark.parametrize( "model_w_task, injection_policy", [ - (("google/t5-v1_1-small", - "text2text-generation"), - { - T5Block: ('SelfAttention.o', - 'EncDecAttention.o', - 'DenseReluDense.wo') - }), - (("roberta-large", - "fill-mask"), - { - RobertaLayer: ('output.dense') - }), + (("google/t5-v1_1-small", "text2text-generation"), { + T5Block: ('SelfAttention.o', 'EncDecAttention.o', 'DenseReluDense.wo') + }), + (("roberta-large", "fill-mask"), { + RobertaLayer: ('output.dense') + }), ], - ids=["t5", - "roberta"], + ids=["t5", "roberta"], ) @pytest.mark.parametrize("dtype", [torch.float], ids=["fp32"]) @pytest.mark.parametrize("enable_cuda_graph", [False], ids=["noCG"]) @@ -446,8 +410,7 @@ class TestInjectionPolicy(DistributedTest): @pytest.mark.parametrize( "model_w_task", [ - ("Helsinki-NLP/opus-mt-en-de", - "translation"), + ("Helsinki-NLP/opus-mt-en-de", "translation"), ], ids=[ "marian", @@ -480,9 +443,7 @@ class TestAutoTensorParallelism(DistributedTest): pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt") bs_output = pipe(query, **inf_kwargs) - pipe.model = deepspeed.init_inference(pipe.model, - mp_size=world_size, - dtype=dtype) + pipe.model = deepspeed.init_inference(pipe.model, mp_size=world_size, dtype=dtype) # Switch device to GPU so that input tensors are not on CPU pipe.device = torch.device(get_accelerator().device_name(local_rank)) ds_output = pipe(query, **inf_kwargs) @@ -496,12 +457,9 @@ class TestAutoTensorParallelism(DistributedTest): @pytest.mark.parametrize( "model_family, model_name", ( - ["gpt2", - "EleutherAI/gpt-neo-2.7B"], - ["gpt2", - "EleutherAI/gpt-j-6B"], - ["gpt2", - "gpt2-xl"], + ["gpt2", "EleutherAI/gpt-neo-2.7B"], + ["gpt2", "EleutherAI/gpt-j-6B"], + ["gpt2", "gpt2-xl"], ), ) @pytest.mark.parametrize("task", ["lambada_standard"]) @@ -522,15 +480,13 @@ class TestLMCorrectness(DistributedTest): if 'gpt-j-6B' in model_name: dtype = torch.half - lm = lm_eval.models.get_model(model_family).create_from_arg_string( - f"pretrained={model_name}", - {"device": "cpu"}) + lm = lm_eval.models.get_model(model_family).create_from_arg_string(f"pretrained={model_name}", + {"device": "cpu"}) setattr(lm, model_family, getattr(lm, model_family).half().to(device)) lm._device = 
device else: lm = lm_eval.models.get_model(model_family).create_from_arg_string( - f"pretrained={model_name}", - {"device": get_accelerator().device_name()}) + f"pretrained={model_name}", {"device": get_accelerator().device_name()}) get_accelerator().synchronize() start = time.time() @@ -539,8 +495,7 @@ class TestLMCorrectness(DistributedTest): bs_time = time.time() - start ds_model = deepspeed.init_inference( - getattr(lm, - model_family), + getattr(lm, model_family), mp_size=1, dtype=dtype, replace_with_kernel_inject=True, @@ -554,7 +509,6 @@ class TestLMCorrectness(DistributedTest): get_accelerator().synchronize() ds_time = time.time() - start - ppl_diff = abs(bs_output["results"][task]["ppl"] - - ds_output["results"][task]["ppl"]) + ppl_diff = abs(bs_output["results"][task]["ppl"] - ds_output["results"][task]["ppl"]) #assert ds_time <= bs_time assert ppl_diff < 0.01 diff --git a/tests/unit/inference/test_model_profiling.py b/tests/unit/inference/test_model_profiling.py index 07ce83930..d64fe1b7b 100644 --- a/tests/unit/inference/test_model_profiling.py +++ b/tests/unit/inference/test_model_profiling.py @@ -32,32 +32,19 @@ def inf_kwargs(task): @pytest.mark.inference -@pytest.mark.parametrize("model,task", - [ - ("bert-base-cased", - "fill-mask"), - ("roberta-base", - "fill-mask"), - ("gpt2", - "text-generation"), - ("facebook/opt-125m", - "text-generation"), - ("bigscience/bloom-560m", - "text-generation"), - ]) +@pytest.mark.parametrize("model,task", [ + ("bert-base-cased", "fill-mask"), + ("roberta-base", "fill-mask"), + ("gpt2", "text-generation"), + ("facebook/opt-125m", "text-generation"), + ("bigscience/bloom-560m", "text-generation"), +]) @pytest.mark.parametrize("cuda_graphs", [True, False]) @pytest.mark.parametrize("use_cuda_events", [True, False]) class TestModelProfiling(DistributedTest): world_size = 1 - def test(self, - model, - task, - query, - inf_kwargs, - cuda_graphs, - use_cuda_events, - dtype=torch.float16): + def test(self, model, task, query, inf_kwargs, cuda_graphs, use_cuda_events, dtype=torch.float16): if cuda_graphs and "bert" not in model: pytest.skip(f"CUDA Graph not supported for {model}") diff --git a/tests/unit/launcher/test_ds_arguments.py b/tests/unit/launcher/test_ds_arguments.py index 9d7af74f2..505d906de 100644 --- a/tests/unit/launcher/test_ds_arguments.py +++ b/tests/unit/launcher/test_ds_arguments.py @@ -82,12 +82,7 @@ def test_no_ds_parser(): def test_core_deepscale_arguments(): parser = basic_parser() parser = deepspeed.add_config_arguments(parser) - args = parser.parse_args( - ['--num_epochs', - '2', - '--deepspeed', - '--deepspeed_config', - 'foo.json']) + args = parser.parse_args(['--num_epochs', '2', '--deepspeed', '--deepspeed_config', 'foo.json']) assert args assert hasattr(args, 'num_epochs') diff --git a/tests/unit/megatron_model.py b/tests/unit/megatron_model.py index 32faf2244..dfe4cbe1b 100644 --- a/tests/unit/megatron_model.py +++ b/tests/unit/megatron_model.py @@ -31,12 +31,7 @@ def get_gpt2_model(args_others, mp_size=1): args_defaults.update(args_others) # setting "make-vocab-size-divisible-by" to avoid word-embedding size change in resizing testing. 
- sys.argv.extend([ - '--model-parallel-size', - str(mp_size), - '--make-vocab-size-divisible-by', - str(1) - ]) + sys.argv.extend(['--model-parallel-size', str(mp_size), '--make-vocab-size-divisible-by', str(1)]) initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True) model = GPT2Model(num_tokentypes=0, parallel_output=False) @@ -44,15 +39,13 @@ def get_gpt2_model(args_others, mp_size=1): from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP from megatron import mpu i = get_accelerator().current_device_name() - model = torchDDP(model, - device_ids=[i], - output_device=i, - process_group=mpu.get_data_parallel_group()) + model = torchDDP(model, device_ids=[i], output_device=i, process_group=mpu.get_data_parallel_group()) return model class MockGPT2ModelPipe(PipelineModule): + def __init__(self, num_layers, mp_size, args_others, topo, **kwargs): from megatron.initialize import initialize_megatron @@ -65,38 +58,25 @@ class MockGPT2ModelPipe(PipelineModule): args_defaults.update(args_others) # setting "make-vocab-size-divisible-by" to avoid word-embedding size change in resizing testing. - sys.argv.extend([ - '--model-parallel-size', - str(mp_size), - '--make-vocab-size-divisible-by', - str(1) - ]) + sys.argv.extend(['--model-parallel-size', str(mp_size), '--make-vocab-size-divisible-by', str(1)]) initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True) from megatron.model.transformer import ParallelTransformerLayer class ParallelTransformerLayerPipe(ParallelTransformerLayer): + def forward(self, args): # hardcode attn mask for testing, PP requires the attn_mask to be stashed - attention_mask = torch.tensor( - [[True]], - device=get_accelerator().current_device_name()) + attention_mask = torch.tensor([[True]], device=get_accelerator().current_device_name()) return super().forward(args, attention_mask) layers = [] for x in range(num_layers): layers.append( - LayerSpec(ParallelTransformerLayerPipe, - self.gpt2_attention_mask_func, - self.init_method_normal(0.02), - self.scaled_init_method_normal(0.02, - num_layers), - x)) - super().__init__(layers=layers, - loss_fn=torch.nn.CrossEntropyLoss(), - topology=topo, - **kwargs) + LayerSpec(ParallelTransformerLayerPipe, self.gpt2_attention_mask_func, self.init_method_normal(0.02), + self.scaled_init_method_normal(0.02, num_layers), x)) + super().__init__(layers=layers, loss_fn=torch.nn.CrossEntropyLoss(), topology=topo, **kwargs) def gpt2_attention_mask_func(self, attention_scores, ltor_mask): attention_scores.masked_fill_(ltor_mask, -10000.0) @@ -104,6 +84,7 @@ class MockGPT2ModelPipe(PipelineModule): def init_method_normal(self, sigma): """Init method based on N(0, sigma).""" + def init_(tensor): return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) diff --git a/tests/unit/model_parallelism/test_configurable_parallel_mp.py b/tests/unit/model_parallelism/test_configurable_parallel_mp.py index 569b46e57..6a847442e 100644 --- a/tests/unit/model_parallelism/test_configurable_parallel_mp.py +++ b/tests/unit/model_parallelism/test_configurable_parallel_mp.py @@ -13,12 +13,9 @@ from unit.megatron_model import get_gpt2_model, get_megatron_version TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) -pytestmark = pytest.mark.skipif( - TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 5), - reason='Megatron-LM package requires Pytorch version 1.5 or above') -pytestmark = pytest.mark.skipif( - TORCH_MAJOR > 1, - reason='Megatron-LM package 
requires Pytorch version 1.13 or below') +pytestmark = pytest.mark.skipif(TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 5), + reason='Megatron-LM package requires Pytorch version 1.5 or above') +pytestmark = pytest.mark.skipif(TORCH_MAJOR > 1, reason='Megatron-LM package requires Pytorch version 1.13 or below') def get_deepspeed_model(model): @@ -33,14 +30,15 @@ def get_deepspeed_model(model): } from megatron import mpu - model, _, _,_ = deepspeed.initialize(model=model, - mpu=mpu, - model_parameters=model.parameters(), - config=ds_config_dict) + model, _, _, _ = deepspeed.initialize(model=model, + mpu=mpu, + model_parameters=model.parameters(), + config=ds_config_dict) return model class ConfigurableMP(DistributedTest): + @pytest.fixture(autouse=True) def reset_random(self, seed=1234): random.seed(seed) @@ -52,15 +50,12 @@ class ConfigurableMP(DistributedTest): def inputs(self, bs=1, seq_len=20): input_ids = torch.randint(low=0, high=1000, size=(bs, seq_len)) position_ids = torch.randint(low=0, high=2, size=(bs, seq_len)) - attention_mask = torch.randint(low=0, - high=2, - size=(bs, - seq_len), - dtype=torch.bool) + attention_mask = torch.randint(low=0, high=2, size=(bs, seq_len), dtype=torch.bool) return [input_ids, position_ids, attention_mask] class TestConfigurableMP(ConfigurableMP): + @pytest.mark.world_size(1) def test_gpt2_basic(self, tmpdir, inputs): args_defaults = { @@ -75,22 +70,18 @@ class TestConfigurableMP(ConfigurableMP): model.eval() device_name = get_accelerator().device_name() - baseline = model(inputs[0].to(device_name), - inputs[1].to(device_name), - inputs[2].to(device_name)) + baseline = model(inputs[0].to(device_name), inputs[1].to(device_name), inputs[2].to(device_name)) tag = 'mp_1' state_dict = {} state_dict['checkpoint_version'] = get_megatron_version() model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) dist.barrier() - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) + model.load_checkpoint(tmpdir, tag=tag, load_optimizer_states=False, load_lr_scheduler_states=False) test = model(inputs[0], inputs[1], inputs[2]) - assert torch.allclose(baseline, test, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" + assert torch.allclose(baseline, test, + atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" @pytest.mark.world_size(2) def test_gpt2_mp2_no_resize(self, tmpdir, inputs): @@ -107,25 +98,19 @@ class TestConfigurableMP(ConfigurableMP): model.eval() device_name = get_accelerator().device_name() - baseline = model(inputs[0].to(device_name), - inputs[1].to(device_name), - inputs[2].to(device_name)) + baseline = model(inputs[0].to(device_name), inputs[1].to(device_name), inputs[2].to(device_name)) tag = 'mp_2' state_dict = {} state_dict['checkpoint_version'] = get_megatron_version() model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) dist.barrier() - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) + model.load_checkpoint(tmpdir, tag=tag, load_optimizer_states=False, load_lr_scheduler_states=False) device_name = get_accelerator().device_name() - test = model(inputs[0].to(device_name), - inputs[1].to(device_name), - inputs[2].to(device_name)) - assert torch.allclose(baseline, test, rtol=1.0, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" + test = model(inputs[0].to(device_name), inputs[1].to(device_name), 
inputs[2].to(device_name)) + assert torch.allclose(baseline, test, rtol=1.0, + atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" # This fixture provides the baseline model with mp=2 to TestConfigurableMPResize @@ -147,9 +132,7 @@ class baseline_mp2(DistributedFixture): with torch.no_grad(): device_name = get_accelerator().device_name() - baseline = model(inputs[0].to(device_name), - inputs[1].to(device_name), - inputs[2].to(device_name)) + baseline = model(inputs[0].to(device_name), inputs[1].to(device_name), inputs[2].to(device_name)) if dist.get_rank() == 0: save_path = os.path.join(class_tmpdir, "output.pt") torch.save(baseline.cpu(), save_path) @@ -177,15 +160,13 @@ class TestConfigurableResizeMP(ConfigurableMP): model.eval() with torch.no_grad(): - model.load_checkpoint(class_tmpdir, - load_optimizer_states=False, - load_lr_scheduler_states=False) + model.load_checkpoint(class_tmpdir, load_optimizer_states=False, load_lr_scheduler_states=False) device_name = get_accelerator().device_name() - test = model(inputs[0].to(device_name), - inputs[1].to(device_name), - inputs[2].to(device_name)) + test = model(inputs[0].to(device_name), inputs[1].to(device_name), inputs[2].to(device_name)) if dist.get_rank() == 0: load_path = os.path.join(class_tmpdir, "output.pt") baseline = torch.load(load_path) test = test.cpu() - assert torch.allclose(baseline, test, atol=1e-03), f"Baseline output {baseline} is not equal to save-then-load output {test}" + assert torch.allclose( + baseline, test, + atol=1e-03), f"Baseline output {baseline} is not equal to save-then-load output {test}" diff --git a/tests/unit/model_parallelism/test_configurable_parallel_pp.py b/tests/unit/model_parallelism/test_configurable_parallel_pp.py index a76680bb3..1f79ed101 100644 --- a/tests/unit/model_parallelism/test_configurable_parallel_pp.py +++ b/tests/unit/model_parallelism/test_configurable_parallel_pp.py @@ -15,12 +15,9 @@ from deepspeed.accelerator import get_accelerator TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) -pytestmark = pytest.mark.skipif( - TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 5), - reason='Megatron-LM package requires Pytorch version 1.5 or above') -pytestmark = pytest.mark.skipif( - TORCH_MAJOR > 1, - reason='Megatron-LM package requires Pytorch version 1.13 or below') +pytestmark = pytest.mark.skipif(TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 5), + reason='Megatron-LM package requires Pytorch version 1.5 or above') +pytestmark = pytest.mark.skipif(TORCH_MAJOR > 1, reason='Megatron-LM package requires Pytorch version 1.13 or below') def get_deepspeed_model(model): @@ -34,9 +31,7 @@ def get_deepspeed_model(model): }, } - model, _, _,_ = deepspeed.initialize(model=model, - model_parameters=model.parameters(), - config=ds_config_dict) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=ds_config_dict) return model.to(get_accelerator().device_name()) @@ -51,6 +46,7 @@ def get_topology(mp, pp, world_size): class ConfigurablePP(DistributedTest): + @pytest.fixture(autouse=True) def reset_random(self, seed=1234): random.seed(seed) @@ -61,11 +57,7 @@ class ConfigurablePP(DistributedTest): @pytest.fixture def inputs(self, bs=1, seq_len=1, hidden_size=128): hidden_states = torch.randn(bs, seq_len, hidden_size) - attention_mask = torch.randint(low=0, - high=2, - size=(bs, - seq_len), - dtype=torch.bool) + attention_mask = torch.randint(low=0, high=2, 
size=(bs, seq_len), dtype=torch.bool) return (hidden_states, attention_mask) @@ -105,20 +97,13 @@ class TestConfigurablePP(ConfigurablePP): else: data_iter = None - baseline = model.eval_batch(data_iter=data_iter, - compute_loss=False, - reduce_output=None) + baseline = model.eval_batch(data_iter=data_iter, compute_loss=False, reduce_output=None) dist.barrier() - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) + model.load_checkpoint(tmpdir, tag=tag, load_optimizer_states=False, load_lr_scheduler_states=False) dist.barrier() - test = model.eval_batch(data_iter=data_iter, - compute_loss=False, - reduce_output=None) + test = model.eval_batch(data_iter=data_iter, compute_loss=False, reduce_output=None) if test is not None: assert len(baseline) == len(test) @@ -126,7 +111,9 @@ class TestConfigurablePP(ConfigurablePP): for mb in range(len(baseline)): for b, t in zip(baseline[mb], test[mb]): if b.is_floating_point(): # don't compare masks - assert torch.allclose(b, t, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" + assert torch.allclose( + b, t, + atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" # Fixture for defining the checkpoint path since all tests in @@ -142,7 +129,8 @@ class _baseline(DistributedFixture): world_size = None def run(self, inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size): - assert int(os.environ["WORLD_SIZE"]) == (pp_size * mp_size), "world size does not match provided pp_size and mp_size" + assert int(os.environ["WORLD_SIZE"]) == (pp_size * + mp_size), "world size does not match provided pp_size and mp_size" args_defaults = { 'num_layers': 8, 'hidden_size': 128, @@ -166,9 +154,7 @@ class _baseline(DistributedFixture): else: data_iter = None - baseline = model.eval_batch(data_iter=data_iter, - compute_loss=False, - reduce_output=None) + baseline = model.eval_batch(data_iter=data_iter, compute_loss=False, reduce_output=None) if baseline is not None: # baseline should be [[hidden, True]]] @@ -180,9 +166,7 @@ class _baseline(DistributedFixture): state_dict = {} state_dict['checkpoint_version'] = get_megatron_version() - model.save_checkpoint(class_tmpdir, - tag=checkpoint_tag, - client_state=state_dict) + model.save_checkpoint(class_tmpdir, tag=checkpoint_tag, client_state=state_dict) # This may look odd, but there is a limitation with DistributedFixture that @@ -201,14 +185,8 @@ class baseline_ws4(_baseline): class TestConfigurableResizePP(ConfigurablePP): - def _test(self, - inputs, - class_tmpdir, - checkpoint_tag, - mp_size, - pp_size, - mp_resize, - pp_resize): + + def _test(self, inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size, mp_resize, pp_resize): args_defaults = { 'num_layers': 8, 'hidden_size': 128, @@ -236,9 +214,7 @@ class TestConfigurableResizePP(ConfigurablePP): else: data_iter = None - test = model.eval_batch(data_iter=data_iter, - compute_loss=False, - reduce_output=None) + test = model.eval_batch(data_iter=data_iter, compute_loss=False, reduce_output=None) if test is not None: # test should be [[hidden, True]]] @@ -248,108 +224,37 @@ class TestConfigurableResizePP(ConfigurablePP): test = test[0][0].cpu() load_path = os.path.join(class_tmpdir, f"output-{checkpoint_tag}.pt") baseline = torch.load(load_path) - assert torch.allclose(baseline, test, atol=1e-03), f"Baseline output {baseline} is not equal to save-then-load output {test}" + assert torch.allclose( + baseline, test, + atol=1e-03), f"Baseline output 
{baseline} is not equal to save-then-load output {test}" # These tests are divided by baseline model worldsize and test model worldsize @pytest.mark.world_size(1) @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(1, 2, 1, 1)]) - def test_world_size_2to1(self, - inputs, - class_tmpdir, - checkpoint_tag, - baseline_ws2, - mp_size, - pp_size, - mp_resize, + def test_world_size_2to1(self, inputs, class_tmpdir, checkpoint_tag, baseline_ws2, mp_size, pp_size, mp_resize, pp_resize): - self._test(inputs, - class_tmpdir, - checkpoint_tag, - mp_size, - pp_size, - mp_resize, - pp_resize) + self._test(inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size, mp_resize, pp_resize) @pytest.mark.world_size(1) @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(2, 2, 1, 1)]) - def test_world_size_4to1(self, - inputs, - class_tmpdir, - checkpoint_tag, - baseline_ws4, - mp_size, - pp_size, - mp_resize, + def test_world_size_4to1(self, inputs, class_tmpdir, checkpoint_tag, baseline_ws4, mp_size, pp_size, mp_resize, pp_resize): - self._test(inputs, - class_tmpdir, - checkpoint_tag, - mp_size, - pp_size, - mp_resize, - pp_resize) + self._test(inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size, mp_resize, pp_resize) @pytest.mark.world_size(2) @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(2, 2, 2, 1)]) - def test_world_size_4to2(self, - inputs, - class_tmpdir, - checkpoint_tag, - baseline_ws4, - mp_size, - pp_size, - mp_resize, + def test_world_size_4to2(self, inputs, class_tmpdir, checkpoint_tag, baseline_ws4, mp_size, pp_size, mp_resize, pp_resize): - self._test(inputs, - class_tmpdir, - checkpoint_tag, - mp_size, - pp_size, - mp_resize, - pp_resize) + self._test(inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size, mp_resize, pp_resize) @pytest.mark.world_size(4) @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(1, 1, 2, 2)]) - def test_world_size_1to4(self, - inputs, - class_tmpdir, - checkpoint_tag, - baseline_ws1, - mp_size, - pp_size, - mp_resize, + def test_world_size_1to4(self, inputs, class_tmpdir, checkpoint_tag, baseline_ws1, mp_size, pp_size, mp_resize, pp_resize): - self._test(inputs, - class_tmpdir, - checkpoint_tag, - mp_size, - pp_size, - mp_resize, - pp_resize) + self._test(inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size, mp_resize, pp_resize) @pytest.mark.world_size(4) - @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", - [(1, - 2, - 1, - 4), - (2, - 1, - 2, - 2)]) - def test_world_size_2to4(self, - inputs, - class_tmpdir, - checkpoint_tag, - baseline_ws2, - mp_size, - pp_size, - mp_resize, + @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(1, 2, 1, 4), (2, 1, 2, 2)]) + def test_world_size_2to4(self, inputs, class_tmpdir, checkpoint_tag, baseline_ws2, mp_size, pp_size, mp_resize, pp_resize): - self._test(inputs, - class_tmpdir, - checkpoint_tag, - mp_size, - pp_size, - mp_resize, - pp_resize) + self._test(inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size, mp_resize, pp_resize) diff --git a/tests/unit/modeling.py b/tests/unit/modeling.py index 94dea4546..09415bb5f 100644 --- a/tests/unit/modeling.py +++ b/tests/unit/modeling.py @@ -48,20 +48,15 @@ from deepspeed.accelerator import get_accelerator logger = logging.getLogger(__name__) PRETRAINED_MODEL_ARCHIVE_MAP = { - 'bert-base-uncased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", - 'bert-large-uncased': - 
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", - 'bert-base-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", - 'bert-large-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz", 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", - 'bert-base-chinese': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", } CONFIG_NAME = 'bert_config.json' WEIGHTS_NAME = 'pytorch_model.bin' @@ -76,9 +71,8 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): import numpy as np import tensorflow as tf except ImportError: - print( - "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") raise tf_path = os.path.abspath(tf_checkpoint_path) print("Converting TensorFlow checkpoint from {}".format(tf_path)) @@ -183,6 +177,7 @@ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} class GPUTimer: + def __init__(self): super().__init__() self.start = get_accelerator().Event() # noqa: F821 @@ -202,13 +197,7 @@ class LinearActivation(Module): """ __constants__ = ['bias'] - def __init__(self, - in_features, - out_features, - weights, - biases, - act='gelu', - bias=True): + def __init__(self, in_features, out_features, weights, biases, act='gelu', bias=True): super(LinearActivation, self).__init__() self.in_features = in_features self.out_features = out_features @@ -256,15 +245,14 @@ class LinearActivation(Module): return self.act_fn(F.linear(input, self.weight, self.bias)) def extra_repr(self): - return 'in_features={}, out_features={}, bias={}'.format( - self.in_features, - self.out_features, - self.bias is not None) + return 'in_features={}, out_features={}, bias={}'.format(self.in_features, self.out_features, self.bias + is not None) class BertConfig(object): """Configuration class to store the configuration of a `BertModel`. """ + def __init__(self, vocab_size_or_config_json_file, hidden_size=768, @@ -361,11 +349,10 @@ try: #apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward') BertLayerNorm = apex.normalization.FusedLayerNorm except ImportError: - print( - "Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex." - ) + print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.") class BertLayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-12): """Construct a layernorm module in the TF style (epsilon inside the square root). 
""" @@ -384,13 +371,12 @@ except ImportError: class BertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings. """ + def __init__(self, config): super(BertEmbeddings, self).__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file @@ -399,9 +385,7 @@ class BertEmbeddings(nn.Module): def forward(self, input_ids, token_type_ids=None): seq_length = input_ids.size(1) - position_ids = torch.arange(seq_length, - dtype=torch.long, - device=input_ids.device) + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) if token_type_ids is None: token_type_ids = torch.zeros_like(input_ids) @@ -417,13 +401,12 @@ class BertEmbeddings(nn.Module): class BertSelfAttention(nn.Module): + def __init__(self, i, config, weights, biases): super(BertSelfAttention, self).__init__() if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, - config.num_attention_heads)) + raise ValueError("The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size @@ -451,14 +434,12 @@ class BertSelfAttention(nn.Module): #self.softmax = DeepSpeedSoftmax(i, self.softmax_config) def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) def transpose_key_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 3, 1) @@ -494,6 +475,7 @@ class BertSelfAttention(nn.Module): class BertSelfOutput(nn.Module): + def __init__(self, config, weights, biases): super(BertSelfOutput, self).__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -513,6 +495,7 @@ class BertSelfOutput(nn.Module): class BertAttention(nn.Module): + def __init__(self, i, config, weights, biases): super(BertAttention, self).__init__() self.self = BertSelfAttention(i, config, weights, biases) @@ -528,6 +511,7 @@ class BertAttention(nn.Module): class BertIntermediate(nn.Module): + def __init__(self, config, weights, biases): super(BertIntermediate, self).__init__() self.dense_act = LinearActivation(config.hidden_size, @@ -542,6 +526,7 @@ class BertIntermediate(nn.Module): class BertOutput(nn.Module): + def __init__(self, config, weights, biases): super(BertOutput, self).__init__() self.dense = 
nn.Linear(config.intermediate_size, config.hidden_size) @@ -558,6 +543,7 @@ class BertOutput(nn.Module): class BertLayer(nn.Module): + def __init__(self, i, config, weights, biases): super(BertLayer, self).__init__() self.attention = BertAttention(i, config, weights, biases) @@ -580,26 +566,14 @@ class BertLayer(nn.Module): self.biases[2].register_hook(lambda x, self=self: grads.append([x, "V_B"])) self.weight[3].register_hook(lambda x, self=self: grads.append([x, "O_W"])) self.biases[3].register_hook(lambda x, self=self: grads.append([x, "O_B"])) - self.attention.output.LayerNorm.weight.register_hook( - lambda x, - self=self: grads.append([x, - "N2_W"])) - self.attention.output.LayerNorm.bias.register_hook( - lambda x, - self=self: grads.append([x, - "N2_B"])) + self.attention.output.LayerNorm.weight.register_hook(lambda x, self=self: grads.append([x, "N2_W"])) + self.attention.output.LayerNorm.bias.register_hook(lambda x, self=self: grads.append([x, "N2_B"])) self.weight[5].register_hook(lambda x, self=self: grads.append([x, "int_W"])) self.biases[5].register_hook(lambda x, self=self: grads.append([x, "int_B"])) self.weight[6].register_hook(lambda x, self=self: grads.append([x, "out_W"])) self.biases[6].register_hook(lambda x, self=self: grads.append([x, "out_B"])) - self.output.LayerNorm.weight.register_hook( - lambda x, - self=self: grads.append([x, - "norm_W"])) - self.output.LayerNorm.bias.register_hook( - lambda x, - self=self: grads.append([x, - "norm_B"])) + self.output.LayerNorm.weight.register_hook(lambda x, self=self: grads.append([x, "norm_W"])) + self.output.LayerNorm.bias.register_hook(lambda x, self=self: grads.append([x, "norm_B"])) return layer_output @@ -608,17 +582,14 @@ class BertLayer(nn.Module): class BertEncoder(nn.Module): + def __init__(self, config, weights, biases): super(BertEncoder, self).__init__() #layer = BertLayer(config, weights, biases) self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.layer = nn.ModuleList([ - copy.deepcopy(BertLayer(i, - config, - weights, - biases)) for i in range(config.num_hidden_layers) - ]) + self.layer = nn.ModuleList( + [copy.deepcopy(BertLayer(i, config, weights, biases)) for i in range(config.num_hidden_layers)]) self.grads = [] self.graph = [] @@ -640,14 +611,11 @@ class BertEncoder(nn.Module): self.graph.append(mdl) self.get_modules(self, mdl, input) - def forward(self, - hidden_states, - attention_mask, - output_all_encoded_layers=True, - checkpoint_activations=False): + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, checkpoint_activations=False): all_encoder_layers = [] def custom(start, end): + def custom_forward(*inputs): layers = self.layer[start:end] x_ = inputs[0] @@ -662,23 +630,13 @@ class BertEncoder(nn.Module): num_layers = len(self.layer) chunk_length = math.ceil(math.sqrt(num_layers)) while l < num_layers: - hidden_states = checkpoint.checkpoint(custom(l, - l + chunk_length), - hidden_states, - attention_mask * 1) + hidden_states = checkpoint.checkpoint(custom(l, l + chunk_length), hidden_states, attention_mask * 1) l += chunk_length # decoder layers else: for i, layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, - attention_mask, - self.grads, - collect_all_grads=True) - hidden_states.register_hook( - lambda x, - i=i, - self=self: self.grads.append([x, - "hidden_state"])) + hidden_states = layer_module(hidden_states, attention_mask, self.grads, collect_all_grads=True) + hidden_states.register_hook(lambda x, i=i, 
self=self: self.grads.append([x, "hidden_state"])) #print("pytorch weight is: ", layer_module.get_w()) if output_all_encoded_layers: @@ -707,11 +665,10 @@ class BertEncoder(nn.Module): class BertPooler(nn.Module): + def __init__(self, config): super(BertPooler, self).__init__() - self.dense_act = LinearActivation(config.hidden_size, - config.hidden_size, - act="tanh") + self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act="tanh") def forward(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding @@ -722,11 +679,10 @@ class BertPooler(nn.Module): class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): super(BertPredictionHeadTransform, self).__init__() - self.dense_act = LinearActivation(config.hidden_size, - config.hidden_size, - act=config.hidden_act) + self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act=config.hidden_act) self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) def forward(self, hidden_states): @@ -736,6 +692,7 @@ class BertPredictionHeadTransform(nn.Module): class BertLMPredictionHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): super(BertLMPredictionHead, self).__init__() self.transform = BertPredictionHeadTransform(config) @@ -750,16 +707,15 @@ class BertLMPredictionHead(nn.Module): def forward(self, hidden_states): hidden_states = self.transform(hidden_states) - get_accelerator().range_push( - "decoder input.size() = {}, weight.size() = {}".format( - hidden_states.size(), - self.decoder.weight.size())) + get_accelerator().range_push("decoder input.size() = {}, weight.size() = {}".format( + hidden_states.size(), self.decoder.weight.size())) hidden_states = self.decoder(hidden_states) + self.bias get_accelerator().range_pop() return hidden_states class BertOnlyMLMHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): super(BertOnlyMLMHead, self).__init__() self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) @@ -770,6 +726,7 @@ class BertOnlyMLMHead(nn.Module): class BertOnlyNSPHead(nn.Module): + def __init__(self, config): super(BertOnlyNSPHead, self).__init__() self.seq_relationship = nn.Linear(config.hidden_size, 2) @@ -780,6 +737,7 @@ class BertOnlyNSPHead(nn.Module): class BertPreTrainingHeads(nn.Module): + def __init__(self, config, bert_model_embedding_weights): super(BertPreTrainingHeads, self).__init__() self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) @@ -795,15 +753,14 @@ class BertPreTrainedModel(nn.Module): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ + def __init__(self, config, *inputs, **kwargs): super(BertPreTrainedModel, self).__init__() if not isinstance(config, BertConfig): - raise ValueError( - "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " - "To create a model from a Google pretrained model use " - "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( - self.__class__.__name__, - self.__class__.__name__)) + raise ValueError("Parameter config in `{}(config)` should be an instance of class `BertConfig`. 
" + "To create a model from a Google pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__)) self.config = config def init_bert_weights(self, module): @@ -860,9 +817,8 @@ class BertPreTrainedModel(nn.Module): if resolved_archive_file == archive_file: # noqa: F821 logger.info("loading archive file {}".format(archive_file)) else: - logger.info("loading archive file {} from cache at {}".format( - archive_file, - resolved_archive_file)) # noqa: F821 + logger.info("loading archive file {} from cache at {}".format(archive_file, + resolved_archive_file)) # noqa: F821 tempdir = None if os.path.isdir(resolved_archive_file) or from_tf: # noqa: F821 serialization_dir = resolved_archive_file # noqa: F821 @@ -883,9 +839,7 @@ class BertPreTrainedModel(nn.Module): model = cls(config, *inputs, **kwargs) if state_dict is None and not from_tf: weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) - state_dict = torch.load( - weights_path, - map_location='cpu' if not get_accelerator().is_available() else None) + state_dict = torch.load(weights_path, map_location='cpu' if not get_accelerator().is_available() else None) if tempdir: # Clean up temp dir shutil.rmtree(tempdir) @@ -919,34 +873,25 @@ class BertPreTrainedModel(nn.Module): def load(module, prefix=''): local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) - module._load_from_state_dict(state_dict, - prefix, - local_metadata, - True, - missing_keys, - unexpected_keys, + module._load_from_state_dict(state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) for name, child in module._modules.items(): if child is not None: load(child, prefix + name + '.') start_prefix = '' - if not hasattr(model, - 'bert') and any(s.startswith('bert.') for s in state_dict.keys()): + if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()): start_prefix = 'bert.' load(model, prefix=start_prefix) if len(missing_keys) > 0: logger.info("Weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, - missing_keys)) + model.__class__.__name__, missing_keys)) if len(unexpected_keys) > 0: - logger.info("Weights from pretrained model not used in {}: {}".format( - model.__class__.__name__, - unexpected_keys)) + logger.info("Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, + unexpected_keys)) if len(error_msgs) > 0: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( - model.__class__.__name__, - "\n\t".join(error_msgs))) + model.__class__.__name__, "\n\t".join(error_msgs))) return model @@ -994,6 +939,7 @@ class BertModel(BertPreTrainedModel): all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config): super(BertModel, self).__init__(config) self.embeddings = BertEmbeddings(config) @@ -1024,16 +970,14 @@ class BertModel(BertPreTrainedModel): # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. 
- extended_attention_mask = extended_attention_mask.to(dtype=next( - self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 embedding_output = self.embeddings(input_ids, token_type_ids) - encoded_layers = self.encoder( - embedding_output, - extended_attention_mask, - output_all_encoded_layers=output_all_encoded_layers, - checkpoint_activations=checkpoint_activations) + encoded_layers = self.encoder(embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers, + checkpoint_activations=checkpoint_activations) sequence_output = encoded_layers[-1] pooled_output = self.pooler(sequence_output) if not output_all_encoded_layers: @@ -1091,6 +1035,7 @@ class BertForPreTraining(BertPreTrainedModel): masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config, args): super(BertForPreTraining, self).__init__(config) self.summary_writer = None @@ -1099,17 +1044,14 @@ class BertForPreTraining(BertPreTrainedModel): self.samples_per_step = dist.get_world_size() * args.train_batch_size self.sample_count = self.samples_per_step self.bert = BertModel(config) - self.cls = BertPreTrainingHeads(config, - self.bert.embeddings.word_embeddings.weight) + self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) self.apply(self.init_bert_weights) def log_summary_writer(self, logs: dict, base='Train'): if dist.get_rank() == 0: module_name = "Samples" #self._batch_module_name.get(batch_type, self._get_batch_type_error(batch_type)) for key, log in logs.items(): - self.summary_writer.add_scalar(f'{base}/{module_name}/{key}', - log, - self.sample_count) + self.summary_writer.add_scalar(f'{base}/{module_name}/{key}', log, self.sample_count) self.sample_count += self.samples_per_step def forward(self, batch, log=True): @@ -1121,18 +1063,17 @@ class BertForPreTraining(BertPreTrainedModel): next_sentence_label = batch[4] checkpoint_activations = False - sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False, checkpoint_activations=checkpoint_activations) + sequence_output, pooled_output = self.bert(input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) if masked_lm_labels is not None and next_sentence_label is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct(prediction_scores.view(-1, - self.config.vocab_size), - masked_lm_labels.view(-1)) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, - 2), - next_sentence_label.view(-1)) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) #print("loss is {} {}".format(masked_lm_loss, next_sentence_loss)) total_loss = masked_lm_loss + next_sentence_loss # if log: @@ -1184,6 +1125,7 @@ class BertForMaskedLM(BertPreTrainedModel): masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config): super(BertForMaskedLM, self).__init__(config) self.bert = BertModel(config) @@ -1196,15 +1138,12 @@ class 
BertForMaskedLM(BertPreTrainedModel): attention_mask=None, masked_lm_labels=None, checkpoint_activations=False): - sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False) + sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) prediction_scores = self.cls(sequence_output) if masked_lm_labels is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct(prediction_scores.view(-1, - self.config.vocab_size), - masked_lm_labels.view(-1)) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) return masked_lm_loss else: return prediction_scores @@ -1253,6 +1192,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel): seq_relationship_logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config): super(BertForNextSentencePrediction, self).__init__(config) self.bert = BertModel(config) @@ -1265,15 +1205,12 @@ class BertForNextSentencePrediction(BertPreTrainedModel): attention_mask=None, next_sentence_label=None, checkpoint_activations=False): - _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False) + _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) seq_relationship_score = self.cls(pooled_output) if next_sentence_label is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, - 2), - next_sentence_label.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) return next_sentence_loss else: return seq_relationship_score @@ -1324,6 +1261,7 @@ class BertForSequenceClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config, num_labels): super(BertForSequenceClassification, self).__init__(config) self.num_labels = num_labels @@ -1332,12 +1270,7 @@ class BertForSequenceClassification(BertPreTrainedModel): self.classifier = nn.Linear(config.hidden_size, num_labels) self.apply(self.init_bert_weights) - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - labels=None, - checkpoint_activations=False): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False): _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1394,6 +1327,7 @@ class BertForMultipleChoice(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config, num_choices): super(BertForMultipleChoice, self).__init__(config) self.num_choices = num_choices @@ -1402,16 +1336,14 @@ class BertForMultipleChoice(BertPreTrainedModel): self.classifier = nn.Linear(config.hidden_size, 1) self.apply(self.init_bert_weights) - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - labels=None, - checkpoint_activations=False): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False): flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) - _, pooled_output = 
self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False) + _, pooled_output = self.bert(flat_input_ids, + flat_token_type_ids, + flat_attention_mask, + output_all_encoded_layers=False) pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, self.num_choices) @@ -1469,6 +1401,7 @@ class BertForTokenClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config, num_labels): super(BertForTokenClassification, self).__init__(config) self.num_labels = num_labels @@ -1477,12 +1410,7 @@ class BertForTokenClassification(BertPreTrainedModel): self.classifier = nn.Linear(config.hidden_size, num_labels) self.apply(self.init_bert_weights) - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - labels=None, - checkpoint_activations=False): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False): sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1549,6 +1477,7 @@ class BertForQuestionAnswering(BertPreTrainedModel): start_logits, end_logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config): super(BertForQuestionAnswering, self).__init__(config) self.bert = BertModel(config) diff --git a/tests/unit/modelingpreln.py b/tests/unit/modelingpreln.py index 0069add9a..2d0f512c8 100644 --- a/tests/unit/modelingpreln.py +++ b/tests/unit/modelingpreln.py @@ -48,20 +48,15 @@ from deepspeed.accelerator import get_accelerator logger = logging.getLogger(__name__) PRETRAINED_MODEL_ARCHIVE_MAP = { - 'bert-base-uncased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", - 'bert-large-uncased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", - 'bert-base-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", - 'bert-large-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz", 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", - 'bert-base-chinese': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", } CONFIG_NAME = 'bert_config.json' WEIGHTS_NAME = 'pytorch_model.bin' @@ -76,9 +71,8 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): import numpy as np import tensorflow as tf except ImportError: - print( - "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. 
Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") raise tf_path = os.path.abspath(tf_checkpoint_path) print("Converting TensorFlow checkpoint from {}".format(tf_path)) @@ -183,6 +177,7 @@ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} class GPUTimer: + def __init__(self): super().__init__() self.start = get_accelerator().Event() # noqa: F821 @@ -202,13 +197,7 @@ class LinearActivation(Module): """ __constants__ = ['bias'] - def __init__(self, - in_features, - out_features, - weights, - biases, - act='gelu', - bias=True): + def __init__(self, in_features, out_features, weights, biases, act='gelu', bias=True): super(LinearActivation, self).__init__() self.in_features = in_features self.out_features = out_features @@ -256,15 +245,14 @@ class LinearActivation(Module): return self.act_fn(F.linear(input, self.weight, self.bias)) def extra_repr(self): - return 'in_features={}, out_features={}, bias={}'.format( - self.in_features, - self.out_features, - self.bias is not None) + return 'in_features={}, out_features={}, bias={}'.format(self.in_features, self.out_features, self.bias + is not None) class BertConfig(object): """Configuration class to store the configuration of a `BertModel`. """ + def __init__(self, vocab_size_or_config_json_file, hidden_size=768, @@ -361,11 +349,10 @@ try: #apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward') BertLayerNorm = apex.normalization.FusedLayerNorm except ImportError: - print( - "Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex." - ) + print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.") class BertLayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-12): """Construct a layernorm module in the TF style (epsilon inside the square root). """ @@ -392,13 +379,12 @@ except ImportError: class BertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings. 
""" + def __init__(self, config): super(BertEmbeddings, self).__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file @@ -407,9 +393,7 @@ class BertEmbeddings(nn.Module): def forward(self, input_ids, token_type_ids=None): seq_length = input_ids.size(1) - position_ids = torch.arange(seq_length, - dtype=torch.long, - device=input_ids.device) + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) if token_type_ids is None: token_type_ids = torch.zeros_like(input_ids) @@ -425,13 +409,12 @@ class BertEmbeddings(nn.Module): class BertSelfAttention(nn.Module): + def __init__(self, i, config, weights, biases): super(BertSelfAttention, self).__init__() if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, - config.num_attention_heads)) + raise ValueError("The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size @@ -459,14 +442,12 @@ class BertSelfAttention(nn.Module): #self.softmax = DeepSpeedSoftmax(i, self.softmax_config) def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) def transpose_key_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 3, 1) @@ -559,6 +540,7 @@ class BertSelfAttention(nn.Module): class BertSelfOutput(nn.Module): + def __init__(self, config, weights, biases): super(BertSelfOutput, self).__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -586,6 +568,7 @@ class BertSelfOutput(nn.Module): class BertAttention(nn.Module): + def __init__(self, i, config, weights, biases): super(BertAttention, self).__init__() self.self = BertSelfAttention(i, config, weights, biases) @@ -601,6 +584,7 @@ class BertAttention(nn.Module): class BertIntermediate(nn.Module): + def __init__(self, config, weights, biases): super(BertIntermediate, self).__init__() self.dense_act = LinearActivation(config.hidden_size, @@ -615,6 +599,7 @@ class BertIntermediate(nn.Module): class BertOutput(nn.Module): + def __init__(self, config, weights, biases): super(BertOutput, self).__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) @@ -641,6 +626,7 @@ class BertOutput(nn.Module): class BertLayer(nn.Module): + def __init__(self, i, config, weights, 
biases): super(BertLayer, self).__init__() self.attention = BertAttention(i, config, weights, biases) @@ -674,26 +660,14 @@ class BertLayer(nn.Module): self.biases[2].register_hook(lambda x, self=self: grads.append([x, "V_B"])) self.weight[3].register_hook(lambda x, self=self: grads.append([x, "O_W"])) self.biases[3].register_hook(lambda x, self=self: grads.append([x, "O_B"])) - self.PostAttentionLayerNorm.weight.register_hook( - lambda x, - self=self: grads.append([x, - "N2_W"])) - self.PostAttentionLayerNorm.bias.register_hook( - lambda x, - self=self: grads.append([x, - "N2_B"])) + self.PostAttentionLayerNorm.weight.register_hook(lambda x, self=self: grads.append([x, "N2_W"])) + self.PostAttentionLayerNorm.bias.register_hook(lambda x, self=self: grads.append([x, "N2_B"])) self.weight[5].register_hook(lambda x, self=self: grads.append([x, "int_W"])) self.biases[5].register_hook(lambda x, self=self: grads.append([x, "int_B"])) self.weight[6].register_hook(lambda x, self=self: grads.append([x, "out_W"])) self.biases[6].register_hook(lambda x, self=self: grads.append([x, "out_B"])) - self.PreAttentionLayerNorm.weight.register_hook( - lambda x, - self=self: grads.append([x, - "norm_W"])) - self.PreAttentionLayerNorm.bias.register_hook( - lambda x, - self=self: grads.append([x, - "norm_B"])) + self.PreAttentionLayerNorm.weight.register_hook(lambda x, self=self: grads.append([x, "norm_W"])) + self.PreAttentionLayerNorm.bias.register_hook(lambda x, self=self: grads.append([x, "norm_B"])) return layer_output + intermediate_input @@ -702,17 +676,14 @@ class BertLayer(nn.Module): class BertEncoder(nn.Module): + def __init__(self, config, weights, biases): super(BertEncoder, self).__init__() #layer = BertLayer(config, weights, biases) self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.layer = nn.ModuleList([ - copy.deepcopy(BertLayer(i, - config, - weights, - biases)) for i in range(config.num_hidden_layers) - ]) + self.layer = nn.ModuleList( + [copy.deepcopy(BertLayer(i, config, weights, biases)) for i in range(config.num_hidden_layers)]) self.grads = [] self.graph = [] @@ -734,14 +705,11 @@ class BertEncoder(nn.Module): self.graph.append(mdl) self.get_modules(self, mdl, input) - def forward(self, - hidden_states, - attention_mask, - output_all_encoded_layers=True, - checkpoint_activations=False): + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, checkpoint_activations=False): all_encoder_layers = [] def custom(start, end): + def custom_forward(*inputs): layers = self.layer[start:end] x_ = inputs[0] @@ -756,23 +724,13 @@ class BertEncoder(nn.Module): num_layers = len(self.layer) chunk_length = math.ceil(math.sqrt(num_layers)) while l < num_layers: - hidden_states = checkpoint.checkpoint(custom(l, - l + chunk_length), - hidden_states, - attention_mask * 1) + hidden_states = checkpoint.checkpoint(custom(l, l + chunk_length), hidden_states, attention_mask * 1) l += chunk_length # decoder layers else: for i, layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, - attention_mask, - self.grads, - collect_all_grads=True) - hidden_states.register_hook( - lambda x, - i=i, - self=self: self.grads.append([x, - "hidden_state"])) + hidden_states = layer_module(hidden_states, attention_mask, self.grads, collect_all_grads=True) + hidden_states.register_hook(lambda x, i=i, self=self: self.grads.append([x, "hidden_state"])) #print("pytorch weight is: ", layer_module.get_w()) if output_all_encoded_layers: @@ -802,11 +760,10 @@ 
class BertEncoder(nn.Module): class BertPooler(nn.Module): + def __init__(self, config): super(BertPooler, self).__init__() - self.dense_act = LinearActivation(config.hidden_size, - config.hidden_size, - act="tanh") + self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act="tanh") def forward(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding @@ -817,11 +774,10 @@ class BertPooler(nn.Module): class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): super(BertPredictionHeadTransform, self).__init__() - self.dense_act = LinearActivation(config.hidden_size, - config.hidden_size, - act=config.hidden_act) + self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act=config.hidden_act) self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) def forward(self, hidden_states): @@ -831,6 +787,7 @@ class BertPredictionHeadTransform(nn.Module): class BertLMPredictionHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): super(BertLMPredictionHead, self).__init__() self.transform = BertPredictionHeadTransform(config) @@ -845,16 +802,15 @@ class BertLMPredictionHead(nn.Module): def forward(self, hidden_states): hidden_states = self.transform(hidden_states) - get_accelerator().range_push( - "decoder input.size() = {}, weight.size() = {}".format( - hidden_states.size(), - self.decoder.weight.size())) + get_accelerator().range_push("decoder input.size() = {}, weight.size() = {}".format( + hidden_states.size(), self.decoder.weight.size())) hidden_states = self.decoder(hidden_states) + self.bias get_accelerator().range_pop() return hidden_states class BertOnlyMLMHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): super(BertOnlyMLMHead, self).__init__() self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) @@ -865,6 +821,7 @@ class BertOnlyMLMHead(nn.Module): class BertOnlyNSPHead(nn.Module): + def __init__(self, config): super(BertOnlyNSPHead, self).__init__() self.seq_relationship = nn.Linear(config.hidden_size, 2) @@ -875,6 +832,7 @@ class BertOnlyNSPHead(nn.Module): class BertPreTrainingHeads(nn.Module): + def __init__(self, config, bert_model_embedding_weights): super(BertPreTrainingHeads, self).__init__() self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) @@ -890,15 +848,14 @@ class BertPreTrainedModel(nn.Module): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ + def __init__(self, config, *inputs, **kwargs): super(BertPreTrainedModel, self).__init__() if not isinstance(config, BertConfig): - raise ValueError( - "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " - "To create a model from a Google pretrained model use " - "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( - self.__class__.__name__, - self.__class__.__name__)) + raise ValueError("Parameter config in `{}(config)` should be an instance of class `BertConfig`. 
" + "To create a model from a Google pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__)) self.config = config def init_bert_weights(self, module): @@ -955,9 +912,8 @@ class BertPreTrainedModel(nn.Module): if resolved_archive_file == archive_file: # noqa: F821 logger.info("loading archive file {}".format(archive_file)) else: - logger.info("loading archive file {} from cache at {}".format( - archive_file, - resolved_archive_file)) # noqa: F821 + logger.info("loading archive file {} from cache at {}".format(archive_file, + resolved_archive_file)) # noqa: F821 tempdir = None if os.path.isdir(resolved_archive_file) or from_tf: # noqa: F821 serialization_dir = resolved_archive_file # noqa: F821 @@ -978,9 +934,7 @@ class BertPreTrainedModel(nn.Module): model = cls(config, *inputs, **kwargs) if state_dict is None and not from_tf: weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) - state_dict = torch.load( - weights_path, - map_location='cpu' if not get_accelerator().is_available() else None) + state_dict = torch.load(weights_path, map_location='cpu' if not get_accelerator().is_available() else None) if tempdir: # Clean up temp dir shutil.rmtree(tempdir) @@ -1014,34 +968,25 @@ class BertPreTrainedModel(nn.Module): def load(module, prefix=''): local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) - module._load_from_state_dict(state_dict, - prefix, - local_metadata, - True, - missing_keys, - unexpected_keys, + module._load_from_state_dict(state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) for name, child in module._modules.items(): if child is not None: load(child, prefix + name + '.') start_prefix = '' - if not hasattr(model, - 'bert') and any(s.startswith('bert.') for s in state_dict.keys()): + if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()): start_prefix = 'bert.' load(model, prefix=start_prefix) if len(missing_keys) > 0: logger.info("Weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, - missing_keys)) + model.__class__.__name__, missing_keys)) if len(unexpected_keys) > 0: - logger.info("Weights from pretrained model not used in {}: {}".format( - model.__class__.__name__, - unexpected_keys)) + logger.info("Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, + unexpected_keys)) if len(error_msgs) > 0: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( - model.__class__.__name__, - "\n\t".join(error_msgs))) + model.__class__.__name__, "\n\t".join(error_msgs))) return model @@ -1089,6 +1034,7 @@ class BertModel(BertPreTrainedModel): all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config): super(BertModel, self).__init__(config) self.embeddings = BertEmbeddings(config) @@ -1119,16 +1065,14 @@ class BertModel(BertPreTrainedModel): # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. 
- extended_attention_mask = extended_attention_mask.to(dtype=next( - self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 embedding_output = self.embeddings(input_ids, token_type_ids) - encoded_layers = self.encoder( - embedding_output, - extended_attention_mask, - output_all_encoded_layers=output_all_encoded_layers, - checkpoint_activations=checkpoint_activations) + encoded_layers = self.encoder(embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers, + checkpoint_activations=checkpoint_activations) sequence_output = encoded_layers[-1] pooled_output = self.pooler(sequence_output) if not output_all_encoded_layers: @@ -1186,6 +1130,7 @@ class BertForPreTraining(BertPreTrainedModel): masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config, args): super(BertForPreTraining, self).__init__(config) self.summary_writer = None @@ -1194,17 +1139,14 @@ class BertForPreTraining(BertPreTrainedModel): self.samples_per_step = dist.get_world_size() * args.train_batch_size self.sample_count = self.samples_per_step self.bert = BertModel(config) - self.cls = BertPreTrainingHeads(config, - self.bert.embeddings.word_embeddings.weight) + self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) self.apply(self.init_bert_weights) def log_summary_writer(self, logs: dict, base='Train'): if dist.get_rank() == 0: module_name = "Samples" #self._batch_module_name.get(batch_type, self._get_batch_type_error(batch_type)) for key, log in logs.items(): - self.summary_writer.add_scalar(f'{base}/{module_name}/{key}', - log, - self.sample_count) + self.summary_writer.add_scalar(f'{base}/{module_name}/{key}', log, self.sample_count) self.sample_count += self.samples_per_step def forward(self, batch, log=True): @@ -1216,18 +1158,17 @@ class BertForPreTraining(BertPreTrainedModel): next_sentence_label = batch[4] checkpoint_activations = False - sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False, checkpoint_activations=checkpoint_activations) + sequence_output, pooled_output = self.bert(input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) if masked_lm_labels is not None and next_sentence_label is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct(prediction_scores.view(-1, - self.config.vocab_size), - masked_lm_labels.view(-1)) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, - 2), - next_sentence_label.view(-1)) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) #print("loss is {} {}".format(masked_lm_loss, next_sentence_loss)) total_loss = masked_lm_loss + next_sentence_loss # if log: @@ -1279,6 +1220,7 @@ class BertForMaskedLM(BertPreTrainedModel): masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config): super(BertForMaskedLM, self).__init__(config) self.bert = BertModel(config) @@ -1291,15 +1233,12 @@ class 
BertForMaskedLM(BertPreTrainedModel): attention_mask=None, masked_lm_labels=None, checkpoint_activations=False): - sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False) + sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) prediction_scores = self.cls(sequence_output) if masked_lm_labels is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct(prediction_scores.view(-1, - self.config.vocab_size), - masked_lm_labels.view(-1)) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) return masked_lm_loss else: return prediction_scores @@ -1348,6 +1287,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel): seq_relationship_logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config): super(BertForNextSentencePrediction, self).__init__(config) self.bert = BertModel(config) @@ -1360,15 +1300,12 @@ class BertForNextSentencePrediction(BertPreTrainedModel): attention_mask=None, next_sentence_label=None, checkpoint_activations=False): - _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False) + _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) seq_relationship_score = self.cls(pooled_output) if next_sentence_label is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, - 2), - next_sentence_label.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) return next_sentence_loss else: return seq_relationship_score @@ -1419,6 +1356,7 @@ class BertForSequenceClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config, num_labels): super(BertForSequenceClassification, self).__init__(config) self.num_labels = num_labels @@ -1427,12 +1365,7 @@ class BertForSequenceClassification(BertPreTrainedModel): self.classifier = nn.Linear(config.hidden_size, num_labels) self.apply(self.init_bert_weights) - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - labels=None, - checkpoint_activations=False): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False): _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1489,6 +1422,7 @@ class BertForMultipleChoice(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config, num_choices): super(BertForMultipleChoice, self).__init__(config) self.num_choices = num_choices @@ -1497,16 +1431,14 @@ class BertForMultipleChoice(BertPreTrainedModel): self.classifier = nn.Linear(config.hidden_size, 1) self.apply(self.init_bert_weights) - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - labels=None, - checkpoint_activations=False): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False): flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) - _, pooled_output = 
self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False) + _, pooled_output = self.bert(flat_input_ids, + flat_token_type_ids, + flat_attention_mask, + output_all_encoded_layers=False) pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, self.num_choices) @@ -1564,6 +1496,7 @@ class BertForTokenClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config, num_labels): super(BertForTokenClassification, self).__init__(config) self.num_labels = num_labels @@ -1572,12 +1505,7 @@ class BertForTokenClassification(BertPreTrainedModel): self.classifier = nn.Linear(config.hidden_size, num_labels) self.apply(self.init_bert_weights) - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - labels=None, - checkpoint_activations=False): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False): sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1644,6 +1572,7 @@ class BertForQuestionAnswering(BertPreTrainedModel): start_logits, end_logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config): super(BertForQuestionAnswering, self).__init__(config) self.bert = BertModel(config) diff --git a/tests/unit/moe/test_moe.py b/tests/unit/moe/test_moe.py index fe5359249..e7746b3bc 100644 --- a/tests/unit/moe/test_moe.py +++ b/tests/unit/moe/test_moe.py @@ -17,13 +17,7 @@ class TestMoE(DistributedTest): if not required_torch_version(): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - config_dict = { - "train_batch_size": 8, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } + config_dict = {"train_batch_size": 8, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 16 # E+D -- ep_size = 2 @@ -36,10 +30,7 @@ class TestMoE(DistributedTest): dist_init_required=False) #dist_init_required=False -- parameterize to True/False? 
- data_loader = sequence_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + data_loader = sequence_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -55,13 +46,7 @@ class TestPRMoE(DistributedTest): if not required_torch_version(): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - config_dict = { - "train_batch_size": 8, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } + config_dict = {"train_batch_size": 8, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 16 # E+D -- ep_size = 2 @@ -73,10 +58,7 @@ class TestPRMoE(DistributedTest): optimizer=optimizer, dist_init_required=False) - data_loader = sequence_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + data_loader = sequence_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) diff --git a/tests/unit/moe/test_moe_tp.py b/tests/unit/moe/test_moe_tp.py index ba63a102a..f758ecc76 100644 --- a/tests/unit/moe/test_moe_tp.py +++ b/tests/unit/moe/test_moe_tp.py @@ -9,6 +9,7 @@ from deepspeed.moe.layer import MoE class MPU(): + def __init__(self, tp_world_size): self.rank = deepspeed.comm.get_rank() self.world_size = deepspeed.comm.get_world_size() @@ -57,21 +58,12 @@ class TestMOETensorParallel(DistributedTest): if not required_torch_version(): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - config_dict = { - "train_batch_size": 8, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } + config_dict = {"train_batch_size": 8, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 16 - tensor_parallel_expert = torch.nn.Sequential( - torch.nn.Linear(hidden_dim, - 4 * hidden_dim // tp_size), - torch.nn.ReLU(), - torch.nn.Linear(4 * hidden_dim // tp_size, - hidden_dim)) + tensor_parallel_expert = torch.nn.Sequential(torch.nn.Linear(hidden_dim, 4 * hidden_dim // tp_size), + torch.nn.ReLU(), + torch.nn.Linear(4 * hidden_dim // tp_size, hidden_dim)) # set num experts to world size world_size = deepspeed.comm.get_world_size() @@ -92,7 +84,6 @@ class TestMOETensorParallel(DistributedTest): assert model.num_local_experts == world_size // ep_size if enable_expert_tp: - assert deepspeed.utils.groups._get_expert_model_parallel_world_size( - ) == tp_size + assert deepspeed.utils.groups._get_expert_model_parallel_world_size() == tp_size else: assert deepspeed.utils.groups._get_expert_model_parallel_world_size() == 1 diff --git a/tests/unit/multi_output_model.py b/tests/unit/multi_output_model.py index 8993813aa..1b4d4c1bb 100644 --- a/tests/unit/multi_output_model.py +++ b/tests/unit/multi_output_model.py @@ -4,6 +4,7 @@ import torch class MultiOutputModel(torch.nn.Module): + def __init__(self, hidden_dim, weight_value): super(MultiOutputModel, self).__init__() self.linear = torch.nn.Linear(hidden_dim, hidden_dim, bias=False) @@ -24,19 +25,11 @@ def multi_output_dataloader(model, total_samples, hidden_dim, device, inputs, ta batch_size = model.train_micro_batch_size_per_gpu() train_data = [ - torch.full(size=(total_samples, - hidden_dim), - fill_value=x, - device=device, - dtype=torch.half, - requires_grad=True) for x in inputs + torch.full(size=(total_samples, hidden_dim), fill_value=x, device=device, dtype=torch.half, requires_grad=True) + for x in inputs ] - 
train_label = [ - torch.empty(total_samples, - device=device, - dtype=torch.long).fill_(y) for y in targets - ] + train_label = [torch.empty(total_samples, device=device, dtype=torch.long).fill_(y) for y in targets] train_dataset = torch.utils.data.TensorDataset(*train_data, *train_label) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size) diff --git a/tests/unit/ops/accelerators/test_accelerator_backward.py b/tests/unit/ops/accelerators/test_accelerator_backward.py index ad26daeb6..9b67749c5 100644 --- a/tests/unit/ops/accelerators/test_accelerator_backward.py +++ b/tests/unit/ops/accelerators/test_accelerator_backward.py @@ -1,6 +1,5 @@ '''Copyright The Microsoft DeepSpeed Team''' -import math import numpy as np import torch import pytest @@ -91,26 +90,21 @@ kwargs_fp16 = {'dtype': torch.half, 'device': device, 'requires_grad': True} class DSEncoder(nn.Module): + def __init__(self, config, weights, biases): super(DSEncoder, self).__init__() self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) self.layer = nn.ModuleList([ - copy.deepcopy(DeepSpeedTransformerLayer(config, - weights, - biases)) - for _ in range(config.num_hidden_layers) + copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) for _ in range(config.num_hidden_layers) ]) self.grads = [] self.pre_or_post = config.pre_layer_norm - def forward(self, - hidden_states, - attention_mask, - output_all_encoded_layers=True, - checkpoint_activations=False): + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, checkpoint_activations=False): all_encoder_layers = [] def custom(start, end): + def custom_forward(*inputs): layers = self.layer[start:end] x_ = inputs[0] @@ -121,25 +115,23 @@ class DSEncoder(nn.Module): return custom_forward if checkpoint_activations: - l = 0 - num_layers = len(self.layer) - chunk_length = math.ceil(math.sqrt(num_layers)) - while l < num_layers: - hidden_states = checkpoint.checkpoint(custom(l, # noqa: F821 - l + chunk_length), - hidden_states, - attention_mask * 1) - l += chunk_length + raise NotImplementedError("`checkpoint` is not defined below") + #l = 0 + #num_layers = len(self.layer) + #chunk_length = math.ceil(math.sqrt(num_layers)) + #while l < num_layers: + # hidden_states = checkpoint.checkpoint( + # custom( + # l, # noqa: F821 + # l + chunk_length), + # hidden_states, + # attention_mask * 1) + # l += chunk_length # decoder layers else: for i, layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, - attention_mask, - grads=self.grads) - hidden_states.register_hook( - lambda x, - self=self: self.grads.append([x, - "hidden_state"])) + hidden_states = layer_module(hidden_states, attention_mask, grads=self.grads) + hidden_states.register_hook(lambda x, self=self: self.grads.append([x, "hidden_state"])) if output_all_encoded_layers: all_encoder_layers.append(hidden_states) @@ -171,20 +163,14 @@ def create_models(ds_config): biases = [] for i in range(4): - weights.append( - nn.Parameter(torch.Tensor(ds_config.hidden_size, - ds_config.hidden_size))) + weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size, ds_config.hidden_size))) weights[i].data.normal_(mean=0.0, std=ds_config.initializer_range) weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) weights[4].data.fill_(1.0) - weights.append( - nn.Parameter(torch.Tensor(ds_config.intermediate_size, - ds_config.hidden_size))) + weights.append(nn.Parameter(torch.Tensor(ds_config.intermediate_size, ds_config.hidden_size))) 
weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range) - weights.append( - nn.Parameter(torch.Tensor(ds_config.hidden_size, - ds_config.intermediate_size))) + weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size, ds_config.intermediate_size))) weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range) weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) weights[7].data.fill_(1.0) @@ -229,10 +215,7 @@ def run_backward(ds_config, seq_len, atol=1e-2, verbose=False): # prepare test data kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 - hidden_states = torch.randn(ds_config.batch_size, - seq_len, - ds_config.hidden_size, - **kwargs) + hidden_states = torch.randn(ds_config.batch_size, seq_len, ds_config.hidden_size, **kwargs) input_mask = torch.randn(ds_config.batch_size, 1, 1, seq_len, **kwargs) Y = torch.randn(ds_config.batch_size, seq_len, ds_config.hidden_size, **kwargs) @@ -247,10 +230,7 @@ def run_backward(ds_config, seq_len, atol=1e-2, verbose=False): base_grads = bert_encoder.get_grads() # run ds - ds_results = ds_encoder(hidden_states, - input_mask, - output_all_encoded_layers=False, - checkpoint_activations=False) + ds_results = ds_encoder(hidden_states, input_mask, output_all_encoded_layers=False, checkpoint_activations=False) loss = (Y - ds_results[0]).pow(2).sum() / 64 loss.backward() @@ -280,18 +260,9 @@ def run_backward(ds_config, seq_len, atol=1e-2, verbose=False): class TestCUDABackward(DistributedTest): world_size = 1 - def test_backward(self, - batch_size, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, - use_fp16, - atol): + def test_backward(self, batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol): # Only run fp16 test cases on devices with FP16 capability. 
- if not get_accelerator().is_fp16_supported() and (use_fp16 is True - or is_preln is False): + if not get_accelerator().is_fp16_supported() and (use_fp16 is True or is_preln is False): return ds_config = DeepSpeedTransformerConfig() diff --git a/tests/unit/ops/accelerators/test_accelerator_forward.py b/tests/unit/ops/accelerators/test_accelerator_forward.py index 317e2fe3c..42c79702e 100644 --- a/tests/unit/ops/accelerators/test_accelerator_forward.py +++ b/tests/unit/ops/accelerators/test_accelerator_forward.py @@ -1,6 +1,5 @@ '''Copyright The Microsoft DeepSpeed Team''' -import math import numpy as np import torch import pytest @@ -38,26 +37,21 @@ kwargs_fp16 = {'dtype': torch.half, 'device': device, 'requires_grad': True} class DSEncoder(nn.Module): + def __init__(self, config, weights, biases): super(DSEncoder, self).__init__() self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) self.layer = nn.ModuleList([ - copy.deepcopy(DeepSpeedTransformerLayer(config, - weights, - biases)) - for _ in range(config.num_hidden_layers) + copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) for _ in range(config.num_hidden_layers) ]) self.grads = [] self.pre_or_post = config.pre_layer_norm - def forward(self, - hidden_states, - attention_mask, - output_all_encoded_layers=True, - checkpoint_activations=False): + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, checkpoint_activations=False): all_encoder_layers = [] def custom(start, end): + def custom_forward(*inputs): layers = self.layer[start:end] x_ = inputs[0] @@ -68,15 +62,18 @@ class DSEncoder(nn.Module): return custom_forward if checkpoint_activations: - l = 0 - num_layers = len(self.layer) - chunk_length = math.ceil(math.sqrt(num_layers)) - while l < num_layers: - hidden_states = checkpoint.checkpoint(custom(l, # noqa: F821 - l + chunk_length), - hidden_states, - attention_mask * 1) - l += chunk_length + raise NotImplementedError("`checkpoint` below is not defined") + #l = 0 + #num_layers = len(self.layer) + #chunk_length = math.ceil(math.sqrt(num_layers)) + #while l < num_layers: + # hidden_states = checkpoint.checkpoint( + # custom( + # l, # noqa: F821 + # l + chunk_length), + # hidden_states, + # attention_mask * 1) + # l += chunk_length # decoder layers else: for i, layer_module in enumerate(self.layer): @@ -111,20 +108,14 @@ def create_models(ds_config): biases = [] for i in range(4): - weights.append( - nn.Parameter(torch.Tensor(ds_config.hidden_size, - ds_config.hidden_size))) + weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size, ds_config.hidden_size))) weights[i].data.normal_(mean=0.0, std=ds_config.initializer_range) weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) weights[4].data.fill_(1.0) - weights.append( - nn.Parameter(torch.Tensor(ds_config.intermediate_size, - ds_config.hidden_size))) + weights.append(nn.Parameter(torch.Tensor(ds_config.intermediate_size, ds_config.hidden_size))) weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range) - weights.append( - nn.Parameter(torch.Tensor(ds_config.hidden_size, - ds_config.intermediate_size))) + weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size, ds_config.intermediate_size))) weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range) weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) weights[7].data.fill_(1.0) @@ -181,10 +172,7 @@ def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): checkpoint_activations=False) # run ds - 
ds_results = ds_encoder(hidden_states, - input_mask, - output_all_encoded_layers=False, - checkpoint_activations=False) + ds_results = ds_encoder(hidden_states, input_mask, output_all_encoded_layers=False, checkpoint_activations=False) # check forward evaluation check_equal(base_results, ds_results, atol=atol, verbose=verbose) @@ -234,14 +222,7 @@ def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): class TestCUDAForward(DistributedTest): world_size = 1 - def test_forward(self, - batch_size, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, - use_fp16): + def test_forward(self, batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16): # Only run fp16 test cases on devices with FP16 capability. if not get_accelerator().is_fp16_supported() and use_fp16 is True: return @@ -272,14 +253,7 @@ class TestCUDAForward(DistributedTest): class TestCUDAForwardSmallBatchSize(DistributedTest): world_size = 1 - def test_forward_with_small_bsz(self, - batch_size, - small_bsz, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, + def test_forward_with_small_bsz(self, batch_size, small_bsz, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16): # Only run fp16 test cases on devices with FP16 capability. if not get_accelerator().is_fp16_supported() and use_fp16 is True: @@ -310,14 +284,7 @@ class TestCUDAForwardSmallBatchSize(DistributedTest): class TestCUDAForwardStochastic(DistributedTest): world_size = 1 - def test_forward_stochastic(self, - batch_size, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, - use_fp16): + def test_forward_stochastic(self, batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16): # Only run fp16 test cases on devices with FP16 capability. if not get_accelerator().is_fp16_supported() and use_fp16 is True: return diff --git a/tests/unit/ops/adagrad/test_cpu_adagrad.py b/tests/unit/ops/adagrad/test_cpu_adagrad.py index 17001e6bd..29b84238e 100644 --- a/tests/unit/ops/adagrad/test_cpu_adagrad.py +++ b/tests/unit/ops/adagrad/test_cpu_adagrad.py @@ -74,24 +74,13 @@ class TestCPUAdagrad(DistributedTest): rng_state = torch.get_rng_state() def gen_sparse_grad(vocabulary_size, dim, num_indices, dtype, device): - i = torch.randint(vocabulary_size, - size=(1, - num_indices), - dtype=torch.int64, - device=device) + i = torch.randint(vocabulary_size, size=(1, num_indices), dtype=torch.int64, device=device) v = torch.randn(num_indices, dim, dtype=dtype, device=device) t = torch.sparse_coo_tensor(i, v, (vocabulary_size, dim), device=device) t = t.coalesce() - new_i = (t.indices().view(-1, - 1).repeat(1, - dim) * dim + - torch.tensor(range(dim))).flatten().unsqueeze(0) + new_i = (t.indices().view(-1, 1).repeat(1, dim) * dim + torch.tensor(range(dim))).flatten().unsqueeze(0) new_v = t.values().flatten() - new_t = torch.sparse_coo_tensor(new_i, - new_v, - (vocabulary_size * dim, - ), - device=device) + new_t = torch.sparse_coo_tensor(new_i, new_v, (vocabulary_size * dim, ), device=device) new_t = new_t.coalesce() new_t.requires_grad = False return new_t @@ -101,17 +90,9 @@ class TestCPUAdagrad(DistributedTest): num_indices = int(model_size // dim) dtype = torch.float32 - param = torch.nn.Parameter(torch.randn((voc_size * dim, - ), - dtype=dtype, - device=device), - requires_grad=True) + param = torch.nn.Parameter(torch.randn((voc_size * dim, ), dtype=dtype, device=device), requires_grad=True) torch.set_rng_state(rng_state) - param1 = torch.nn.Parameter(torch.randn((voc_size * dim, - ), - dtype=dtype, - 
device=device), - requires_grad=True) + param1 = torch.nn.Parameter(torch.randn((voc_size * dim, ), dtype=dtype, device=device), requires_grad=True) torch.set_rng_state(rng_state) optimizer = DeepSpeedCPUAdagrad([param]) @@ -119,17 +100,9 @@ class TestCPUAdagrad(DistributedTest): for i in range(10): torch.set_rng_state(rng_state) - param.grad = gen_sparse_grad(voc_size, - dim, - num_indices, - dtype=dtype, - device=device) + param.grad = gen_sparse_grad(voc_size, dim, num_indices, dtype=dtype, device=device) torch.set_rng_state(rng_state) - param1.grad = gen_sparse_grad(voc_size, - dim, - num_indices, - dtype=dtype, - device=device) + param1.grad = gen_sparse_grad(voc_size, dim, num_indices, dtype=dtype, device=device) optimizer.step() optimizer1.step() @@ -137,6 +110,7 @@ class TestCPUAdagrad(DistributedTest): class TestCPUAdagradGPUError(DistributedTest): + def test_cpu_adagrad_gpu_error(self): model_size = 64 device = get_accelerator().device_name(0) # 'cuda:0' or 'xpu:0' diff --git a/tests/unit/ops/adam/test_cpu_adam.py b/tests/unit/ops/adam/test_cpu_adam.py index d10fb9810..ee0d61c6a 100644 --- a/tests/unit/ops/adam/test_cpu_adam.py +++ b/tests/unit/ops/adam/test_cpu_adam.py @@ -31,17 +31,13 @@ def check_equal(first, second, atol=1e-2, verbose=False): def _compare_optimizers(model_size, param1, optimizer1, param2, optimizer2): for i in range(10): param1.grad = torch.randn(model_size, device=param1.device).to(param1.dtype) - param2.grad = param1.grad.clone().detach().to(device=param2.device, - dtype=param2.dtype) + param2.grad = param1.grad.clone().detach().to(device=param2.device, dtype=param2.dtype) optimizer1.step() optimizer2.step() tolerance = param1.float().norm().detach().numpy() * 1e-2 - check_equal(param1.float().norm(), - param2.float().cpu().norm(), - atol=tolerance, - verbose=True) + check_equal(param1.float().norm(), param2.float().cpu().norm(), atol=tolerance, verbose=True) @pytest.mark.parametrize('dtype', [torch.half, torch.float], ids=["fp16", "fp32"]) @@ -61,8 +57,7 @@ class TestCPUAdam(DistributedTest): init_distributed = False set_dist_env = False - @pytest.mark.skipif(not get_accelerator().is_available(), - reason="only supported in CUDA environments.") + @pytest.mark.skipif(not get_accelerator().is_available(), reason="only supported in CUDA environments.") def test_fused_adam_equal(self, dtype, model_size): if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): pytest.skip("cpu-adam with half precision not supported on AMD CPUs") @@ -95,9 +90,7 @@ class TestCPUAdam(DistributedTest): ref_param_device = get_accelerator().device_name() else: if dtype == torch.half: - pytest.skip( - "torch.optim.AdamW with half precision only supported in CUDA environments." 
- ) + pytest.skip("torch.optim.AdamW with half precision only supported in CUDA environments.") ref_param_device = 'cpu' from deepspeed.ops.adam import DeepSpeedCPUAdam @@ -117,6 +110,7 @@ class TestCPUAdam(DistributedTest): class TestCPUAdamGPUError(DistributedTest): + def test_cpu_adam_gpu_error(self): model_size = 64 from deepspeed.ops.adam import DeepSpeedCPUAdam diff --git a/tests/unit/ops/aio/test_aio.py b/tests/unit/ops/aio/test_aio.py index 86265ab15..61e42b9ba 100644 --- a/tests/unit/ops/aio/test_aio.py +++ b/tests/unit/ops/aio/test_aio.py @@ -85,29 +85,15 @@ class TestRead(DistributedTest): init_distributed = False set_dist_env = False - def test_parallel_read(self, - tmpdir, - use_cuda_pinned_tensor, - single_submit, - overlap_events): - _skip_for_invalid_environment(use_cuda_device=False, - use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events): + _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor) - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) if use_cuda_pinned_tensor: - aio_buffer = get_accelerator().pin_memory( - torch.empty(IO_SIZE, - dtype=torch.uint8, - device='cpu')) + aio_buffer = get_accelerator().pin_memory(torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu')) else: - aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, - torch.empty(0, - dtype=torch.uint8)) + aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, torch.empty(0, dtype=torch.uint8)) _validate_handle_state(h, single_submit, overlap_events) @@ -123,35 +109,18 @@ class TestRead(DistributedTest): h.free_cpu_locked_tensor(aio_buffer) @pytest.mark.parametrize("cuda_device", [True, False]) - def test_async_read(self, - tmpdir, - use_cuda_pinned_tensor, - single_submit, - overlap_events, - cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, - use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device): + _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) use_cpu_locked_tensor = False - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) if cuda_device: - aio_buffer = torch.empty(IO_SIZE, - dtype=torch.uint8, - device=get_accelerator().device_name()) + aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) elif use_cuda_pinned_tensor: - aio_buffer = get_accelerator().pin_memory( - torch.empty(IO_SIZE, - dtype=torch.uint8, - device='cpu')) + aio_buffer = get_accelerator().pin_memory(torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu')) else: - aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, - torch.empty(0, - dtype=torch.uint8)) + aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, torch.empty(0, dtype=torch.uint8)) use_cpu_locked_tensor = True _validate_handle_state(h, single_submit, overlap_events) @@ -181,20 +150,11 @@ class TestWrite(DistributedTest): init_distributed = False set_dist_env = False - def test_parallel_write(self, - tmpdir, - use_cuda_pinned_tensor, - single_submit, - overlap_events): - 
_skip_for_invalid_environment(use_cuda_device=False, - use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events): + _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_file, ref_buffer = _do_ref_write(tmpdir) - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) if use_cuda_pinned_tensor: aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer) @@ -215,22 +175,12 @@ class TestWrite(DistributedTest): assert filecmp.cmp(ref_file, aio_file, shallow=False) @pytest.mark.parametrize("cuda_device", [True, False]) - def test_async_write(self, - tmpdir, - use_cuda_pinned_tensor, - single_submit, - overlap_events, - cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, - use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device): + _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_file, ref_buffer = _do_ref_write(tmpdir) - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) use_cpu_locked_tensor = False if cuda_device: aio_file, aio_buffer = _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer) @@ -269,8 +219,7 @@ class TestAsyncQueue(DistributedTest): @pytest.mark.parametrize("async_queue", [2, 3]) def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, - use_cuda_pinned_tensor=use_cuda_pinned_tensor) + _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_files = [] for i in range(async_queue): @@ -279,33 +228,22 @@ class TestAsyncQueue(DistributedTest): single_submit = True overlap_events = True - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) use_cpu_locked_tensor = False if cuda_device: aio_buffers = [ - torch.empty(IO_SIZE, - dtype=torch.uint8, - device=get_accelerator().device_name()) + torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) for _ in range(async_queue) ] elif use_cuda_pinned_tensor: aio_buffers = [ - get_accelerator().pin_memory( - torch.empty(IO_SIZE, - dtype=torch.uint8, - device='cpu')) for _ in range(async_queue) + get_accelerator().pin_memory(torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu')) + for _ in range(async_queue) ] else: tmp_tensor = torch.empty(0, dtype=torch.uint8) - aio_buffers = [ - h.new_cpu_locked_tensor(IO_SIZE, - tmp_tensor) for _ in range(async_queue) - ] + aio_buffers = [h.new_cpu_locked_tensor(IO_SIZE, tmp_tensor) for _ in range(async_queue)] use_cpu_locked_tensor = True _validate_handle_state(h, single_submit, overlap_events) @@ -328,8 +266,7 @@ class TestAsyncQueue(DistributedTest): @pytest.mark.parametrize("async_queue", [2, 3]) def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device): - 
_skip_for_invalid_environment(use_cuda_device=cuda_device, - use_cuda_pinned_tensor=use_cuda_pinned_tensor) + _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_files = [] ref_buffers = [] @@ -340,11 +277,7 @@ class TestAsyncQueue(DistributedTest): single_submit = True overlap_events = True - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) aio_files = [] aio_buffers = [] diff --git a/tests/unit/ops/quantizer/test_dequantize.py b/tests/unit/ops/quantizer/test_dequantize.py index 5dc2f7d68..37d4777e0 100644 --- a/tests/unit/ops/quantizer/test_dequantize.py +++ b/tests/unit/ops/quantizer/test_dequantize.py @@ -21,11 +21,8 @@ def run_quantize(data, num_groups, q_bits, is_symmetric_quant): if quantize_module is None: quantize_module = op_builder.QuantizerBuilder().load() - return quantize_module.quantize( - data, - num_groups, - q_bits, - quantize_module.Symmetric if is_symmetric_quant else quantize_module.Asymmetric) + return quantize_module.quantize(data, num_groups, q_bits, + quantize_module.Symmetric if is_symmetric_quant else quantize_module.Asymmetric) def run_dequantize(quantized_data, params, num_groups, q_bits, is_symmetric_quant): @@ -33,12 +30,8 @@ def run_dequantize(quantized_data, params, num_groups, q_bits, is_symmetric_quan if quantize_module is None: quantize_module = op_builder.QuantizerBuilder().load() - return quantize_module.dequantize( - quantized_data, - params, - num_groups, - q_bits, - quantize_module.Symmetric if is_symmetric_quant else quantize_module.Asymmetric) + return quantize_module.dequantize(quantized_data, params, num_groups, q_bits, + quantize_module.Symmetric if is_symmetric_quant else quantize_module.Asymmetric) def run_ref_dequantize(quantized_data, params, num_groups, q_bits, is_symmetric_quant): @@ -58,39 +51,15 @@ def run_ref_dequantize(quantized_data, params, num_groups, q_bits, is_symmetric_ @pytest.mark.inference_ops @pytest.mark.parametrize("num_groups", [1, 13, 512]) -@pytest.mark.parametrize("num_elems", - [8, - 16, - 32, - 64, - 128, - 256, - 4096, - 8192, - 12288, - 16384]) +@pytest.mark.parametrize("num_elems", [8, 16, 32, 64, 128, 256, 4096, 8192, 12288, 16384]) @pytest.mark.parametrize("is_symmetric_quant", [True, False]) @pytest.mark.parametrize("q_bits", [4, 8]) def test_dequantize(num_elems, num_groups, is_symmetric_quant, q_bits): - activations = torch.randn((num_groups, - num_elems), - dtype=torch.float16, - device=get_accelerator().device_name()) + activations = torch.randn((num_groups, num_elems), dtype=torch.float16, device=get_accelerator().device_name()) quantized_data, params = run_quantize(activations, num_groups, q_bits, is_symmetric_quant) - ds_dequant = run_dequantize(quantized_data, - params, - num_groups, - q_bits, - is_symmetric_quant) - ref_dequant = run_ref_dequantize(quantized_data, - params, - num_groups, - q_bits, - is_symmetric_quant) + ds_dequant = run_dequantize(quantized_data, params, num_groups, q_bits, is_symmetric_quant) + ref_dequant = run_ref_dequantize(quantized_data, params, num_groups, q_bits, is_symmetric_quant) - assert (torch.allclose(ds_dequant.flatten(), - ref_dequant.flatten(), - rtol=3e-2, - atol=2e-3)) + assert (torch.allclose(ds_dequant.flatten(), ref_dequant.flatten(), rtol=3e-2, atol=2e-3)) diff --git a/tests/unit/ops/quantizer/test_fake_quantization.py 
b/tests/unit/ops/quantizer/test_fake_quantization.py index c5304f769..2ea904f5b 100644 --- a/tests/unit/ops/quantizer/test_fake_quantization.py +++ b/tests/unit/ops/quantizer/test_fake_quantization.py @@ -45,8 +45,7 @@ def run_quant_dequant(inputs, groups, bits): # Note that we have an explicit boundary for groups as ((size / groups) - 1) / 4096 + 1) <= MAX_REG. def test_fake_quant_dequant(tensor_shape, groups): - input_tensor = torch.rand((tensor_shape), - dtype=torch.float16).to(get_accelerator().device_name()) + input_tensor = torch.rand((tensor_shape), dtype=torch.float16).to(get_accelerator().device_name()) # 8-bit quantization. ref_input_8bit = input_tensor.clone().detach() diff --git a/tests/unit/ops/quantizer/test_quantize.py b/tests/unit/ops/quantizer/test_quantize.py index 3cfd812e6..0d87ab784 100644 --- a/tests/unit/ops/quantizer/test_quantize.py +++ b/tests/unit/ops/quantizer/test_quantize.py @@ -15,12 +15,8 @@ def run_quantize_ds(activations, num_groups, q_bits, is_symmetric_quant): if inference_module is None: inference_module = op_builder.QuantizerBuilder().load() - return inference_module.quantize( - activations, - num_groups, - q_bits, - inference_module.Symmetric - if is_symmetric_quant else inference_module.Asymmetric) + return inference_module.quantize(activations, num_groups, q_bits, + inference_module.Symmetric if is_symmetric_quant else inference_module.Asymmetric) def get_q_props(q_bits): @@ -33,13 +29,7 @@ def get_q_props(q_bits): return q_range, q_max, q_min -def get_scale_zero_point(q_bits, - is_symmetric_quant, - max, - min, - absmax, - scales=None, - zero_points=None): +def get_scale_zero_point(q_bits, is_symmetric_quant, max, min, absmax, scales=None, zero_points=None): q_range, q_max, q_min = get_q_props(q_bits) @@ -47,14 +37,11 @@ def get_scale_zero_point(q_bits, scale = torch.empty_like(absmax) for i, x in enumerate(absmax): scale[i] = torch.ones_like(x) if x == 0 else q_range / (2 * x) - zero_point = torch.zeros(scale.shape, - dtype=torch.float32, - device=get_accelerator().device_name()) + zero_point = torch.zeros(scale.shape, dtype=torch.float32, device=get_accelerator().device_name()) else: scale = torch.empty_like(max) for i, x in enumerate(max): - scale[i] = torch.ones_like(x) if max[i] == min[i] else q_range / (max[i] - - min[i]) + scale[i] = torch.ones_like(x) if max[i] == min[i] else q_range / (max[i] - min[i]) zero_point = q_min - (min * scale) return scale, zero_point @@ -73,15 +60,14 @@ def run_float_quantize(q_bits, is_symmetric_quant, activations_ref, num_groups): activations_ref = activations_ref.reshape(num_groups, -1).to(dtype=torch.float32) - max_abs_activations_ref = torch.amax(torch.abs(activations_ref), - dim=-1).view(num_groups, - -1) + max_abs_activations_ref = torch.amax(torch.abs(activations_ref), dim=-1).view(num_groups, -1) max_activations_ref = torch.amax(activations_ref, dim=-1).view(num_groups, -1) min_activations_ref = torch.amin(activations_ref, dim=-1).view(num_groups, -1) _, q_max, q_min = get_q_props(q_bits) - scale, zero_point = get_scale_zero_point(q_bits, is_symmetric_quant, max_activations_ref, min_activations_ref, max_abs_activations_ref) + scale, zero_point = get_scale_zero_point(q_bits, is_symmetric_quant, max_activations_ref, min_activations_ref, + max_abs_activations_ref) data_f = activations_ref * scale @@ -90,9 +76,7 @@ def run_float_quantize(q_bits, is_symmetric_quant, activations_ref, num_groups): data_i32 = torch.round(data_f).to(dtype=torch.int32) - data_i32 = torch.minimum(torch.maximum(data_i32, - 
q_min.expand_as(data_i32)), - q_max.expand_as(data_i32)) + data_i32 = torch.minimum(torch.maximum(data_i32, q_min.expand_as(data_i32)), q_max.expand_as(data_i32)) data_i8 = data_i32.to(dtype=torch.int8) scales = (1.0 / scale).reshape(-1, 1) @@ -104,34 +88,18 @@ def run_float_quantize(q_bits, is_symmetric_quant, activations_ref, num_groups): @pytest.mark.inference_ops @pytest.mark.parametrize("num_groups", [1, 13, 512]) -@pytest.mark.parametrize("num_elems", - [8, - 16, - 32, - 64, - 128, - 256, - 4096, - 8192, - 12288, - 16384]) +@pytest.mark.parametrize("num_elems", [8, 16, 32, 64, 128, 256, 4096, 8192, 12288, 16384]) @pytest.mark.parametrize("is_symmetric_quant", [True, False]) @pytest.mark.parametrize("q_bits", [4, 8]) @pytest.mark.parametrize("directed_case", ["all_zeros", None]) -def test_float_quantize(num_elems, - num_groups, - is_symmetric_quant, - q_bits, - directed_case): +def test_float_quantize(num_elems, num_groups, is_symmetric_quant, q_bits, directed_case): if directed_case == "all_zeros": - activations_ds = torch.zeros((num_groups, - num_elems), + activations_ds = torch.zeros((num_groups, num_elems), dtype=torch.float16, device=get_accelerator().device_name()) else: - activations_ds = torch.randn((num_groups, - num_elems), + activations_ds = torch.randn((num_groups, num_elems), dtype=torch.float16, device=get_accelerator().device_name()) activations_ref = activations_ds.clone().detach() @@ -144,19 +112,9 @@ def test_float_quantize(num_elems, ds_out_tensor = int4x2to2xint4(ds_out_tensor) # Allow a max difference of 1 to account for differences in rounding in pytorch implementation - assert (torch.all( - torch.lt(torch.abs(ds_out_tensor.flatten() - ref_out_tensor.flatten()), - 2))) + assert (torch.all(torch.lt(torch.abs(ds_out_tensor.flatten() - ref_out_tensor.flatten()), 2))) if is_symmetric_quant: assert (torch.allclose(ds_out_params.flatten(), ref_params[:, 0].flatten())) else: - assert (torch.allclose(ds_out_params[:, - 0].flatten(), - ref_params[:, - 0].flatten())) - assert (torch.allclose(ds_out_params[:, - 1].flatten(), - ref_params[:, - 1].flatten(), - atol=5e-5, - rtol=5e-5)) + assert (torch.allclose(ds_out_params[:, 0].flatten(), ref_params[:, 0].flatten())) + assert (torch.allclose(ds_out_params[:, 1].flatten(), ref_params[:, 1].flatten(), atol=5e-5, rtol=5e-5)) diff --git a/tests/unit/ops/sparse_attention/test_sparse_attention.py b/tests/unit/ops/sparse_attention/test_sparse_attention.py index 70be0971a..3c8a1e043 100644 --- a/tests/unit/ops/sparse_attention/test_sparse_attention.py +++ b/tests/unit/ops/sparse_attention/test_sparse_attention.py @@ -13,8 +13,7 @@ from deepspeed.ops.op_builder import SparseAttnBuilder from unit.util import skip_on_arch, skip_on_cuda if not deepspeed.ops.__compatible_ops__[SparseAttnBuilder.NAME]: - pytest.skip("sparse attention op is not compatible on this system", - allow_module_level=True) + pytest.skip("sparse attention op is not compatible on this system", allow_module_level=True) def dense_to_sparse(w, mask, block): @@ -26,7 +25,7 @@ def dense_to_sparse(w, mask, block): h, i, j = nnz[:, 0], nnz[:, 1], nnz[:, 2] for zz in range(Z): for idx, (hh, ii, jj) in enumerate(zip(h, i, j)): - ret[zz, idx, :, :] = w[zz, hh, ii*block: (ii+1)*block, jj*block: (jj+1)*block] + ret[zz, idx, :, :] = w[zz, hh, ii * block:(ii + 1) * block, jj * block:(jj + 1) * block] return ret @@ -96,34 +95,23 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo if layout is None: layout = make_layout(rho, (H, M // block, N // 
block)) if dense_x: - x = torch.rand((Z, - H, - M, - N), - dtype=dtype, - requires_grad=True, - device=get_accelerator().device_name()) + x = torch.rand((Z, H, M, N), dtype=dtype, requires_grad=True, device=get_accelerator().device_name()) else: - x = torch.rand((Z, - layout.sum(), - block, - block), + x = torch.rand((Z, layout.sum(), block, block), dtype=dtype, requires_grad=True, device=get_accelerator().device_name()) dx = torch.rand_like(x) bool_attn_mask = torch.randint(low=0, high=2, - size=(N, - N), + size=(N, N), dtype=torch.bool, requires_grad=False, device=get_accelerator().device_name()) fp_attn_mask = bool_attn_mask.type(dtype) kp_mask = torch.randint(low=0, high=2, - size=(Z, - N), + size=(Z, N), dtype=dtype, requires_grad=False, device=get_accelerator().device_name()) @@ -144,7 +132,15 @@ def test_softmax(block, width, dtype): scale = 0.4 rho = 0.4 M = N = width - layout, x, dx, bool_attn_mask, fp_attn_mask, kp_mask = init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, layout=None) + layout, x, dx, bool_attn_mask, fp_attn_mask, kp_mask = init_softmax_inputs(Z, + H, + M, + N, + scale, + rho, + block, + dtype, + layout=None) ref_y, ref_dx = run_softmax_reference(x, scale, dx, kp_mask, bool_attn_mask, layout, block) st_y, st_dx = run_softmax_sparse(x, scale, dx, kp_mask, fp_attn_mask, layout, block) @@ -195,20 +191,8 @@ def init_matmul_inputs(Z, H, M, N, K, rho, mode, trans_a, trans_b, block, dtype, BS0 = N if trans_b else K BS1 = K if trans_b else N shape = {'sdd': (M, N), 'dsd': (AS0, AS1), 'dds': (BS0, BS1)}[mode] - x = torch.rand((Z, - H, - AS0, - AS1), - dtype=dtype, - requires_grad=True, - device=get_accelerator().device_name()) - w = torch.rand((Z, - H, - BS0, - BS1), - dtype=dtype, - requires_grad=True, - device=get_accelerator().device_name()) + x = torch.rand((Z, H, AS0, AS1), dtype=dtype, requires_grad=True, device=get_accelerator().device_name()) + w = torch.rand((Z, H, BS0, BS1), dtype=dtype, requires_grad=True, device=get_accelerator().device_name()) dy = torch.rand((Z, H, M, N), dtype=dtype, device=get_accelerator().device_name()) if layout is None: layout = make_layout(rho, (H, shape[0] // block, shape[1] // block)) diff --git a/tests/unit/ops/spatial/test_nhwc_bias_add.py b/tests/unit/ops/spatial/test_nhwc_bias_add.py index f3a31cf47..085731a67 100644 --- a/tests/unit/ops/spatial/test_nhwc_bias_add.py +++ b/tests/unit/ops/spatial/test_nhwc_bias_add.py @@ -18,22 +18,7 @@ def ref_bias_add(activations, bias): return activations + bias.reshape(1, -1, 1, 1) -channels_list = [ - 192, - 384, - 320, - 576, - 640, - 768, - 960, - 1152, - 1280, - 1536, - 1600, - 1920, - 2240, - 2560 -] +channels_list = [192, 384, 320, 576, 640, 768, 960, 1152, 1280, 1536, 1600, 1920, 2240, 2560] @pytest.mark.inference_ops @@ -41,16 +26,10 @@ channels_list = [ @pytest.mark.parametrize("image_size", [16, 32, 64]) @pytest.mark.parametrize("channels", channels_list) def test_bias_add(batch, image_size, channels): - activations = torch.randn( - (batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device=get_accelerator().device_name()).to(memory_format=torch.channels_last) - bias = torch.randn((channels), - dtype=torch.float16, - device=get_accelerator().device_name()) + activations = torch.randn((batch, channels, image_size, image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + bias = torch.randn((channels), dtype=torch.float16, device=get_accelerator().device_name()) ref_vals = 
ref_bias_add(activations.clone().detach(), bias) ds_vals = nhwc_bias_add(activations, bias) @@ -67,23 +46,13 @@ def ref_bias_add_add(activations, bias, other): @pytest.mark.parametrize("image_size", [16, 32, 64]) @pytest.mark.parametrize("channels", channels_list) def test_bias_add_add(batch, image_size, channels): - activations = torch.randn( - (batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device=get_accelerator().device_name()).to(memory_format=torch.channels_last) - other = torch.randn( - (batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device=get_accelerator().device_name()).to(memory_format=torch.channels_last) - bias = torch.randn((channels), - dtype=torch.float16, - device=get_accelerator().device_name()) + activations = torch.randn((batch, channels, image_size, image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + other = torch.randn((batch, channels, image_size, image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + bias = torch.randn((channels), dtype=torch.float16, device=get_accelerator().device_name()) ref_vals = ref_bias_add_add(activations.clone().detach(), bias, other) ds_vals = nhwc_bias_add(activations, bias, other=other) @@ -92,13 +61,7 @@ def test_bias_add_add(batch, image_size, channels): def ref_bias_add_bias_add(activations, bias, other, other_bias): - return (activations + bias.reshape(1, - -1, - 1, - 1)) + (other + other_bias.reshape(1, - -1, - 1, - 1)) + return (activations + bias.reshape(1, -1, 1, 1)) + (other + other_bias.reshape(1, -1, 1, 1)) @pytest.mark.inference_ops @@ -106,31 +69,16 @@ def ref_bias_add_bias_add(activations, bias, other, other_bias): @pytest.mark.parametrize("image_size", [16, 32, 64]) @pytest.mark.parametrize("channels", channels_list) def test_bias_add_bias_add(batch, image_size, channels): - activations = torch.randn( - (batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device=get_accelerator().device_name()).to(memory_format=torch.channels_last) - other = torch.randn( - (batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device=get_accelerator().device_name()).to(memory_format=torch.channels_last) - bias = torch.randn((channels), - dtype=torch.float16, - device=get_accelerator().device_name()) - other_bias = torch.randn((channels), - dtype=torch.float16, - device=get_accelerator().device_name()) + activations = torch.randn((batch, channels, image_size, image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + other = torch.randn((batch, channels, image_size, image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + bias = torch.randn((channels), dtype=torch.float16, device=get_accelerator().device_name()) + other_bias = torch.randn((channels), dtype=torch.float16, device=get_accelerator().device_name()) - ref_vals = ref_bias_add_bias_add(activations.clone().detach(), - bias, - other, - other_bias) + ref_vals = ref_bias_add_bias_add(activations.clone().detach(), bias, other, other_bias) ds_vals = nhwc_bias_add(activations, bias, other=other, other_bias=other_bias) assert allclose(ds_vals, ref_vals) diff --git a/tests/unit/ops/transformer/inference/test_bias_add.py b/tests/unit/ops/transformer/inference/test_bias_add.py index f3475a147..2a58933fa 100644 --- 
a/tests/unit/ops/transformer/inference/test_bias_add.py +++ b/tests/unit/ops/transformer/inference/test_bias_add.py @@ -7,8 +7,7 @@ from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: - pytest.skip("Inference ops are not available on this system", - allow_module_level=True) + pytest.skip("Inference ops are not available on this system", allow_module_level=True) inference_module = None torch_minor_version = None @@ -40,14 +39,8 @@ def run_bias_add_ds(activations, bias): @pytest.mark.parametrize("channels", [512, 1232, 4096]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) def test_bias_add(batch, sequence, channels, dtype): - activations_ds = torch.randn((batch, - sequence, - channels), - dtype=dtype, - device=get_accelerator().device_name()) - bias_ds = torch.randn((channels), - dtype=dtype, - device=get_accelerator().device_name()) + activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=get_accelerator().device_name()) + bias_ds = torch.randn((channels), dtype=dtype, device=get_accelerator().device_name()) activations_ref = activations_ds.clone().detach() bias_ref = bias_ds.clone().detach() diff --git a/tests/unit/ops/transformer/inference/test_bias_geglu.py b/tests/unit/ops/transformer/inference/test_bias_geglu.py index c70974e51..508a1a990 100644 --- a/tests/unit/ops/transformer/inference/test_bias_geglu.py +++ b/tests/unit/ops/transformer/inference/test_bias_geglu.py @@ -9,8 +9,7 @@ from deepspeed.ops.op_builder import InferenceBuilder from deepspeed.accelerator import get_accelerator if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: - pytest.skip("Inference ops are not available on this system", - allow_module_level=True) + pytest.skip("Inference ops are not available on this system", allow_module_level=True) inference_module = None torch_minor_version = None @@ -27,8 +26,7 @@ def run_bias_geglu_reference(activations, bias): # Explicitly using the default GeLU activations = activations + bias.reshape(1, 1, -1) hidden_states, gate = activations.chunk(2, dim=-1) - return hidden_states * torch.nn.functional.gelu(gate.to(torch.float32)).to( - activations.dtype) + return hidden_states * torch.nn.functional.gelu(gate.to(torch.float32)).to(activations.dtype) def run_bias_geglu_ds(activation, bias): @@ -44,14 +42,8 @@ def run_bias_geglu_ds(activation, bias): @pytest.mark.parametrize("channels", [512, 1232, 4096]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) def test_bias_geglu(batch, sequence, channels, dtype): - activation = torch.randn((batch, - sequence, - channels * 2), - dtype=dtype, - device=get_accelerator().device_name()) - bias = torch.randn((channels * 2), - dtype=dtype, - device=get_accelerator().device_name()) + activation = torch.randn((batch, sequence, channels * 2), dtype=dtype, device=get_accelerator().device_name()) + bias = torch.randn((channels * 2), dtype=dtype, device=get_accelerator().device_name()) ds_out = run_bias_geglu_ds(activation, bias) ref_out = run_bias_geglu_reference(activation, bias) diff --git a/tests/unit/ops/transformer/inference/test_bias_gelu.py b/tests/unit/ops/transformer/inference/test_bias_gelu.py index 3c1762179..eea1f9a77 100644 --- a/tests/unit/ops/transformer/inference/test_bias_gelu.py +++ b/tests/unit/ops/transformer/inference/test_bias_gelu.py @@ -10,8 +10,7 @@ from deepspeed.ops.op_builder import InferenceBuilder from packaging 
import version as pkg_version if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: - pytest.skip("Inference ops are not available on this system", - allow_module_level=True) + pytest.skip("Inference ops are not available on this system", allow_module_level=True) inference_module = None torch_minor_version = None @@ -25,8 +24,7 @@ def allclose(x, y): def run_bias_gelu_reference(activations, bias): # Expected behavior is that of casting to float32 internally and using the tanh approximation - return torch.nn.functional.gelu(activations.to(torch.float32) + - bias.to(torch.float32), + return torch.nn.functional.gelu(activations.to(torch.float32) + bias.to(torch.float32), approximate='tanh').to(activations.dtype) @@ -49,14 +47,8 @@ def test_bias_gelu(batch, sequence, channels, dtype): if pkg_version.parse(torch.__version__) < pkg_version.parse("1.12"): pytest.skip("gelu implementation matches only after torch 1.12") - activations_ds = torch.randn((batch, - sequence, - channels), - dtype=dtype, - device=get_accelerator().device_name()) - bias_ds = torch.randn((channels), - dtype=dtype, - device=get_accelerator().device_name()) + activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=get_accelerator().device_name()) + bias_ds = torch.randn((channels), dtype=dtype, device=get_accelerator().device_name()) activations_ref = activations_ds.clone().detach() bias_ref = bias_ds.clone().detach() diff --git a/tests/unit/ops/transformer/inference/test_bias_relu.py b/tests/unit/ops/transformer/inference/test_bias_relu.py index e2b66f6bd..b0ac4c624 100644 --- a/tests/unit/ops/transformer/inference/test_bias_relu.py +++ b/tests/unit/ops/transformer/inference/test_bias_relu.py @@ -9,8 +9,7 @@ from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: - pytest.skip("Inference ops are not available on this system", - allow_module_level=True) + pytest.skip("Inference ops are not available on this system", allow_module_level=True) inference_module = None torch_minor_version = None @@ -24,8 +23,7 @@ def allclose(x, y): def run_bias_relu_reference(activations, bias): # Expected behavior is that of casting to float32 internally - return torch.nn.functional.relu( - activations.to(torch.float32) + bias.to(torch.float32)).to(activations.dtype) + return torch.nn.functional.relu(activations.to(torch.float32) + bias.to(torch.float32)).to(activations.dtype) def run_bias_relu_ds(activations, bias): @@ -44,14 +42,8 @@ def run_bias_relu_ds(activations, bias): @pytest.mark.parametrize("channels", [512, 1232, 4096]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) def test_bias_relu(batch, sequence, channels, dtype): - activations_ds = torch.randn((batch, - sequence, - channels), - dtype=dtype, - device=get_accelerator().device_name()) - bias_ds = torch.randn((channels), - dtype=dtype, - device=get_accelerator().device_name()) + activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=get_accelerator().device_name()) + bias_ds = torch.randn((channels), dtype=dtype, device=get_accelerator().device_name()) activations_ref = activations_ds.clone().detach() bias_ref = bias_ds.clone().detach() diff --git a/tests/unit/ops/transformer/inference/test_layer_norm.py b/tests/unit/ops/transformer/inference/test_layer_norm.py index 61f645562..66b71b0b1 100644 --- a/tests/unit/ops/transformer/inference/test_layer_norm.py +++ 
b/tests/unit/ops/transformer/inference/test_layer_norm.py @@ -9,8 +9,7 @@ from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: - pytest.skip("Inference ops are not available on this system", - allow_module_level=True) + pytest.skip("Inference ops are not available on this system", allow_module_level=True) inference_module = None @@ -25,11 +24,7 @@ def ref_implementation(vals, gamma, beta, espilon, channels, dtype): vals_f = vals.to(torch.float32) gamma_f = gamma.to(torch.float32) beta_f = beta.to(torch.float32) - return torch.nn.functional.layer_norm(vals_f, - (channels, - ), - weight=gamma_f, - bias=beta_f).to(dtype) + return torch.nn.functional.layer_norm(vals_f, (channels, ), weight=gamma_f, bias=beta_f).to(dtype) def ds_implementation(vals, gamma, beta, epsilon): @@ -45,17 +40,9 @@ def ds_implementation(vals, gamma, beta, epsilon): @pytest.mark.parametrize("channels", [384, 512, 768, 1024, 2048, 8192, 14432]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) def test_layer_norm(batch, seq_len, channels, dtype): - vals = torch.randn((batch, - seq_len, - channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - gamma = torch.randn((channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - beta = torch.rand((channels), - dtype=dtype, - device=get_accelerator().current_device_name()) + vals = torch.randn((batch, seq_len, channels), dtype=dtype, device=get_accelerator().current_device_name()) + gamma = torch.randn((channels), dtype=dtype, device=get_accelerator().current_device_name()) + beta = torch.rand((channels), dtype=dtype, device=get_accelerator().current_device_name()) epsilon = 1e-5 ref_output = ref_implementation(vals, gamma, beta, epsilon, channels, dtype) @@ -70,11 +57,7 @@ def residual_ref_implementation(vals, bias, res, gamma, beta, espilon, channels, res_f = res.to(torch.float32) gamma_f = gamma.to(torch.float32) beta_f = beta.to(torch.float32) - return torch.nn.functional.layer_norm(vals_f + bias_f + res_f, - (channels, - ), - weight=gamma_f, - bias=beta_f).to(dtype) + return torch.nn.functional.layer_norm(vals_f + bias_f + res_f, (channels, ), weight=gamma_f, bias=beta_f).to(dtype) def residual_ds_implementation(vals, bias, res, gamma, beta, epsilon): @@ -90,59 +73,27 @@ def residual_ds_implementation(vals, bias, res, gamma, beta, epsilon): @pytest.mark.parametrize("channels", [384, 512, 768, 1024, 2048, 8192, 14432]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) def test_layer_norm_residual(batch, seq_len, channels, dtype): - vals = torch.randn((batch, - seq_len, - channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - residual = torch.randn((batch, - seq_len, - channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - bias = torch.randn((channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - gamma = torch.randn((channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - beta = torch.rand((channels), - dtype=dtype, - device=get_accelerator().current_device_name()) + vals = torch.randn((batch, seq_len, channels), dtype=dtype, device=get_accelerator().current_device_name()) + residual = torch.randn((batch, seq_len, channels), dtype=dtype, device=get_accelerator().current_device_name()) + bias = torch.randn((channels), dtype=dtype, device=get_accelerator().current_device_name()) + gamma = 
torch.randn((channels), dtype=dtype, device=get_accelerator().current_device_name()) + beta = torch.rand((channels), dtype=dtype, device=get_accelerator().current_device_name()) epsilon = 1e-5 new_output = residual_ds_implementation(vals, bias, residual, gamma, beta, epsilon) - ref_output = residual_ref_implementation(vals, - bias, - residual, - gamma, - beta, - epsilon, - channels, - dtype) + ref_output = residual_ref_implementation(vals, bias, residual, gamma, beta, epsilon, channels, dtype) assert allclose(new_output, ref_output) -def residual_store_ref_implementation(vals, - bias, - res, - gamma, - beta, - espilon, - channels, - dtype): +def residual_store_ref_implementation(vals, bias, res, gamma, beta, espilon, channels, dtype): vals_f = vals.to(torch.float32) bias_f = bias.to(torch.float32).reshape(1, 1, -1) res_f = res.to(torch.float32) gamma_f = gamma.to(torch.float32) beta_f = beta.to(torch.float32) res_output = vals_f + bias_f + res_f - norm_output = torch.nn.functional.layer_norm(res_output, - (channels, - ), - weight=gamma_f, - bias=beta_f).to(dtype) + norm_output = torch.nn.functional.layer_norm(res_output, (channels, ), weight=gamma_f, bias=beta_f).to(dtype) return norm_output, res_output.to(dtype) @@ -150,13 +101,7 @@ def residual_store_ds_implementation(vals, bias, res, gamma, beta, epsilon): global inference_module if inference_module is None: inference_module = InferenceBuilder().load() - return inference_module.layer_norm_residual_store_pre_ln_res( - vals, - bias, - res, - gamma, - beta, - epsilon) + return inference_module.layer_norm_residual_store_pre_ln_res(vals, bias, res, gamma, beta, epsilon) @pytest.mark.inference_ops @@ -165,36 +110,16 @@ def residual_store_ds_implementation(vals, bias, res, gamma, beta, epsilon): @pytest.mark.parametrize("channels", [384, 512, 768, 1024, 2048, 8192, 14432]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) def test_layer_norm_residual_store_pre_ln_res(batch, seq_len, channels, dtype): - vals = torch.randn((batch, - seq_len, - channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - residual = torch.randn((batch, - seq_len, - channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - bias = torch.randn((channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - gamma = torch.randn((channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - beta = torch.rand((channels), - dtype=dtype, - device=get_accelerator().current_device_name()) + vals = torch.randn((batch, seq_len, channels), dtype=dtype, device=get_accelerator().current_device_name()) + residual = torch.randn((batch, seq_len, channels), dtype=dtype, device=get_accelerator().current_device_name()) + bias = torch.randn((channels), dtype=dtype, device=get_accelerator().current_device_name()) + gamma = torch.randn((channels), dtype=dtype, device=get_accelerator().current_device_name()) + beta = torch.rand((channels), dtype=dtype, device=get_accelerator().current_device_name()) epsilon = 1e-5 # Need to run the reference first since there's an in-place component to ours - ref_norm_output, norm_res_output = residual_store_ref_implementation(vals, - bias, - residual, - gamma, - beta, - epsilon, - channels, - dtype) + ref_norm_output, norm_res_output = residual_store_ref_implementation(vals, bias, residual, gamma, beta, epsilon, + channels, dtype) ds_norm_output, ds_res_output = residual_store_ds_implementation(vals, bias, residual, gamma, beta, epsilon) diff --git 
a/tests/unit/ops/transformer/inference/test_moe_res_matmult.py b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py index fdd6e8607..0a3a4ee8e 100644 --- a/tests/unit/ops/transformer/inference/test_moe_res_matmult.py +++ b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py @@ -9,8 +9,7 @@ from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: - pytest.skip("Inference ops are not available on this system", - allow_module_level=True) + pytest.skip("Inference ops are not available on this system", allow_module_level=True) inference_module = None @@ -38,26 +37,10 @@ def run_moe_res_matmul_ds(residual, coef, output): @pytest.mark.parametrize("c", [1, 4]) @pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) def test_moe_residual_matmul(hidden_dim, c, dtype): - residual_ds = torch.randn((c, - hidden_dim * c, - hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) - coeff1 = torch.randn((1, - 1, - hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) - coeff2 = torch.randn((1, - 1, - hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) - out_ds = torch.randn((c, - hidden_dim * c, - hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) + residual_ds = torch.randn((c, hidden_dim * c, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) + coeff1 = torch.randn((1, 1, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) + coeff2 = torch.randn((1, 1, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) + out_ds = torch.randn((c, hidden_dim * c, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) coeff_ds = torch.cat((coeff1, coeff2), dim=-1) residual_ref = residual_ds.clone().detach() coeff_ref = coeff_ds.clone().detach() diff --git a/tests/unit/ops/transformer/inference/test_residual_add.py b/tests/unit/ops/transformer/inference/test_residual_add.py index 0dacee355..2d6b515a5 100644 --- a/tests/unit/ops/transformer/inference/test_residual_add.py +++ b/tests/unit/ops/transformer/inference/test_residual_add.py @@ -9,8 +9,7 @@ from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: - pytest.skip("Inference ops are not available on this system", - allow_module_level=True) + pytest.skip("Inference ops are not available on this system", allow_module_level=True) def allclose(x, y): @@ -24,13 +23,7 @@ def inference_module(): return InferenceBuilder().load() -def res_add_bias_ref(hidden_state, - residual, - attn_output, - attn_bias, - final_bias, - mp_size=1, - pre_attn_norm=True): +def res_add_bias_ref(hidden_state, residual, attn_output, attn_bias, final_bias, mp_size=1, pre_attn_norm=True): if pre_attn_norm: hidden_state += (residual + final_bias + attn_output + attn_bias) / mp_size else: @@ -38,43 +31,19 @@ def res_add_bias_ref(hidden_state, return hidden_state -def res_add_bias_ref_gptj(hidden_state, - residual, - attn_output, - attn_bias, - final_bias, - add_attn_bias, - mp_size): +def res_add_bias_ref_gptj(hidden_state, residual, attn_output, attn_bias, final_bias, add_attn_bias, mp_size): hidden_state += attn_output + (residual + final_bias) / mp_size if add_attn_bias: hidden_state += attn_bias / mp_size return hidden_state -def run_residual_add_reference(hidden_state, - residual, - attn_output, - attn_bias, - final_bias, - 
mlp_after_attn, - add_attn_bias, - mp_size, - pre_attn_norm): +def run_residual_add_reference(hidden_state, residual, attn_output, attn_bias, final_bias, mlp_after_attn, + add_attn_bias, mp_size, pre_attn_norm): if mlp_after_attn: - return res_add_bias_ref(hidden_state, - residual, - attn_output, - attn_bias, - final_bias, - mp_size, - pre_attn_norm) + return res_add_bias_ref(hidden_state, residual, attn_output, attn_bias, final_bias, mp_size, pre_attn_norm) else: - return res_add_bias_ref_gptj(hidden_state, - residual, - attn_output, - attn_bias, - final_bias, - add_attn_bias, + return res_add_bias_ref_gptj(hidden_state, residual, attn_output, attn_bias, final_bias, add_attn_bias, mp_size) @@ -87,58 +56,20 @@ def run_residual_add_reference(hidden_state, @pytest.mark.parametrize("add_bias", [True, False]) @pytest.mark.parametrize("mp_size", [1, 2]) @pytest.mark.parametrize("pre_attn_norm", [True, False]) -def test_residual_add(inference_module, - batch, - sequence, - hidden_dim, - dtype, - mlp_after_attn, - add_bias, - mp_size, +def test_residual_add(inference_module, batch, sequence, hidden_dim, dtype, mlp_after_attn, add_bias, mp_size, pre_attn_norm): - ds_out = torch.randn((batch, - sequence, - hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) - residual = torch.randn((batch, - sequence, - hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) - attn_output = torch.randn((batch, - sequence, - hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) - final_bias = torch.randn((hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) - attn_bias = torch.randn((hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) + ds_out = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) + residual = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) + attn_output = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) + final_bias = torch.randn((hidden_dim), dtype=dtype, device=get_accelerator().device_name()) + attn_bias = torch.randn((hidden_dim), dtype=dtype, device=get_accelerator().device_name()) ref_out = ds_out.clone() - ref_out = run_residual_add_reference(ref_out, - residual, - attn_output, - attn_bias, - final_bias, - mlp_after_attn, - add_bias, - mp_size, - pre_attn_norm) + ref_out = run_residual_add_reference(ref_out, residual, attn_output, attn_bias, final_bias, mlp_after_attn, + add_bias, mp_size, pre_attn_norm) res_add_args = [ - ds_out, - residual, - attn_output, - attn_bias, - final_bias, - mp_size, - mlp_after_attn, - add_bias, - pre_attn_norm + ds_out, residual, attn_output, attn_bias, final_bias, mp_size, mlp_after_attn, add_bias, pre_attn_norm ] if dtype == torch.float16: diff --git a/tests/unit/pipe/test_pipe_module.py b/tests/unit/pipe/test_pipe_module.py index e8404b0d5..5ed6d9e20 100644 --- a/tests/unit/pipe/test_pipe_module.py +++ b/tests/unit/pipe/test_pipe_module.py @@ -22,10 +22,8 @@ LAYERS = 8 @pytest.fixture def sequential_model(): model = torch.nn.Sequential( - *[nn.Linear(HIDDEN_DIM, - HIDDEN_DIM) for _ in range(LAYERS)], - nn.Linear(HIDDEN_DIM, - 1), + *[nn.Linear(HIDDEN_DIM, HIDDEN_DIM) for _ in range(LAYERS)], + nn.Linear(HIDDEN_DIM, 1), ) return model @@ -40,8 +38,7 @@ def simple_config(): "type": "Adam", "params": { "lr": 0.001, - "betas": [0.9, - 0.999], + "betas": [0.9, 0.999], "eps": 1e-8, "weight_decay": 3e-7 } @@ -73,16 +70,14 @@ class 
TestPipeModuleSequential(DistributedTest): # Ensure all parameters are accounted for. my_params = sum(p.numel() for p in pipe_model.parameters()) - total_pipe_params = torch.LongTensor([my_params - ]).to(get_accelerator().device_name()) + total_pipe_params = torch.LongTensor([my_params]).to(get_accelerator().device_name()) dist.all_reduce(total_pipe_params) total_pipe_params = total_pipe_params.item() assert total_pipe_params == base_params - pipe_model, _, _, _ = deepspeed.initialize( - config=simple_config, - model=pipe_model, - model_parameters=[p for p in pipe_model.parameters()]) + pipe_model, _, _, _ = deepspeed.initialize(config=simple_config, + model=pipe_model, + model_parameters=[p for p in pipe_model.parameters()]) if pipe_model.is_first_stage or pipe_model.is_last_stage: pipe_input = base_input.clone().detach().to(get_accelerator().device_name()) diff --git a/tests/unit/profiling/flops_profiler/test_flops_profiler.py b/tests/unit/profiling/flops_profiler/test_flops_profiler.py index 1f9353358..b23cf54c3 100644 --- a/tests/unit/profiling/flops_profiler/test_flops_profiler.py +++ b/tests/unit/profiling/flops_profiler/test_flops_profiler.py @@ -9,8 +9,7 @@ from unit.common import DistributedTest TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) -pytestmark = pytest.mark.skipif(TORCH_MAJOR < 1 - or (TORCH_MAJOR == 1 and TORCH_MINOR < 3), +pytestmark = pytest.mark.skipif(TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 3), reason='requires Pytorch version 1.3 or above') @@ -22,35 +21,25 @@ TOLERANCE = 0.05 class LeNet5(torch.nn.Module): + def __init__(self, n_classes): super(LeNet5, self).__init__() self.feature_extractor = torch.nn.Sequential( - torch.nn.Conv2d(in_channels=1, - out_channels=6, - kernel_size=5, - stride=1), + torch.nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1), torch.nn.Tanh(), torch.nn.AvgPool2d(kernel_size=2), - torch.nn.Conv2d(in_channels=6, - out_channels=16, - kernel_size=5, - stride=1), + torch.nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1), torch.nn.Tanh(), torch.nn.AvgPool2d(kernel_size=2), - torch.nn.Conv2d(in_channels=16, - out_channels=120, - kernel_size=5, - stride=1), + torch.nn.Conv2d(in_channels=16, out_channels=120, kernel_size=5, stride=1), torch.nn.Tanh(), ) self.classifier = torch.nn.Sequential( - torch.nn.Linear(in_features=120, - out_features=84), + torch.nn.Linear(in_features=120, out_features=84), torch.nn.Tanh(), - torch.nn.Linear(in_features=84, - out_features=n_classes), + torch.nn.Linear(in_features=84, out_features=n_classes), ) def forward(self, x): @@ -90,9 +79,7 @@ class TestFlopsProfiler(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim, empty_grad=False) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, diff --git a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py index af354fe1c..29da1e378 100644 --- a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py +++ b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py @@ -106,6 +106,7 @@ def _test_activation_checkpoint_ordering(module, expected_ordering, *inputs): class 
MaskedLinear(torch.nn.Linear): + def forward(self, x, mask): out = super().forward(x) if mask.is_floating_point(): @@ -118,12 +119,14 @@ class MaskedLinear(torch.nn.Linear): class MaskedLinearSeq(MaskedLinear): """Tests pipeline modules by also returning the mask.""" + def forward(self, x, mask): return super().forward(x, mask), mask class MaskedLinearSeqDup(MaskedLinearSeq): """MaskedLinearSeq, but with more outputs than inputs and in a different order.""" + def forward(self, x, mask): dup = x.clone().detach() * 1.38 # just an arbitrary scaling x, mask = super().forward(x, mask) @@ -131,16 +134,19 @@ class MaskedLinearSeqDup(MaskedLinearSeq): class DropMaskLinear(torch.nn.Linear): + def forward(self, x, mask): return super().forward(x) class LinearNonTensorInput(torch.nn.Linear): + def forward(self, x, non_tensor_input): return super().forward(x) class LinearNonTensorOutput(torch.nn.Linear): + def __init__(self, non_tensor_output): super().__init__(HIDDEN_DIM, HIDDEN_DIM) self.non_tensor_output = non_tensor_output @@ -173,11 +179,10 @@ def _bool_to_float(btensor, dtype=torch.float32): # both bool and float are important, as bool is not differentiable -@pytest.mark.parametrize('mask', - [ - _mixed_mask(), - _bool_to_float(_mixed_mask()), - ]) +@pytest.mark.parametrize('mask', [ + _mixed_mask(), + _bool_to_float(_mixed_mask()), +]) class TestActivationCheckpoint(DistributedTest): world_size = 1 @@ -212,16 +217,7 @@ class TestActivationCheckpoint(DistributedTest): _test_activation_checkpoint(module, *inputs) -@pytest.mark.parametrize( - 'non_tensor', - [None, - 2, - True, - (None, - 2.5), - (None, - True, - torch.randn(HIDDEN_DIM))]) +@pytest.mark.parametrize('non_tensor', [None, 2, True, (None, 2.5), (None, True, torch.randn(HIDDEN_DIM))]) class TestCheckpointNonTensor(DistributedTest): world_size = 1 @@ -238,18 +234,9 @@ class TestCheckpointNonTensor(DistributedTest): _test_activation_checkpoint(module, inputs) -@pytest.mark.parametrize('non_tensor_output', - [ - None, - (torch.randn(HIDDEN_DIM), - 2.5), - (None, - torch.randn(HIDDEN_DIM), - True), - (None, - True, - torch.randn(HIDDEN_DIM)) - ]) +@pytest.mark.parametrize('non_tensor_output', [ + None, (torch.randn(HIDDEN_DIM), 2.5), (None, torch.randn(HIDDEN_DIM), True), (None, True, torch.randn(HIDDEN_DIM)) +]) class TestCheckpointNonTensorOutputOrdering(DistributedTest): world_size = 1 diff --git a/tests/unit/runtime/comm/test_coalesced_collectives.py b/tests/unit/runtime/comm/test_coalesced_collectives.py index fa1041379..f7d1832eb 100644 --- a/tests/unit/runtime/comm/test_coalesced_collectives.py +++ b/tests/unit/runtime/comm/test_coalesced_collectives.py @@ -13,11 +13,7 @@ class TestReduceScatterCoalesced(DistributedTest): world_size = 2 def test_single_input(self): - input = torch.full((6, - ), - dist.get_rank(), - dtype=torch.half, - device=get_accelerator().current_device_name()) + input = torch.full((6, ), dist.get_rank(), dtype=torch.half, device=get_accelerator().current_device_name()) (output, ) = reduce_scatter_coalesced([input], dist.get_world_group()) @@ -25,17 +21,10 @@ class TestReduceScatterCoalesced(DistributedTest): assert torch.allclose(output, torch.full_like(output, 0.5)) def test_two_inputs(self): - tensor_kwargs = { - "device": get_accelerator().current_device_name(), - "dtype": torch.half - } + tensor_kwargs = {"device": get_accelerator().current_device_name(), "dtype": torch.half} inputs = [ - dist.get_rank() * torch.arange(0, - 6, - **tensor_kwargs), - dist.get_rank() * torch.arange(6, - 9, - **tensor_kwargs), 
+ dist.get_rank() * torch.arange(0, 6, **tensor_kwargs), + dist.get_rank() * torch.arange(6, 9, **tensor_kwargs), ] output1, output2 = reduce_scatter_coalesced(inputs, dist.get_world_group()) @@ -56,10 +45,7 @@ class TestReduceScatterCoalescedTensorSmallerThanWorldSize(DistributedTest): world_size = 2 def test(self): - input = torch.zeros((1, - ), - dtype=torch.half, - device=get_accelerator().current_device_name()) + input = torch.zeros((1, ), dtype=torch.half, device=get_accelerator().current_device_name()) (output, ) = reduce_scatter_coalesced([input], dist.get_world_group()) diff --git a/tests/unit/runtime/half_precision/onebit/test_onebit.py b/tests/unit/runtime/half_precision/onebit/test_onebit.py index 84a367681..36c7bcafd 100644 --- a/tests/unit/runtime/half_precision/onebit/test_onebit.py +++ b/tests/unit/runtime/half_precision/onebit/test_onebit.py @@ -29,9 +29,8 @@ if TORCH_MAJOR < 1 or TORCH_MINOR < 8: rocm_version = OpBuilder.installed_rocm_version() if rocm_version[0] > 4: - pytest.skip( - "NCCL-based 1-bit compression is not yet supported w. ROCm 5 until cupy supports ROCm 5", - allow_module_level=True) + pytest.skip("NCCL-based 1-bit compression is not yet supported w. ROCm 5 until cupy supports ROCm 5", + allow_module_level=True) @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"]) @@ -62,9 +61,7 @@ class TestOneBitAdamBasic(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize( - config=config_dict, model=model, model_parameters=model.parameters() - ) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader( model=model, total_samples=50, @@ -127,10 +124,7 @@ class TestOneBitAdamExpAvgMask(DistributedTest): model=model, model_parameters=optimizer_grouped_parameters, ) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -234,14 +228,12 @@ class TestOneBitAdamCheckpointing(DistributedTest): # Test whether momentum mask still exist after saving checkpoint assert optimizer_1.optimizer.adam_freeze_key is True mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device) - assert torch.allclose( - optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 - ), f"Incorrect momentum mask" + assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, + atol=1e-07), f"Incorrect momentum mask" save_folder = os.path.join(tmpdir, "saved_checkpoint") model_1.save_checkpoint(save_folder, tag=None) - assert torch.allclose( - optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 - ), f"Momentum mask should not change after saving checkpoint" + assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, + atol=1e-07), f"Momentum mask should not change after saving checkpoint" model_2, optimizer_2, _, _ = deepspeed.initialize( config=config_dict, @@ -250,18 +242,16 @@ class TestOneBitAdamCheckpointing(DistributedTest): ) # Test whether momentum mask stays the same after loading checkpoint mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device) - assert torch.allclose( - optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 - ), f"Incorrect momentum mask" + assert 
torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, + atol=1e-07), f"Incorrect momentum mask" model_2.load_checkpoint( save_folder, tag=None, load_optimizer_states=True, load_lr_scheduler_states=True, ) - assert torch.allclose( - optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 - ), f"Momentum mask should not change after loading checkpoint" + assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, + atol=1e-07), f"Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset for v in optimizer_2.state.values(): assert "worker_error" not in v, f"Incorrect worker error" @@ -286,18 +276,15 @@ class TestOneBitAdamCheckpointing(DistributedTest): model_3.step() assert optimizer_3.optimizer.adam_freeze_key is True # Test whether momentum mask stays the same after loading checkpoint - assert ( - "exp_avg_mask" not in optimizer_3.param_groups[0] - ), f"Incorrect momentum mask" + assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), f"Incorrect momentum mask" model_3.load_checkpoint( save_folder, tag=None, load_optimizer_states=True, load_lr_scheduler_states=True, ) - assert ( - "exp_avg_mask" not in optimizer_3.param_groups[0] - ), f"Momentum mask should not change after loading checkpoint" + assert ("exp_avg_mask" + not in optimizer_3.param_groups[0]), f"Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset for v in optimizer_3.state.values(): assert "worker_error" not in v, f"Incorrect worker error" @@ -328,13 +315,8 @@ class TestOneBitAdamCheckpointing(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize( - config=config_dict, model=model, model_parameters=model.parameters() - ) - data_loader = random_dataloader(model=model, - total_samples=100, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=100, hidden_dim=hidden_dim, device=model.device) save_folder = os.path.join(tmpdir, "saved_checkpoint") for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -376,8 +358,7 @@ class TestOneBitAdamFP16Pipeline(DistributedTest): "type": "OneBitAdam", "params": { "lr": 0.00001, - "betas": [0.9, - 0.999], + "betas": [0.9, 0.999], "eps": 1e-8, "weight_decay": 3e-7, "freeze_step": 200, @@ -407,9 +388,7 @@ class TestOneBitAdamFP16Pipeline(DistributedTest): init_net = AlexNetPipe() test_net = copy.deepcopy(init_net) - test_model = PipelineModule(layers=test_net.to_layers(), - topology=topo, - loss_fn=nn.CrossEntropyLoss()) + test_model = PipelineModule(layers=test_net.to_layers(), topology=topo, loss_fn=nn.CrossEntropyLoss()) test_losses = train_cifar( test_model, @@ -450,9 +429,7 @@ class TestZeroOneAdamBasic(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize( - config=config_dict, model=model, model_parameters=model.parameters() - ) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader( model=model, total_samples=50, @@ -518,10 +495,7 @@ class TestZeroOneAdamExpAvgMask(DistributedTest): model=model, model_parameters=optimizer_grouped_parameters, ) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + data_loader = 
random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -627,14 +601,12 @@ class TestZeroOneAdamCheckpointing(DistributedTest): model_1.step() # Test whether momentum mask still exist after saving checkpoint mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device) - assert torch.allclose( - optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 - ), f"Incorrect momentum mask" + assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, + atol=1e-07), f"Incorrect momentum mask" save_folder = os.path.join(tmpdir, "saved_checkpoint") model_1.save_checkpoint(save_folder, tag=None) - assert torch.allclose( - optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 - ), f"Momentum mask should not change after saving checkpoint" + assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, + atol=1e-07), f"Momentum mask should not change after saving checkpoint" model_2, optimizer_2, _, _ = deepspeed.initialize( config=config_dict, @@ -643,18 +615,16 @@ class TestZeroOneAdamCheckpointing(DistributedTest): ) # Test whether momentum mask stays the same after loading checkpoint mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device) - assert torch.allclose( - optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 - ), f"Incorrect momentum mask" + assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, + atol=1e-07), f"Incorrect momentum mask" model_2.load_checkpoint( save_folder, tag=None, load_optimizer_states=True, load_lr_scheduler_states=True, ) - assert torch.allclose( - optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 - ), f"Momentum mask should not change after loading checkpoint" + assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, + atol=1e-07), f"Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset for v in optimizer_2.state.values(): assert "worker_error" not in v, f"Incorrect worker error" @@ -677,18 +647,15 @@ class TestZeroOneAdamCheckpointing(DistributedTest): model_3.backward(loss) model_3.step() # Test whether momentum mask stays the same after loading checkpoint - assert ( - "exp_avg_mask" not in optimizer_3.param_groups[0] - ), f"Incorrect momentum mask" + assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), f"Incorrect momentum mask" model_3.load_checkpoint( save_folder, tag=None, load_optimizer_states=True, load_lr_scheduler_states=True, ) - assert ( - "exp_avg_mask" not in optimizer_3.param_groups[0] - ), f"Momentum mask should not change after loading checkpoint" + assert ("exp_avg_mask" + not in optimizer_3.param_groups[0]), f"Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset for v in optimizer_3.state.values(): assert "worker_error" not in v, f"Incorrect worker error" @@ -721,13 +688,8 @@ class TestZeroOneAdamCheckpointing(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize( - config=config_dict, model=model, model_parameters=model.parameters() - ) - data_loader = random_dataloader(model=model, - total_samples=100, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=100, 
hidden_dim=hidden_dim, device=model.device) save_folder = os.path.join(tmpdir, "saved_checkpoint") for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -769,8 +731,7 @@ class TestZeroOneAdamFP16Pipeline(DistributedTest): "type": "ZeroOneAdam", "params": { "lr": 0.00001, - "betas": [0.9, - 0.999], + "betas": [0.9, 0.999], "eps": 1e-8, "weight_decay": 3e-7, "var_freeze_step": 4, @@ -803,9 +764,7 @@ class TestZeroOneAdamFP16Pipeline(DistributedTest): init_net = AlexNetPipe() test_net = copy.deepcopy(init_net) - test_model = PipelineModule(layers=test_net.to_layers(), - topology=topo, - loss_fn=nn.CrossEntropyLoss()) + test_model = PipelineModule(layers=test_net.to_layers(), topology=topo, loss_fn=nn.CrossEntropyLoss()) test_losses = train_cifar( test_model, @@ -849,9 +808,7 @@ class TestOneBitLambBasic(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize( - config=config_dict, model=model, model_parameters=model.parameters() - ) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader( model=model, total_samples=50, @@ -919,10 +876,7 @@ class TestOneBitLampExpAvgMask(DistributedTest): model=model, model_parameters=optimizer_grouped_parameters, ) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -1030,18 +984,16 @@ class TestOneBitLambCheckpointing(DistributedTest): # Test whether momentum mask still exist after saving checkpoint assert optimizer_1.optimizer.lamb_freeze_key is True mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device) - assert torch.allclose( - optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 - ), f"Incorrect momentum mask" + assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, + atol=1e-07), f"Incorrect momentum mask" scaling_coeff_1 = [] for v in optimizer_1.state.values(): assert "scaling_coeff" in v, f"Incorrect scaling_coeff" scaling_coeff_1.append(v["scaling_coeff"]) save_folder = os.path.join(tmpdir, "saved_checkpoint") model_1.save_checkpoint(save_folder, tag=None) - assert torch.allclose( - optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 - ), f"Momentum mask should not change after saving checkpoint" + assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, + atol=1e-07), f"Momentum mask should not change after saving checkpoint" model_2, optimizer_2, _, _ = deepspeed.initialize( config=config_dict, @@ -1050,18 +1002,16 @@ class TestOneBitLambCheckpointing(DistributedTest): ) # Test whether momentum mask stays the same after loading checkpoint mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device) - assert torch.allclose( - optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 - ), f"Incorrect momentum mask" + assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, + atol=1e-07), f"Incorrect momentum mask" model_2.load_checkpoint( save_folder, tag=None, load_optimizer_states=True, load_lr_scheduler_states=True, ) - assert torch.allclose( - optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 - ), f"Momentum mask should not change after loading checkpoint" + assert 
torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, + atol=1e-07), f"Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset assert len(optimizer_2.optimizer.worker_errors) == 0, f"Incorrect worker error" assert len(optimizer_2.optimizer.server_errors) == 0, f"Incorrect server error" @@ -1070,9 +1020,7 @@ class TestOneBitLambCheckpointing(DistributedTest): for v in optimizer_2.state.values(): assert "scaling_coeff" in v, f"Incorrect scaling_coeff" scaling_coeff_2.append(v["scaling_coeff"]) - assert list(sorted(scaling_coeff_2)) == list( - sorted(scaling_coeff_1) - ), f"Incorrect scaling_coeffs" + assert list(sorted(scaling_coeff_2)) == list(sorted(scaling_coeff_1)), f"Incorrect scaling_coeffs" assert optimizer_2.optimizer.lamb_freeze_key is True model_3, optimizer_3, _, _ = deepspeed.initialize( @@ -1093,18 +1041,15 @@ class TestOneBitLambCheckpointing(DistributedTest): model_3.step() assert optimizer_3.optimizer.lamb_freeze_key is True # Test whether momentum mask stays the same after loading checkpoint - assert ( - "exp_avg_mask" not in optimizer_3.param_groups[0] - ), f"Incorrect momentum mask" + assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), f"Incorrect momentum mask" model_3.load_checkpoint( save_folder, tag=None, load_optimizer_states=True, load_lr_scheduler_states=True, ) - assert ( - "exp_avg_mask" not in optimizer_3.param_groups[0] - ), f"Momentum mask should not change after loading checkpoint" + assert ("exp_avg_mask" + not in optimizer_3.param_groups[0]), f"Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset assert len(optimizer_3.optimizer.worker_errors) == 0, f"Incorrect worker error" assert len(optimizer_3.optimizer.server_errors) == 0, f"Incorrect server error" @@ -1145,13 +1090,8 @@ class TestOneBitLambCheckpointing(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize( - config=config_dict, model=model, model_parameters=model.parameters() - ) - data_loader = random_dataloader(model=model, - total_samples=100, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=100, hidden_dim=hidden_dim, device=model.device) save_folder = os.path.join(tmpdir, "saved_checkpoint") for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -1193,8 +1133,7 @@ class TestOneBitLambFP16Pipeline(DistributedTest): "type": "OneBitLamb", "params": { "lr": 0.00001, - "betas": [0.9, - 0.999], + "betas": [0.9, 0.999], "eps": 1e-8, "weight_decay": 3e-7, "freeze_step": 200, @@ -1224,9 +1163,7 @@ class TestOneBitLambFP16Pipeline(DistributedTest): init_net = AlexNetPipe() test_net = copy.deepcopy(init_net) - test_model = PipelineModule(layers=test_net.to_layers(), - topology=topo, - loss_fn=nn.CrossEntropyLoss()) + test_model = PipelineModule(layers=test_net.to_layers(), topology=topo, loss_fn=nn.CrossEntropyLoss()) test_losses = train_cifar( test_model, @@ -1258,15 +1195,11 @@ class TestCompressedAllReduceBasic(DistributedTest): worker_error = a - a_compressed dist.all_reduce(a_compressed) a_compressed.mul_(1 / dist.get_world_size()) - a_server_sign = ( - a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + a_server_sign = (a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) a_list = torch.chunk(a_compressed, 
chunks=dist.get_world_size()) - server_scale = [ - chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list - ] + server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list] a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) - a_server_compressed = torch.cat( - [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) + a_server_compressed = torch.cat([server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) rank = dist.get_rank() server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] get_accelerator().synchronize() diff --git a/tests/unit/runtime/half_precision/test_bf16.py b/tests/unit/runtime/half_precision/test_bf16.py index 3bc5cb138..c6a2c05a0 100644 --- a/tests/unit/runtime/half_precision/test_bf16.py +++ b/tests/unit/runtime/half_precision/test_bf16.py @@ -59,9 +59,7 @@ class TestAdamBF16ZeroOneCycleCompatibility(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -154,9 +152,7 @@ class TestZeroEmptyPartition(DistributedTest): # Ensure model has 2 parameters, to cause empty partition with DP=3 assert len(list(model.parameters())) == 2 - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) # Now make sure things work.. data_loader = random_dataloader(model=model, @@ -197,9 +193,7 @@ class TestZeroSupportedClientOptimizer(DistributedTest): model = SimpleModel(hidden_dim) client_optimizer = optimizer_constructor(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=client_optimizer) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=client_optimizer) class TestZero2ReduceScatterOff(DistributedTest): @@ -239,9 +233,7 @@ class TestZero2ReduceScatterOff(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -279,9 +271,7 @@ class TestZeroEmptyGrad(DistributedTest): model = SimpleModel(hidden_dim) optimizer = torch.optim.Adam(model.parameters()) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=optimizer) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -293,18 +283,8 @@ class TestZeroEmptyGrad(DistributedTest): model.step() -@pytest.mark.parametrize("comp_type", - [torch.float16, - torch.bfloat16, - torch.float], - ids=["fp16", - "bfp16", - "fp32"]) -@pytest.mark.parametrize("comm_type", - [torch.float16, - torch.bfloat16], - ids=["fp16", - "bfp16"]) +@pytest.mark.parametrize("comp_type", [torch.float16, torch.bfloat16, torch.float], ids=["fp16", "bfp16", "fp32"]) +@pytest.mark.parametrize("comm_type", [torch.float16, torch.bfloat16], 
ids=["fp16", "bfp16"]) class TestZeroDtypeCocktail(DistributedTest): world_size = 2 @@ -335,9 +315,7 @@ class TestZeroDtypeCocktail(DistributedTest): model = SimpleModel(hidden_dim) optimizer = torch.optim.Adam(model.parameters()) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=optimizer) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) data_loader = random_dataloader(model=model, total_samples=2, hidden_dim=hidden_dim, diff --git a/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py b/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py index 3052c4ee1..2882bd4df 100644 --- a/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py +++ b/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py @@ -37,9 +37,7 @@ class TestFused(DistributedTest): } hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) expected_loss_scale = 2**8 expected_scale_window = 2 @@ -74,9 +72,7 @@ class TestFused(DistributedTest): } hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) expected_loss_scale = 2**4 # Ensure the dynamic loss scaler is correctly configured. @@ -109,9 +105,7 @@ class TestFused(DistributedTest): } hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) expected_loss_scale = 2**8 expected_scale_window = 2 @@ -168,9 +162,7 @@ class TestUnfused(DistributedTest): } hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) expected_loss_scale = 2**8 expected_scale_window = 2 # Ensure the dynamic loss scaler is correctly configured. 
@@ -205,9 +197,7 @@ class TestUnfused(DistributedTest): } hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) expected_loss_scale = 2**4 expected_min_loss_scale = 0.25 @@ -242,9 +232,7 @@ class TestUnfused(DistributedTest): } hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) expected_loss_scale = 2**8 expected_scale_window = 2 diff --git a/tests/unit/runtime/half_precision/test_fp16.py b/tests/unit/runtime/half_precision/test_fp16.py index b8b3e3d39..5699352e6 100644 --- a/tests/unit/runtime/half_precision/test_fp16.py +++ b/tests/unit/runtime/half_precision/test_fp16.py @@ -16,8 +16,7 @@ try: _amp_available = True except ImportError: _amp_available = False -amp_available = pytest.mark.skipif(not _amp_available, - reason="apex/amp is not installed") +amp_available = pytest.mark.skipif(not _amp_available, reason="apex/amp is not installed") class TestLambFP32GradClip(DistributedTest): @@ -38,9 +37,7 @@ class TestLambFP32GradClip(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -73,13 +70,8 @@ class TestLambFP16(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -103,13 +95,8 @@ class TestLambFP16(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim, empty_grad=True) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -137,9 +124,7 @@ class TestAdamFP32EmptyGrad(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim, empty_grad=True) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -155,24 +140,13 @@ class 
TestAdamwFP16Basic(DistributedTest): world_size = 1 def test(self): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } + config_dict = {"train_batch_size": 1, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 10 model = SimpleModel(hidden_dim) optimizer = torch.optim.AdamW(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=optimizer) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -186,20 +160,12 @@ class TestFP16OptimizerForMoE(DistributedTest): if not required_torch_version(): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } + config_dict = {"train_batch_size": 2, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 10 def mock_unscale_and_clip_grads(total_norm, apply_scale=True): torch_norm_tensor = get_accelerator().FloatTensor([total_norm]) - all_gather_results = [ - torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) - ] + all_gather_results = [torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size())] dist.all_gather(all_gather_results, torch_norm_tensor) assert len(set([x.item() for x in all_gather_results])) == 1 return 1.0 @@ -208,16 +174,11 @@ class TestFP16OptimizerForMoE(DistributedTest): model = SimpleMoEModel(hidden_dim, ep_size=2) optimizer = torch.optim.AdamW(params=model.parameters()) engine, optimizer, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=optimizer, - dist_init_required=False) - monkeypatch.setattr(optimizer, - 'unscale_and_clip_grads', - mock_unscale_and_clip_grads) - data_loader = sequence_dataloader(model=engine, - total_samples=50, - hidden_dim=hidden_dim, - device=engine.device) + model=model, + optimizer=optimizer, + dist_init_required=False) + monkeypatch.setattr(optimizer, 'unscale_and_clip_grads', mock_unscale_and_clip_grads) + data_loader = sequence_dataloader(model=engine, total_samples=50, hidden_dim=hidden_dim, device=engine.device) for n, batch in enumerate(data_loader): loss = engine(batch[0], batch[1]) engine.backward(loss) @@ -227,20 +188,12 @@ class TestFP16OptimizerForMoE(DistributedTest): if not required_torch_version(): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } + config_dict = {"train_batch_size": 2, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 10 def mock_unscale_and_clip_grads(grads_groups_flat, total_norm, apply_scale=True): torch_norm_tensor = get_accelerator().FloatTensor([total_norm]) - all_gather_results = [ - torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) - ] + all_gather_results = [torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size())] dist.all_gather(all_gather_results, torch_norm_tensor) assert len(set([x.item() for x in all_gather_results])) == 1 return 1.0 @@ -250,16 +203,11 @@ class TestFP16OptimizerForMoE(DistributedTest): # optimizer 
= torch.optim.AdamW(params=model.parameters()) optimizer = FusedAdam(params=model.parameters()) engine, optimizer, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=optimizer, - dist_init_required=False) - monkeypatch.setattr(optimizer, - 'unscale_and_clip_grads', - mock_unscale_and_clip_grads) - data_loader = sequence_dataloader(model=engine, - total_samples=50, - hidden_dim=hidden_dim, - device=engine.device) + model=model, + optimizer=optimizer, + dist_init_required=False) + monkeypatch.setattr(optimizer, 'unscale_and_clip_grads', mock_unscale_and_clip_grads) + data_loader = sequence_dataloader(model=engine, total_samples=50, hidden_dim=hidden_dim, device=engine.device) for n, batch in enumerate(data_loader): loss = engine(batch[0], batch[1]) engine.backward(loss) @@ -287,9 +235,7 @@ class TestFP16OptimizerForMoE(DistributedTest): def mock_unscale_and_clip_grads(total_norm, apply_scale=True): torch_norm_tensor = get_accelerator().FloatTensor([total_norm]) - all_gather_results = [ - torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) - ] + all_gather_results = [torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size())] dist.all_gather(all_gather_results, torch_norm_tensor) assert len(set([x.item() for x in all_gather_results])) == 1 return 1.0 @@ -297,17 +243,12 @@ class TestFP16OptimizerForMoE(DistributedTest): # initialize MoE model = SimpleMoEModel(hidden_dim, ep_size=2) engine, optimizer, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters(), - dist_init_required=False) - monkeypatch.setattr(optimizer, - 'unscale_and_clip_grads', - mock_unscale_and_clip_grads) + model=model, + model_parameters=model.parameters(), + dist_init_required=False) + monkeypatch.setattr(optimizer, 'unscale_and_clip_grads', mock_unscale_and_clip_grads) optimizer.fused_lamb_legacy = fused_lamb_legacy - data_loader = sequence_dataloader(model=engine, - total_samples=50, - hidden_dim=hidden_dim, - device=engine.device) + data_loader = sequence_dataloader(model=engine, total_samples=50, hidden_dim=hidden_dim, device=engine.device) for n, batch in enumerate(data_loader): loss = engine(batch[0], batch[1]) engine.backward(loss) @@ -318,24 +259,13 @@ class TestAdamwFP16EmptyGrad(DistributedTest): world_size = 1 def test(self): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } + config_dict = {"train_batch_size": 1, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 10 model = SimpleModel(hidden_dim) optimizer = torch.optim.AdamW(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=optimizer) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -385,13 +315,8 @@ class TestAdamFP16ZeroOneCycleCompatibility(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _,_ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = 
deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -428,19 +353,14 @@ class TestZeroStaticScale(DistributedTest): } model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) # Ensure the static scaler is configured. assert optim.dynamic_loss_scale == False assert optim.loss_scaler.loss_scale == 138. # Now make sure things work.. - data_loader = random_dataloader(model=model, - total_samples=10, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=10, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -517,15 +437,10 @@ class TestZeroEmptyPartition(DistributedTest): # Ensure model has 2 parameters, to cause empty partition with DP=3 assert len(list(model.parameters())) == 2 - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) # Now make sure things work.. - data_loader = random_dataloader(model=model, - total_samples=1, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=1, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -537,24 +452,13 @@ class TestAmp(DistributedTest): world_size = 2 def test_adam_basic(self): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "amp": { - "enabled": True - } - } + config_dict = {"train_batch_size": 2, "steps_per_print": 1, "amp": {"enabled": True}} hidden_dim = 10 model = SimpleModel(hidden_dim) optimizer = torch.optim.Adam(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=optimizer) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -578,13 +482,8 @@ class TestAmp(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -609,13 +508,8 @@ class TestAmp(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = 
deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -640,13 +534,8 @@ class TestAmp(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -673,9 +562,7 @@ class TestZeroSupportedClientOptimizer(DistributedTest): model = SimpleModel(hidden_dim) client_optimizer = optimizer_constructor(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=client_optimizer) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=client_optimizer) class TestZero2ReduceScatterOff(DistributedTest): @@ -707,13 +594,8 @@ class TestZero2ReduceScatterOff(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -744,14 +626,9 @@ class TestFP16AdamTypes(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=10, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=10, hidden_dim=hidden_dim, device=model.device) for _, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -783,14 +660,9 @@ class TestZero3LazyScatter(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=10, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=10, hidden_dim=hidden_dim, device=model.device) for _, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -817,13 +689,8 @@ class 
TestZeroEmptyGrad(DistributedTest): model = SimpleModel(hidden_dim) optimizer = torch.optim.Adam(model.parameters()) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=optimizer) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) diff --git a/tests/unit/runtime/pipe/test_pipe.py b/tests/unit/runtime/pipe/test_pipe.py index f6f2e6b4f..974eb2694 100644 --- a/tests/unit/runtime/pipe/test_pipe.py +++ b/tests/unit/runtime/pipe/test_pipe.py @@ -18,21 +18,20 @@ def rel_diff(A, B): return abs(A - B) / abs(A) -@pytest.mark.parametrize('topo_config', - [ - { - "num_pp": 1, - "num_dp": 4 - }, - { - "num_pp": 2, - "num_dp": 2 - }, - { - "num_pp": 4, - "num_dp": 1 - }, - ]) +@pytest.mark.parametrize('topo_config', [ + { + "num_pp": 1, + "num_dp": 4 + }, + { + "num_pp": 2, + "num_dp": 2 + }, + { + "num_pp": 4, + "num_dp": 1 + }, +]) class TestPipeCifar10(DistributedTest): world_size = 4 @@ -47,8 +46,7 @@ class TestPipeCifar10(DistributedTest): "type": "Adam", "params": { "lr": 0.001, - "betas": [0.9, - 0.999], + "betas": [0.9, 0.999], "eps": 1e-8, "weight_decay": 3e-7 } @@ -72,38 +70,22 @@ class TestPipeCifar10(DistributedTest): init_net = AlexNetPipe() base_net = copy.deepcopy(init_net) - base_model = PipelineModule(layers=base_net.to_layers(), - num_stages=1, - loss_fn=nn.CrossEntropyLoss()) + base_model = PipelineModule(layers=base_net.to_layers(), num_stages=1, loss_fn=nn.CrossEntropyLoss()) # Train with just data parallelism - base_losses = train_cifar(base_model, - config=config_dict, - num_steps=steps, - fp16=config_dict['fp16']['enabled']) + base_losses = train_cifar(base_model, config=config_dict, num_steps=steps, fp16=config_dict['fp16']['enabled']) test_net = copy.deepcopy(init_net) - test_model = PipelineModule(layers=test_net.to_layers(), - topology=topo, - loss_fn=nn.CrossEntropyLoss()) + test_model = PipelineModule(layers=test_net.to_layers(), topology=topo, loss_fn=nn.CrossEntropyLoss()) - test_losses = train_cifar(test_model, - config=config_dict, - num_steps=steps, - fp16=config_dict['fp16']['enabled']) + test_losses = train_cifar(test_model, config=config_dict, num_steps=steps, fp16=config_dict['fp16']['enabled']) abs_diffs = [l0 - l1 for l0, l1 in zip(base_losses, test_losses)] rel_diffs = [rel_diff(l0, l1) for l0, l1 in zip(base_losses, test_losses)] if dist.get_rank() == 0: - print( - f'abs min={min(abs_diffs)} max={max(abs_diffs)} avg={sum(abs_diffs)/len(abs_diffs)}' - ) - print( - f'rel min={min(rel_diffs)} max={max(rel_diffs)} avg={sum(rel_diffs)/len(rel_diffs)}' - ) - print( - f'first: base={base_losses[0]} test={test_losses[0]} abs={abs_diffs[0]} rel={rel_diffs[0]}' - ) + print(f'abs min={min(abs_diffs)} max={max(abs_diffs)} avg={sum(abs_diffs)/len(abs_diffs)}') + print(f'rel min={min(rel_diffs)} max={max(rel_diffs)} avg={sum(rel_diffs)/len(rel_diffs)}') + print(f'first: base={base_losses[0]} test={test_losses[0]} abs={abs_diffs[0]} rel={rel_diffs[0]}') for lastX in [1, 10, 100]: base_avg = sum(base_losses[-lastX:]) / lastX @@ -117,6 +99,4 @@ class TestPipeCifar10(DistributedTest): base_avg = sum(base) / len(base) test = test_losses[-lastX:] test_avg = sum(test) / len(test) - assert 
rel_diff( - base_avg, - test_avg) < 0.05 # Originally 0.03, but seeing instability with AMD results + assert rel_diff(base_avg, test_avg) < 0.05 # Originally 0.03, but seeing instability with AMD results diff --git a/tests/unit/runtime/pipe/test_pipe_schedule.py b/tests/unit/runtime/pipe/test_pipe_schedule.py index 5ca3dfe1d..5ea324362 100644 --- a/tests/unit/runtime/pipe/test_pipe_schedule.py +++ b/tests/unit/runtime/pipe/test_pipe_schedule.py @@ -38,9 +38,7 @@ def test_pipe_train_schedule_singlestage(): @pytest.mark.parametrize('micro_batches', [1, 3, 8, 10]) def test_pipe_inference_schedule_firststage(micro_batches, stages=3): - sched = schedule.InferenceSchedule(micro_batches=micro_batches, - stages=stages, - stage_id=0) + sched = schedule.InferenceSchedule(micro_batches=micro_batches, stages=stages, stage_id=0) assert sched.num_micro_batches == micro_batches full = list(iter(sched)) for idx, cmds in enumerate(full): @@ -73,9 +71,7 @@ def test_pipe_inference_schedule_firststage(micro_batches, stages=3): @pytest.mark.parametrize('micro_batches', [1, 3, 8, 10]) def test_pipe_inference_schedule_midstage(micro_batches, stages=3): - sched = schedule.InferenceSchedule(micro_batches=micro_batches, - stages=stages, - stage_id=1) + sched = schedule.InferenceSchedule(micro_batches=micro_batches, stages=stages, stage_id=1) full = list(iter(sched)) for idx, cmds in enumerate(full): @@ -99,9 +95,7 @@ def test_pipe_inference_schedule_midstage(micro_batches, stages=3): @pytest.mark.parametrize('micro_batches', [1, 3, 8, 10]) def test_pipe_inference_schedule_laststage(micro_batches, stages=3): - sched = schedule.InferenceSchedule(micro_batches=micro_batches, - stages=stages, - stage_id=2) + sched = schedule.InferenceSchedule(micro_batches=micro_batches, stages=stages, stage_id=2) full = list(iter(sched)) for idx, cmds in enumerate(full): if idx < sched.stage or idx > sched.stage + sched.num_micro_batches: diff --git a/tests/unit/runtime/pipe/test_topology.py b/tests/unit/runtime/pipe/test_topology.py index 4b0cc42d4..3a7e297ca 100644 --- a/tests/unit/runtime/pipe/test_topology.py +++ b/tests/unit/runtime/pipe/test_topology.py @@ -52,9 +52,7 @@ def test_topology_rank_repr(): assert topo.get_rank_repr(rank=3) == 'a_01-b_01' assert topo.get_rank_repr(rank=3, inner_sep='+') == 'a+01-b+01' - assert topo.get_rank_repr(rank=3, - inner_sep='🤗', - outer_sep='_JEFF_') == 'a🤗01_JEFF_b🤗01' + assert topo.get_rank_repr(rank=3, inner_sep='🤗', outer_sep='_JEFF_') == 'a🤗01_JEFF_b🤗01' topo = Topo(axes=['pipe', 'data'], dims=[2, 2]) assert topo.get_rank_repr(rank=0) == '' @@ -132,26 +130,26 @@ def test_topology_comm_list(): assert topo.get_rank(pipe=1, data=1, model=1) == 7 pipe_list = [ - [0, 4], # data=0, model=0 - [1, 5], # data=0, model=1 - [2, 6], # data=1, model=0 - [3, 7], # data=1, model=1 + [0, 4], # data=0, model=0 + [1, 5], # data=0, model=1 + [2, 6], # data=1, model=0 + [3, 7], # data=1, model=1 ] assert topo.get_axis_comm_lists('pipe') == pipe_list data_list = [ - [0, 2], # pipe=0, model=0 - [1, 3], # pipe=0, model=1 - [4, 6], # pipe=1, model=0 - [5, 7], # pipe=1, model=1 + [0, 2], # pipe=0, model=0 + [1, 3], # pipe=0, model=1 + [4, 6], # pipe=1, model=0 + [5, 7], # pipe=1, model=1 ] assert topo.get_axis_comm_lists('data') == data_list model_list = [ - [0, 1], # pipe=0, data=0 - [2, 3], # pipe=0, data=1 - [4, 5], # pipe=1, data=0 - [6, 7], # pipe=1, data=1 + [0, 1], # pipe=0, data=0 + [2, 3], # pipe=0, data=1 + [4, 5], # pipe=1, data=0 + [6, 7], # pipe=1, data=1 ] assert topo.get_axis_comm_lists('model') == 
model_list @@ -172,8 +170,7 @@ class TestDistributedTopology(DistributedTest): rank = dist.get_rank() assert grid.is_first_stage == (grid.get_stage_id() == 0) - assert grid.is_last_stage == ( - grid.get_stage_id() == grid.get_pipe_parallel_world_size() - 1) + assert grid.is_last_stage == (grid.get_stage_id() == grid.get_pipe_parallel_world_size() - 1) # Test collectives along the pipeline parallel process groups rank_tensor = torch.LongTensor(data=[rank]).to(get_accelerator().device_name()) @@ -209,6 +206,7 @@ class TestDistributedTopology(DistributedTest): def test_primes(): """ Test prime factorizations. """ + def _product(ps): p = 1 for num in ps: diff --git a/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py b/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py index bf0d04c56..2f24efa80 100644 --- a/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py +++ b/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py @@ -7,6 +7,7 @@ from unit.util import skip_on_arch class Model(torch.nn.Module): + def __init__(self): super().__init__() self.emb = torch.nn.EmbeddingBag(10, 3, mode="sum", sparse=True) @@ -17,6 +18,7 @@ class Model(torch.nn.Module): class Adam(torch.optim.Optimizer): + def __init__(self, dense_params, sparse_params): super().__init__(dense_params + sparse_params, defaults={}) self.adam = torch.optim.Adam(dense_params) @@ -52,16 +54,10 @@ class TestSparseAdam(DistributedTest): def test(self): skip_on_arch(min_arch=7) - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "sparse_gradients": True - } + config_dict = {"train_batch_size": 2, "steps_per_print": 1, "sparse_gradients": True} model, optimizer = get_model_optimizer() loss = torch.nn.BCEWithLogitsLoss() - engine, _, _, _ = deepspeed.initialize(model=model, - optimizer=optimizer, - config=config_dict) + engine, _, _, _ = deepspeed.initialize(model=model, optimizer=optimizer, config=config_dict) x, offsets, y = get_data(engine.device) diff --git a/tests/unit/runtime/sparse_tensor/test_sparse_grads.py b/tests/unit/runtime/sparse_tensor/test_sparse_grads.py index ba9a6b028..0da7aad7a 100644 --- a/tests/unit/runtime/sparse_tensor/test_sparse_grads.py +++ b/tests/unit/runtime/sparse_tensor/test_sparse_grads.py @@ -8,6 +8,7 @@ import deepspeed.utils.groups as groups class Model(torch.nn.Module): + def __init__(self): super().__init__() self.emb = torch.nn.EmbeddingBag(10, 3, mode="sum", sparse=True) @@ -18,6 +19,7 @@ class Model(torch.nn.Module): class Adam(torch.optim.Optimizer): + def __init__(self, dense_params, sparse_params): super().__init__(dense_params + sparse_params, defaults={}) self.adam = torch.optim.Adam(dense_params) @@ -37,38 +39,19 @@ class TestSparseAdam(DistributedTest): world_size = 2 def test(self): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "sparse_gradients": True - } + config_dict = {"train_batch_size": 2, "steps_per_print": 1, "sparse_gradients": True} model = Model() optimizer = Adam(list(model.linear.parameters()), list(model.emb.parameters())) - engine, _, _, _ = deepspeed.initialize(model=model, - optimizer=optimizer, - config=config_dict) + engine, _, _, _ = deepspeed.initialize(model=model, optimizer=optimizer, config=config_dict) loss = torch.nn.BCEWithLogitsLoss() - x = torch.tensor([1, - 2, - 4, - 5, - 4, - 3, - 2, - 9], - dtype=torch.long, - device=engine.device) + x = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long, device=engine.device) offsets = torch.tensor([0, 4], 
dtype=torch.long, device=engine.device) y = torch.tensor([[1.0], [0.0]], device=engine.device) res = engine(x, offsets) engine.backward(loss(res, y)) engine.step() - results = [ - engine.all_gather_scalar(i, - groups._get_data_parallel_group()) - for i in model.emb.parameters() - ] + results = [engine.all_gather_scalar(i, groups._get_data_parallel_group()) for i in model.emb.parameters()] for res in results: assert torch.allclose(res[0], res[1]) diff --git a/tests/unit/runtime/test_autocast.py b/tests/unit/runtime/test_autocast.py index b0d8d8696..55ca20faa 100644 --- a/tests/unit/runtime/test_autocast.py +++ b/tests/unit/runtime/test_autocast.py @@ -9,18 +9,15 @@ from unit.common import DistributedTest @pytest.mark.parametrize('half_op', [False, True]) class TestAutoCastDisable(DistributedTest): + def test_missing_amp_autocast(self, half_op): hidden_dim = 4 if half_op: input = torch.randn(hidden_dim).to(get_accelerator().device_name()).half() - ds_linear = LinearModuleForZeroStage3( - hidden_dim, - hidden_dim).to(get_accelerator().device_name()).half() + ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).to(get_accelerator().device_name()).half() else: input = torch.randn(hidden_dim).to(get_accelerator().device_name()) - ds_linear = LinearModuleForZeroStage3(hidden_dim, - hidden_dim).to( - get_accelerator().device_name()) + ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).to(get_accelerator().device_name()) output = ds_linear(input) assert output.dtype == ds_linear.weight.dtype @@ -31,14 +28,10 @@ class TestAutoCastDisable(DistributedTest): hidden_dim = 4 if half_op: input = torch.randn(hidden_dim).to(get_accelerator().device_name()).half() - ds_linear = LinearModuleForZeroStage3( - hidden_dim, - hidden_dim).to(get_accelerator().device_name()).half() + ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).to(get_accelerator().device_name()).half() else: input = torch.randn(hidden_dim).to(get_accelerator().device_name()) - ds_linear = LinearModuleForZeroStage3(hidden_dim, - hidden_dim).to( - get_accelerator().device_name()) + ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).to(get_accelerator().device_name()) with amp.autocast(False): output = ds_linear(input) @@ -46,24 +39,15 @@ class TestAutoCastDisable(DistributedTest): @pytest.mark.skipif(get_accelerator().amp() is None, reason='amp is not installed') -@pytest.mark.parametrize('half_input, half_weight', - [(False, - False), - (False, - True), - (True, - False), - (True, - True)]) +@pytest.mark.parametrize('half_input, half_weight', [(False, False), (False, True), (True, False), (True, True)]) class TestAutoCastEnable(DistributedTest): + def test_autocast_linear(self, tmpdir, half_input, half_weight): amp = get_accelerator().amp() hidden_dim = 4 input = torch.randn(hidden_dim).to(get_accelerator().device_name()) - ds_linear = LinearModuleForZeroStage3(hidden_dim, - hidden_dim).to( - get_accelerator().device_name()) + ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).to(get_accelerator().device_name()) if half_input: input = input.half() diff --git a/tests/unit/runtime/test_data.py b/tests/unit/runtime/test_data.py index ed2fee950..93d0d7fe4 100644 --- a/tests/unit/runtime/test_data.py +++ b/tests/unit/runtime/test_data.py @@ -19,24 +19,12 @@ def test_repeating_loader(): assert next(loader) == 3 -@pytest.mark.parametrize('train_batch_size, drop_last', - [(1, - True), - (4, - True), - (1, - False), - (4, - False)]) +@pytest.mark.parametrize('train_batch_size, drop_last', [(1, True), 
(4, True), (1, False), (4, False)]) class TestDataLoaderDropLast(DistributedTest): world_size = 1 def test(self, train_batch_size, drop_last): - config_dict = { - "train_batch_size": train_batch_size, - "dataloader_drop_last": drop_last, - "steps_per_print": 1 - } + config_dict = {"train_batch_size": train_batch_size, "dataloader_drop_last": drop_last, "steps_per_print": 1} hidden_dim = 10 model = SimpleModel(hidden_dim) diff --git a/tests/unit/runtime/test_data_efficiency.py b/tests/unit/runtime/test_data_efficiency.py index 993e4aa66..e6c99fd40 100644 --- a/tests/unit/runtime/test_data_efficiency.py +++ b/tests/unit/runtime/test_data_efficiency.py @@ -9,6 +9,7 @@ from unit.simple_model import Curriculum_SimpleModel, SimpleModel, random_datalo class MPU(): + def __init__(self, tp_world_size): self.rank = deepspeed.comm.get_rank() self.world_size = deepspeed.comm.get_world_size() @@ -103,10 +104,10 @@ class TestDataEfficiency(DistributedTest): model = SimpleModel(hidden_dim) dataset = random_dataset(20, hidden_dim, torch.device('cpu'), dtype=torch.half) model, _, data_loader, _ = deepspeed.initialize(config=config_dict, - model=model, - training_data=dataset, - model_parameters=model.parameters(), - mpu=MPU(1)) + model=model, + training_data=dataset, + model_parameters=model.parameters(), + mpu=MPU(1)) if model.mpu.get_data_parallel_rank() == 0 and not os.path.exists('/tmp'): os.makedirs('/tmp') model.set_data_post_process_func(data_post_process) @@ -147,15 +148,8 @@ class TestLegacyCurriculumScheduler(DistributedTest): "max_difficulty": 5, "schedule_type": "fixed_discrete", "schedule_config": { - "difficulty": [1, - 2, - 3, - 4, - 5], - "max_step": [2, - 4, - 6, - 8] + "difficulty": [1, 2, 3, 4, 5], + "max_step": [2, 4, 6, 8] } } } @@ -163,13 +157,8 @@ class TestLegacyCurriculumScheduler(DistributedTest): ground_truths = {1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 3, 7: 4, 8: 4} model = Curriculum_SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=20, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss, seqlen = model(batch[0], batch[1]) model.backward(loss) @@ -212,13 +201,8 @@ class TestLegacyCurriculumScheduler(DistributedTest): ground_truths = {1: 2, 2: 4, 3: 4, 4: 6, 5: 6, 6: 8, 7: 8, 8: 10, 9: 10, 10: 10} model = Curriculum_SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=20, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss, seqlen = model(batch[0], batch[1]) model.backward(loss) diff --git a/tests/unit/runtime/test_ds_config_dict.py b/tests/unit/runtime/test_ds_config_dict.py index 54c91a6fc..12e502167 100644 --- a/tests/unit/runtime/test_ds_config_dict.py +++ b/tests/unit/runtime/test_ds_config_dict.py @@ -93,10 +93,7 @@ class TestBatchConfig(DistributedTest): ds_config = 
DeepSpeedConfig(ds_batch_config) #test cases when all parameters are provided - status = _run_batch_config(ds_config, - train_batch=batch, - micro_batch=micro_batch, - gas=gas) + status = _run_batch_config(ds_config, train_batch=batch, micro_batch=micro_batch, gas=gas) _batch_assert(status, ds_config, batch, micro_batch, gas, success) #test cases when two out of three parameters are provided @@ -139,10 +136,7 @@ def test_temp_config_json(tmpdir): @pytest.mark.parametrize("gather_weights_key", - [ - "stage3_gather_16bit_weights_on_model_save", - "stage3_gather_fp16_weights_on_model_save" - ]) + ["stage3_gather_16bit_weights_on_model_save", "stage3_gather_fp16_weights_on_model_save"]) def test_gather_16bit_params_on_model_save(gather_weights_key): config_dict = { gather_weights_key: True, @@ -168,9 +162,7 @@ class TestConfigLoad(DistributedTest): def test_dict(self, base_config): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=base_config, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=base_config, model=model, model_parameters=model.parameters()) def test_json(self, base_config, tmpdir): config_path = os.path.join(tmpdir, "config.json") @@ -178,9 +170,7 @@ class TestConfigLoad(DistributedTest): json.dump(base_config, fp) hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_path, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_path, model=model, model_parameters=model.parameters()) def test_hjson(self, base_config, tmpdir): config_path = os.path.join(tmpdir, "config.json") @@ -188,9 +178,7 @@ class TestConfigLoad(DistributedTest): hjson.dump(base_config, fp) hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_path, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_path, model=model, model_parameters=model.parameters()) class TestDeprecatedDeepScaleConfig(DistributedTest): @@ -206,13 +194,8 @@ class TestDeprecatedDeepScaleConfig(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=5, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(args=args, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -226,14 +209,11 @@ class TestDistInit(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _,_ = deepspeed.initialize(config=base_config, - model=model, - model_parameters=model.parameters(), - dist_init_required=True) - data_loader = random_dataloader(model=model, - total_samples=5, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=base_config, + model=model, + model_parameters=model.parameters(), + dist_init_required=True) + data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -250,10 +230,7 @@ class TestInitNoOptimizer(DistributedTest): model = 
SimpleModel(hidden_dim=hidden_dim) model, _, _, _ = deepspeed.initialize(config=base_config, model=model) - data_loader = random_dataloader(model=model, - total_samples=5, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) with pytest.raises(AssertionError): @@ -268,20 +245,14 @@ class TestArgs(DistributedTest): def test_none_args(self, base_config): model = SimpleModel(hidden_dim=10) model, _, _, _ = deepspeed.initialize(args=None, model=model, config=base_config) - data_loader = random_dataloader(model=model, - total_samples=5, - hidden_dim=10, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=10, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) def test_no_args(self, base_config): model = SimpleModel(hidden_dim=10) model, _, _, _ = deepspeed.initialize(model=model, config=base_config) - data_loader = random_dataloader(model=model, - total_samples=5, - hidden_dim=10, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=10, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) diff --git a/tests/unit/runtime/test_ds_config_model.py b/tests/unit/runtime/test_ds_config_model.py index 24343a999..0fbfe5e77 100644 --- a/tests/unit/runtime/test_ds_config_model.py +++ b/tests/unit/runtime/test_ds_config_model.py @@ -11,10 +11,7 @@ from deepspeed.runtime.config_utils import DeepSpeedConfigModel class SimpleConf(DeepSpeedConfigModel): param_1: int = 0 - param_2_old: str = Field(None, - deprecated=True, - new_param="param_2", - new_param_fn=(lambda x: [x])) + param_2_old: str = Field(None, deprecated=True, new_param="param_2", new_param_fn=(lambda x: [x])) param_2: List[str] = None param_3: int = Field(0, alias="param_3_alias") @@ -68,16 +65,7 @@ def test_config_base_aliasfield(): assert config.param_3 == 10 -@pytest.mark.parametrize("config_dict", - [{ - "param_1": "DS" - }, - { - "param_2": "DS" - }, - { - "param_1_typo": 0 - }]) +@pytest.mark.parametrize("config_dict", [{"param_1": "DS"}, {"param_2": "DS"}, {"param_1_typo": 0}]) def test_config_base_literalfail(config_dict): with pytest.raises(ValidationError): config = SimpleConf(**config_dict) diff --git a/tests/unit/runtime/test_ds_initialize.py b/tests/unit/runtime/test_ds_initialize.py index c7eeef863..134f53267 100644 --- a/tests/unit/runtime/test_ds_initialize.py +++ b/tests/unit/runtime/test_ds_initialize.py @@ -61,6 +61,7 @@ class TestClientOptimizer(DistributedTest): world_size = 1 def test(self, optimizer_type): + def _optimizer_callable(params) -> Optimizer: return AdamW(params=params) @@ -77,9 +78,9 @@ class TestClientOptimizer(DistributedTest): client_optimizer = _optimizer_callable _, ds_optimizer, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=list(model.parameters()), - optimizer=client_optimizer) + model=model, + model_parameters=list(model.parameters()), + optimizer=client_optimizer) if client_optimizer is None: assert isinstance(ds_optimizer, FusedAdam) elif isinstance(client_optimizer, Optimizer): @@ -93,15 +94,7 @@ class TestConfigOptimizer(DistributedTest): world_size = 1 def test(self, client_parameters): - ds_config = { - "train_batch_size": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.001 - } - } - } + ds_config = 
{"train_batch_size": 1, "optimizer": {"type": "Adam", "params": {"lr": 0.001}}} hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -111,9 +104,7 @@ class TestConfigOptimizer(DistributedTest): else: model_parameters = None - _, ds_optimizer, _, _ = deepspeed.initialize(config=ds_config, - model=model, - model_parameters=model_parameters) + _, ds_optimizer, _, _ = deepspeed.initialize(config=ds_config, model=model, model_parameters=model_parameters) assert isinstance(ds_optimizer, FusedAdam) @@ -205,14 +196,14 @@ class TestOptimizerImplementation(DistributedTest): if key in is_supported: _, ds_optimizer, _, _ = deepspeed.initialize(config=ds_config, - model=model, - model_parameters=model_parameters) + model=model, + model_parameters=model_parameters) assert True else: with pytest.raises(NotImplementedError): _, ds_optimizer, _, _ = deepspeed.initialize(config=ds_config, - model=model, - model_parameters=model_parameters) + model=model, + model_parameters=model_parameters) @pytest.mark.parametrize("scheduler_type", [None, _LRScheduler, Callable]) @@ -221,6 +212,7 @@ class TestClientLrScheduler(DistributedTest): world_size = 1 def test(self, scheduler_type, optimizer_type): + def _my_lambda(epoch): return epoch // 10 @@ -252,14 +244,11 @@ class TestClientLrScheduler(DistributedTest): client_scheduler = LambdaLR(client_optimizer, _my_lambda) else: # Verify invalid combination is correctly handled - client_scheduler = LambdaLR(torch.optim.Adam(model.parameters()), - _my_lambda) + client_scheduler = LambdaLR(torch.optim.Adam(model.parameters()), _my_lambda) else: client_scheduler = _lr_scheduler_callable - if isinstance(client_scheduler, - _LRScheduler) and not isinstance(client_optimizer, - Optimizer): + if isinstance(client_scheduler, _LRScheduler) and not isinstance(client_optimizer, Optimizer): with pytest.raises(AssertionError): _, _, _, _ = deepspeed.initialize(config=config_dict, model=model, diff --git a/tests/unit/runtime/test_lr_schedulers.py b/tests/unit/runtime/test_lr_schedulers.py index 7afcad542..e7e200a68 100644 --- a/tests/unit/runtime/test_lr_schedulers.py +++ b/tests/unit/runtime/test_lr_schedulers.py @@ -29,21 +29,14 @@ def _verify_staircase_increase(values, step_size): assert all([values[i] == v for v in values[i:j]]) -@pytest.mark.parametrize("scheduler_type,params", - [(WARMUP_LR, - {}), - (WARMUP_DECAY_LR, - { - WARMUP_NUM_STEPS: 10, - TOTAL_NUM_STEPS: 20 - }), - (ONE_CYCLE, - { - CYCLE_MIN_LR: 0, - CYCLE_MAX_LR: 0.1 - }), - (LR_RANGE_TEST, - {})]) +@pytest.mark.parametrize("scheduler_type,params", [(WARMUP_LR, {}), + (WARMUP_DECAY_LR, { + WARMUP_NUM_STEPS: 10, + TOTAL_NUM_STEPS: 20 + }), (ONE_CYCLE, { + CYCLE_MIN_LR: 0, + CYCLE_MAX_LR: 0.1 + }), (LR_RANGE_TEST, {})]) class TestGetLrBeforeTrain(DistributedTest): world_size = 1 @@ -198,26 +191,21 @@ class TestLrSchedule(DistributedTest): previous_lr = lr -@pytest.mark.parametrize("scheduler_type,params", - [(WARMUP_LR, - {}), - (WARMUP_DECAY_LR, - { - WARMUP_NUM_STEPS: 5, - TOTAL_NUM_STEPS: 10 - }), - (ONE_CYCLE, - { - CYCLE_MIN_LR: 0, - CYCLE_MAX_LR: 0.1, - CYCLE_FIRST_STEP_SIZE: 5, - DECAY_STEP_SIZE: 5 - }), - (LR_RANGE_TEST, - { - LR_RANGE_TEST_MIN_LR: 1e-4, - LR_RANGE_TEST_STEP_SIZE: 1 - })]) +@pytest.mark.parametrize("scheduler_type,params", [(WARMUP_LR, {}), + (WARMUP_DECAY_LR, { + WARMUP_NUM_STEPS: 5, + TOTAL_NUM_STEPS: 10 + }), + (ONE_CYCLE, { + CYCLE_MIN_LR: 0, + CYCLE_MAX_LR: 0.1, + CYCLE_FIRST_STEP_SIZE: 5, + DECAY_STEP_SIZE: 5 + }), + (LR_RANGE_TEST, { + LR_RANGE_TEST_MIN_LR: 1e-4, + LR_RANGE_TEST_STEP_SIZE: 1 
+ })]) class TestSchedulerOptimizerParity(DistributedTest): world_size = 1 @@ -294,8 +282,7 @@ class TestLrRange(DistributedTest): model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, - total_samples=max(50, - step_size * 2), + total_samples=max(50, step_size * 2), hidden_dim=hidden_dim, device=model.device, dtype=torch.float) @@ -358,8 +345,7 @@ class TestOneCycle(DistributedTest): model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, - total_samples=max(50, - cycle_step_size * 3), + total_samples=max(50, cycle_step_size * 3), hidden_dim=hidden_dim, device=model.device, dtype=torch.float) @@ -425,8 +411,7 @@ class TestOneCycle(DistributedTest): model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, - total_samples=max(50, - step_size * 3), + total_samples=max(50, step_size * 3), hidden_dim=hidden_dim, device=model.device, dtype=torch.float) diff --git a/tests/unit/runtime/test_multi_output_model.py b/tests/unit/runtime/test_multi_output_model.py index 0a802373a..b147ff6e4 100644 --- a/tests/unit/runtime/test_multi_output_model.py +++ b/tests/unit/runtime/test_multi_output_model.py @@ -34,18 +34,14 @@ class TestTwoOutputModel(DistributedTest): weight_value = 0.1 model = MultiOutputModel(hidden_dim, weight_value) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) total_samples = 4 data_loader = multi_output_dataloader(model=model, total_samples=total_samples, hidden_dim=hidden_dim, device=model.device, - inputs=[1.0, - 2.0], - targets=[1, - 2]) + inputs=[1.0, 2.0], + targets=[1, 2]) for n, batch in enumerate(data_loader): assert len(batch) % 2 == 0, \ f"multi_output_dataloader failed to return even number of data samples (input+target)" @@ -54,9 +50,7 @@ class TestTwoOutputModel(DistributedTest): inputs, targets = batch[:midpoint], batch[midpoint:] loss_tuple = model(inputs, targets) - expected_loss = torch.tensor(2.302734375, - dtype=torch.half, - device=model.device) + expected_loss = torch.tensor(2.302734375, dtype=torch.half, device=model.device) for loss in loss_tuple: assert loss.shape == torch.Size([]) assert loss.item() == approx(expected_loss.item()) @@ -96,21 +90,15 @@ class TestThreeOutputModel(DistributedTest): weight_value = 0.1 model = MultiOutputModel(hidden_dim, weight_value) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) total_samples = grad_accumulation_steps * micro_batch_size * 2 data_loader = multi_output_dataloader(model=model, total_samples=total_samples, hidden_dim=hidden_dim, device=model.device, - inputs=[1.0, - 2.0, - 3.0], - targets=[1, - 2, - 3]) + inputs=[1.0, 2.0, 3.0], + targets=[1, 2, 3]) for n, batch in enumerate(data_loader): assert len(batch) % 2 == 0, \ f"multi_output_dataloader failed to return even number of data samples (input+target)" @@ -120,9 +108,7 @@ class TestThreeOutputModel(DistributedTest): loss_tuple = model(inputs, targets) assert len(loss_tuple) == 3 - expected_loss = torch.tensor(2.302734375, - dtype=torch.half, - device=model.device) + expected_loss = torch.tensor(2.302734375, dtype=torch.half, device=model.device) for loss in loss_tuple: assert loss.shape == torch.Size([]) 
diff --git a/tests/unit/runtime/test_pld.py b/tests/unit/runtime/test_pld.py index 8b8ed2365..300d966a0 100644 --- a/tests/unit/runtime/test_pld.py +++ b/tests/unit/runtime/test_pld.py @@ -48,14 +48,9 @@ class TestPLDModel(DistributedTest): hidden_dim = 10 model = PLD_SimpleModel(hidden_dim, empty_grad=False) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for i, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -94,14 +89,9 @@ class TestNonPLDModel(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim, empty_grad=False) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=1, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=1, hidden_dim=hidden_dim, device=model.device) for i, batch in enumerate(data_loader): with pytest.raises(TypeError): diff --git a/tests/unit/runtime/test_runtime_utils.py b/tests/unit/runtime/test_runtime_utils.py index 18a8bb77a..f7260c104 100644 --- a/tests/unit/runtime/test_runtime_utils.py +++ b/tests/unit/runtime/test_runtime_utils.py @@ -41,9 +41,7 @@ class TestClibGradNorm(DistributedTest): norm = ds_utils.clip_grad_norm_(parameters, max_norm=0.1) norm = torch.Tensor([norm]).to(get_accelerator().device_name(dist.get_rank())) world_size = dist.get_world_size() - gathered_norm = [ - torch.zeros(1).to(get_accelerator().device_name()) for i in range(world_size) - ] + gathered_norm = [torch.zeros(1).to(get_accelerator().device_name()) for i in range(world_size)] dist.all_gather(gathered_norm, norm) diff --git a/tests/unit/runtime/utils/test_partition.py b/tests/unit/runtime/utils/test_partition.py index 58b62825d..8eae6612e 100644 --- a/tests/unit/runtime/utils/test_partition.py +++ b/tests/unit/runtime/utils/test_partition.py @@ -164,33 +164,9 @@ def test_float_midheavy(): def test_balance_bert(): # Parameters per layer for a transformer model with 24 transformers and hidden dim 1024 weights = [ - 52559872, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 0, - 52559872 + 52559872, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, + 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, + 12596224, 12596224, 12596224, 0, 52559872 ] P = 8 parts = partition_balanced(weights, P) diff --git a/tests/unit/runtime/zero/test_ignore_unused_parameters.py b/tests/unit/runtime/zero/test_ignore_unused_parameters.py index efd4949c9..4e3a015f8 100644 --- a/tests/unit/runtime/zero/test_ignore_unused_parameters.py +++ b/tests/unit/runtime/zero/test_ignore_unused_parameters.py @@ -41,14 +41,9 @@ class 
TestStage2IgnoreUnusedParameters(DistributedTest): hidden_dim = 4 model = UnusedParametersModel(hidden_dim=hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=10, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=10, hidden_dim=hidden_dim, device=model.device) def _loop(): for n, batch in enumerate(data_loader): diff --git a/tests/unit/runtime/zero/test_zero.py b/tests/unit/runtime/zero/test_zero.py index bd7d0ed1e..1863a1e78 100644 --- a/tests/unit/runtime/zero/test_zero.py +++ b/tests/unit/runtime/zero/test_zero.py @@ -23,6 +23,7 @@ from deepspeed.accelerator import get_accelerator def run_unbalanced_gradients(model, data_loader): + def drop_some_gradients(model, iter): odd_iteration = iter % 2 for i, p in enumerate(model.parameters()): @@ -73,13 +74,8 @@ class TestZeroUnbalancedGradients(DistributedTest): hidden_dim = 4 model = SimpleModel(hidden_dim=hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=16, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device) run_unbalanced_gradients(model, data_loader) @@ -112,6 +108,7 @@ class TestZero3RepeatForwardLoop(DistributedTest): hidden_dim = 4 class AlbertLikeModel(torch.nn.Module): + def __init__(self, hidden_dim): super().__init__() self.linear = torch.nn.Linear(hidden_dim, hidden_dim) @@ -125,13 +122,8 @@ class TestZero3RepeatForwardLoop(DistributedTest): return self.cross_entropy_loss(hidden, y) model = AlbertLikeModel(hidden_dim=hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=16, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device) for i, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -169,14 +161,13 @@ class TestZeroToFP32(DistributedTest): } class MyModel(torch.nn.Module): + def __init__(self, hidden_dim, n_layers): super().__init__() # to reproduce https://github.com/microsoft/DeepSpeed/pull/1372 it is important that # the number of total elements is uneven: # (1) 4 layers of 3*(3+1)=12 elements each, 48 in total - self.ll = torch.nn.ModuleList( - torch.nn.Linear(hidden_dim, - hidden_dim) for i in range(n_layers)) + self.ll = torch.nn.ModuleList(torch.nn.Linear(hidden_dim, hidden_dim) for i in range(n_layers)) # (2) the following adds 4+1=5 elements self.classifier = torch.nn.Linear(4, 1) # total 48+5=53 (uneven as desired) elements @@ -195,13 +186,8 @@ class TestZeroToFP32(DistributedTest): n_layers = world_size * 2 model = MyModel(hidden_dim=hidden_dim, n_layers=n_layers) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = 
random_dataloader(model=model, - total_samples=16, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device) for i, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -222,8 +208,7 @@ class TestZeroToFP32(DistributedTest): orig_state_dict[name] = param.detach().cpu() if zero_stage == 3: - with deepspeed.zero.GatheredParameters(model.parameters(), - modifier_rank=None): + with deepspeed.zero.GatheredParameters(model.parameters(), modifier_rank=None): fp32_model = load_state_dict_from_zero_checkpoint(model.module, tmpdir) fp32_state_dict = fp32_model.state_dict() else: @@ -235,8 +220,7 @@ class TestZeroToFP32(DistributedTest): if dist.get_rank() == 0: for name in orig_state_dict.keys(): # float() workaround for torch<1.6 - assert torch.allclose(orig_state_dict[name].float(), - fp32_state_dict[name].float()) + assert torch.allclose(orig_state_dict[name].float(), fp32_state_dict[name].float()) def test_2_param_groups(self, tmpdir, zero_stage): # TODO: @@ -264,11 +248,10 @@ class TestZeroToFP32(DistributedTest): } class MyModel(torch.nn.Module): + def __init__(self, hidden_dim, n_layers): super().__init__() - self.ll = torch.nn.ModuleList( - torch.nn.Linear(hidden_dim, - hidden_dim) for i in range(n_layers)) + self.ll = torch.nn.ModuleList(torch.nn.Linear(hidden_dim, hidden_dim) for i in range(n_layers)) self.cross_entropy_loss = torch.nn.CrossEntropyLoss() def forward(self, x, y): @@ -298,12 +281,8 @@ class TestZeroToFP32(DistributedTest): model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), optimizer=optim, - config=config_dict - ) - data_loader = random_dataloader(model=model, - total_samples=16, - hidden_dim=hidden_dim, - device=model.device) + config=config_dict) + data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device) for i, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -326,8 +305,7 @@ class TestZeroToFP32(DistributedTest): orig_state_dict[name] = param.detach().cpu() if zero_stage == 3: - with deepspeed.zero.GatheredParameters(model.parameters(), - modifier_rank=None): + with deepspeed.zero.GatheredParameters(model.parameters(), modifier_rank=None): fp32_model = load_state_dict_from_zero_checkpoint(model.module, tmpdir) fp32_state_dict = fp32_model.state_dict() else: @@ -339,8 +317,7 @@ class TestZeroToFP32(DistributedTest): if dist.get_rank() == 0: for name in orig_state_dict.keys(): # float() workaround for torch<1.6 - assert torch.allclose(orig_state_dict[name].float(), - fp32_state_dict[name].float()) + assert torch.allclose(orig_state_dict[name].float(), fp32_state_dict[name].float()) @pytest.mark.parametrize("allgather_bucket_size", [1000, 1001]) @@ -371,16 +348,13 @@ class TestIncorectAllgatherBucketSize(DistributedTest): model = SimpleModel(hidden_dim=hidden_dim) if allgather_bucket_size % 2 == 0: - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) else: with pytest.raises(AssertionError) as assertinfo: model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - assert "allgather_bucket_size must be a multiple of 
nccl_start_alignment_factor" in str( - assertinfo) + model=model, + model_parameters=model.parameters()) + assert "allgather_bucket_size must be a multiple of nccl_start_alignment_factor" in str(assertinfo) class TestPartitionNcclAlignment(DistributedTest): @@ -408,9 +382,7 @@ class TestPartitionNcclAlignment(DistributedTest): hidden_dim = 4 model = SimpleModel(hidden_dim=hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) # get nccl all-gather send buffers alignment factor nccl_start_alignment_factor = model.optimizer.nccl_start_alignment_factor @@ -419,23 +391,16 @@ class TestPartitionNcclAlignment(DistributedTest): for data_parallel_partitions in parallel_partitioned_bit16_groups: for partition_id, partitioned_data in enumerate(data_parallel_partitions): # verify that data partition start locations are 4-byte aligned - assert (partitioned_data.data_ptr() % - (2 * nccl_start_alignment_factor) == 0) + assert (partitioned_data.data_ptr() % (2 * nccl_start_alignment_factor) == 0) -def _ds_initialize_for_param_partitioning_testing(model: Module, - cfg: dict) -> DeepSpeedEngine: - ds_engine, _, _, _ = deepspeed.initialize( - config=cfg, - model=model, - model_parameters=model.parameters() - ) +def _ds_initialize_for_param_partitioning_testing(model: Module, cfg: dict) -> DeepSpeedEngine: + ds_engine, _, _, _ = deepspeed.initialize(config=cfg, model=model, model_parameters=model.parameters()) return ds_engine -def _assert_partition_status(model: Module, - valid_statuses: Set[ZeroParamStatus]) -> None: +def _assert_partition_status(model: Module, valid_statuses: Set[ZeroParamStatus]) -> None: for _, param in model.named_parameters(): assert param.ds_status in valid_statuses, param.ds_summary() @@ -446,6 +411,7 @@ def _assert_fully_available(model: Module) -> None: class EltwiseMultiplicationModule(Module): + def __init__(self, weight: Parameter) -> None: super().__init__() self.weight = weight @@ -459,6 +425,7 @@ class EltwiseMultiplicationModule(Module): class EltwiseMultiplicationTestNetwork(Module): """used for testing purposes""" + def __init__( self, weight1: Parameter, @@ -472,29 +439,18 @@ class EltwiseMultiplicationTestNetwork(Module): self.loss = L1Loss(reduction="none") - def forward(self, - x: Tensor, - y: Tensor, - use_module_trace: bool, - param_prefetching: bool) -> Dict[str, - Tensor]: - _assert_partition_status( - self, - { - ZeroParamStatus.NOT_AVAILABLE, - ZeroParamStatus.INFLIGHT, - ZeroParamStatus.AVAILABLE - } if use_module_trace else {ZeroParamStatus.NOT_AVAILABLE}) + def forward(self, x: Tensor, y: Tensor, use_module_trace: bool, param_prefetching: bool) -> Dict[str, Tensor]: + _assert_partition_status(self, + {ZeroParamStatus.NOT_AVAILABLE, ZeroParamStatus.INFLIGHT, ZeroParamStatus.AVAILABLE} + if use_module_trace else {ZeroParamStatus.NOT_AVAILABLE}) pre_layer_expected_states = { - ZeroParamStatus.INFLIGHT - if param_prefetching else ZeroParamStatus.NOT_AVAILABLE, + ZeroParamStatus.INFLIGHT if param_prefetching else ZeroParamStatus.NOT_AVAILABLE, ZeroParamStatus.AVAILABLE, } post_layer_expected_states = { - ZeroParamStatus.AVAILABLE - if param_prefetching else ZeroParamStatus.NOT_AVAILABLE, + ZeroParamStatus.AVAILABLE if param_prefetching else ZeroParamStatus.NOT_AVAILABLE, } _assert_partition_status(self.__layer1, pre_layer_expected_states) @@ -511,13 +467,9 @@ class 
EltwiseMultiplicationTestNetwork(Module): loss = self.loss(y_hat, y) - _assert_partition_status( - self, - { - ZeroParamStatus.NOT_AVAILABLE, - ZeroParamStatus.INFLIGHT, - ZeroParamStatus.AVAILABLE - } if use_module_trace else {ZeroParamStatus.NOT_AVAILABLE}) + _assert_partition_status(self, + {ZeroParamStatus.NOT_AVAILABLE, ZeroParamStatus.INFLIGHT, ZeroParamStatus.AVAILABLE} + if use_module_trace else {ZeroParamStatus.NOT_AVAILABLE}) return { "hidden1": hidden1, @@ -582,92 +534,34 @@ class TestZero3ParamPartitioningBase(DistributedTest): ds_engine = _ds_initialize_for_param_partitioning_testing(model, cfg) for i, weight in enumerate(weights): - weight.ds_tensor.data = torch.full_like(weight.ds_tensor.data, - (i + 1) * (1 + dist.get_rank())) + weight.ds_tensor.data = torch.full_like(weight.ds_tensor.data, (i + 1) * (1 + dist.get_rank())) def create_tensor(vals, dtype: torch.dtype = None) -> Tensor: return torch.as_tensor(vals, - dtype=dtype - or (torch.float16 if fp16_enabled else torch.float32), + dtype=dtype or (torch.float16 if fp16_enabled else torch.float32), device=ds_engine.device) expected_hidden1 = create_tensor([ - [1, - 1, - 1, - 1, - 1], - [1, - 1, - 1, - 2, - 2], - [2, - 2, - 2, - 2, - 2], + [1, 1, 1, 1, 1], + [1, 1, 1, 2, 2], + [2, 2, 2, 2, 2], ]) expected_hidden2 = create_tensor([ - [2, - 2, - 2, - 2, - 2], - [2, - 2, - 2, - 8, - 8], - [8, - 8, - 8, - 8, - 8], + [2, 2, 2, 2, 2], + [2, 2, 2, 8, 8], + [8, 8, 8, 8, 8], ]) - expected_yhat = create_tensor([[6, - 6, - 6, - 6, - 6], - [6, - 6, - 6, - 48, - 48], - [48, - 48, - 48, - 48, - 48]]) + expected_yhat = create_tensor([[6, 6, 6, 6, 6], [6, 6, 6, 48, 48], [48, 48, 48, 48, 48]]) expected_loss = create_tensor([ - [5, - 5, - 5, - 5, - 5], - [5, - 5, - 5, - 47, - 47], - [47, - 47, - 47, - 47, - 47], + [5, 5, 5, 5, 5], + [5, 5, 5, 47, 47], + [47, 47, 47, 47, 47], ]) for train_iter in range(3): activations = ds_engine( - x=torch.ones((m, - n), - dtype=torch.float16 if fp16_enabled else torch.float32, - device=ds_engine.device), - y=torch.ones((m, - n), - dtype=torch.float16 if fp16_enabled else torch.float32, - device=ds_engine.device), + x=torch.ones((m, n), dtype=torch.float16 if fp16_enabled else torch.float32, device=ds_engine.device), + y=torch.ones((m, n), dtype=torch.float16 if fp16_enabled else torch.float32, device=ds_engine.device), use_module_trace=train_iter > 0, param_prefetching=prefetching and train_iter > 0, ) @@ -680,7 +574,8 @@ class TestZero3ParamPartitioningBase(DistributedTest): # check the gradients grad_partitions = ds_engine.optimizer.get_fp32_grad_partitions() - assert set(grad_partitions.keys()) == {0}, f"should have one parameter group but got {len(grad_partitions)}" + assert set(grad_partitions.keys()) == {0 + }, f"should have one parameter group but got {len(grad_partitions)}" assert set(grad_partitions[0].keys()) == {0, 1, 2} dloss_wrt_layer1 = grad_partitions[0][0] dloss_wrt_layer2 = grad_partitions[0][1] @@ -699,33 +594,21 @@ class TestZero3ParamPartitioningBase(DistributedTest): grad_multiplier = 1 if zero_grad else (train_iter + 1) if dist.get_rank() == 0: - assert torch.allclose( - dloss_wrt_layer3.to(get_accelerator().device_name()), - grad_multiplier * create_tensor([2] * 8, - torch.float)) - assert torch.allclose( - dloss_wrt_layer2.to(get_accelerator().device_name()), - grad_multiplier * create_tensor([3 * 1] * 8, - torch.float)) - assert torch.allclose( - dloss_wrt_layer1.to(get_accelerator().device_name()), - grad_multiplier * create_tensor([3 * 2 * 1] * 8, - torch.float)) + assert 
torch.allclose(dloss_wrt_layer3.to(get_accelerator().device_name()), + grad_multiplier * create_tensor([2] * 8, torch.float)) + assert torch.allclose(dloss_wrt_layer2.to(get_accelerator().device_name()), + grad_multiplier * create_tensor([3 * 1] * 8, torch.float)) + assert torch.allclose(dloss_wrt_layer1.to(get_accelerator().device_name()), + grad_multiplier * create_tensor([3 * 2 * 1] * 8, torch.float)) elif dist.get_rank() == 1: # parameters dont split evenly across ranks so rank 1 has a zero-padded # partition - assert torch.allclose( - dloss_wrt_layer3.to(get_accelerator().device_name()), - grad_multiplier * create_tensor(([8] * 7) + [0], - torch.float)) - assert torch.allclose( - dloss_wrt_layer2.to(get_accelerator().device_name()), - grad_multiplier * create_tensor(([6 * 2] * 7) + [0], - torch.float)) - assert torch.allclose( - dloss_wrt_layer1.to(get_accelerator().device_name()), - grad_multiplier * create_tensor(([6 * 4 * 1] * 7) + [0], - torch.float)) + assert torch.allclose(dloss_wrt_layer3.to(get_accelerator().device_name()), + grad_multiplier * create_tensor(([8] * 7) + [0], torch.float)) + assert torch.allclose(dloss_wrt_layer2.to(get_accelerator().device_name()), + grad_multiplier * create_tensor(([6 * 2] * 7) + [0], torch.float)) + assert torch.allclose(dloss_wrt_layer1.to(get_accelerator().device_name()), + grad_multiplier * create_tensor(([6 * 4 * 1] * 7) + [0], torch.float)) else: raise RuntimeError("test has world size of two") @@ -746,7 +629,9 @@ class TestZero3ParamPartitioningLargeParam(DistributedTest): world_size = 4 def test(self, init_context_manager: bool, param_sz: int = 8100) -> None: + class LargeParamModel(Module): + def __init__(self): super().__init__() self.param = Parameter(torch.zeros((param_sz, ), dtype=torch.float32)) @@ -783,25 +668,17 @@ class TestZero3ParamPartitioningLargeParam(DistributedTest): "loss_scale": 1., } } - with deepspeed.zero.Init(mem_efficient_linear=False, - enabled=init_context_manager): + with deepspeed.zero.Init(mem_efficient_linear=False, enabled=init_context_manager): model = LargeParamModel() ds_engine = _ds_initialize_for_param_partitioning_testing(model, ds_config) for train_iter in range(3): # test multiple iterations to cover prefetching - activation: Tensor = ds_engine( - torch.ones(param_sz, - dtype=torch.float16, - device=ds_engine.device)) + activation: Tensor = ds_engine(torch.ones(param_sz, dtype=torch.float16, device=ds_engine.device)) partition_sz = math.ceil(param_sz / self.world_size) for rank_idx, start_idx in enumerate(range(0, param_sz, partition_sz)): - activation_from_partition = activation[start_idx:start_idx + - partition_sz] - assert torch.allclose( - activation_from_partition, - torch.full_like(activation_from_partition, - rank_idx)) + activation_from_partition = activation[start_idx:start_idx + partition_sz] + assert torch.allclose(activation_from_partition, torch.full_like(activation_from_partition, rank_idx)) ds_engine.backward(activation.sum()) ds_engine.allreduce_gradients() @@ -809,9 +686,7 @@ class TestZero3ParamPartitioningLargeParam(DistributedTest): avgd_gradients = ds_engine.optimizer.averaged_gradients assert set(avgd_gradients.keys()) == {0}, "should only have one parameter group" weight_gradient, = avgd_gradients[0] - expected_weight_gradient = (train_iter + 1) * torch.full_like( - weight_gradient, - 1) + expected_weight_gradient = (train_iter + 1) * torch.full_like(weight_gradient, 1) assert torch.allclose(weight_gradient, expected_weight_gradient) @@ -823,27 +698,24 @@ class 
TestZero3ParamPartitioningManyParams(DistributedTest): world_size = 4 def test(self, param_sz: int, n_layers: int, init_context_manager: bool) -> None: + class ManyParamModel(Module): + def __init__(self) -> None: super().__init__() self.modulelist = ModuleList( - EltwiseMultiplicationModule( - weight=Parameter(torch.empty((param_sz, - ), - dtype=torch.float32))) + EltwiseMultiplicationModule(weight=Parameter(torch.empty((param_sz, ), dtype=torch.float32))) for _ in range(n_layers)) for layer_num, module in enumerate(self.modulelist): - with deepspeed.zero.GatheredParameters(module.weight, - modifier_rank=0): + with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0): param: Parameter = module.weight partition_sz = math.ceil(param.numel() / dist.get_world_size()) offset = 0 for rank in range(dist.get_world_size()): with torch.no_grad(): - param[offset:offset + partition_sz].fill_(2 * layer_num * - rank) + param[offset:offset + partition_sz].fill_(2 * layer_num * rank) offset += partition_sz def forward(self, x: Tensor) -> Tensor: @@ -875,28 +747,20 @@ class TestZero3ParamPartitioningManyParams(DistributedTest): } } - with deepspeed.zero.Init(config=ds_cfg, - mem_efficient_linear=False, - enabled=init_context_manager): + with deepspeed.zero.Init(config=ds_cfg, mem_efficient_linear=False, enabled=init_context_manager): model = ManyParamModel() ds_engine = _ds_initialize_for_param_partitioning_testing(model, ds_cfg) for _ in range(3): # test multiple iterations to cover prefetching activations: List[Tensor] = ds_engine( - torch.ones((param_sz, - ), - dtype=torch.float16, - device=ds_engine.device)) + torch.ones((param_sz, ), dtype=torch.float16, device=ds_engine.device)) assert len(activations) == n_layers partition_sz = math.ceil(param_sz / self.world_size) - expected_activations = torch.empty(param_sz, - dtype=torch.float16, - device=ds_engine.device) + expected_activations = torch.empty(param_sz, dtype=torch.float16, device=ds_engine.device) for start_idx in range(0, param_sz, partition_sz): - expected_activations[start_idx:start_idx + - partition_sz] = dist.get_rank() + expected_activations[start_idx:start_idx + partition_sz] = dist.get_rank() for layer_num, activation in enumerate(activations): expected_activations *= 2 * layer_num @@ -917,7 +781,9 @@ class TestZero3InitForParentWeightInitialization(DistributedTest): world_size = 4 def test(self): + class ModelWhereParentInitializesChildWeights(Module): + def __init__(self) -> None: super().__init__() @@ -950,15 +816,11 @@ class TestZero3InitForParentWeightInitialization(DistributedTest): } } - with deepspeed.zero.Init(config=ds_cfg, - mem_efficient_linear=False, - enabled=True): + with deepspeed.zero.Init(config=ds_cfg, mem_efficient_linear=False, enabled=True): model = ModelWhereParentInitializesChildWeights() assert model.linear.weight.ds_tensor.numel() == math.ceil(12 / self.world_size) - assert torch.allclose(model.linear.weight.ds_tensor, - torch.full_like(model.linear.weight.ds_tensor, - 1)) + assert torch.allclose(model.linear.weight.ds_tensor, torch.full_like(model.linear.weight.ds_tensor, 1)) @pytest.mark.skip("not working") @@ -1015,90 +877,33 @@ class TestZero3ParamPartitioningBaseBF16(DistributedTest): ds_engine = _ds_initialize_for_param_partitioning_testing(model, cfg) for i, weight in enumerate(weights): - weight.ds_tensor.data = torch.full_like(weight.ds_tensor.data, - (i + 1) * (1 + dist.get_rank())) + weight.ds_tensor.data = torch.full_like(weight.ds_tensor.data, (i + 1) * (1 + dist.get_rank())) def 
create_tensor(vals): return torch.as_tensor(vals, dtype=torch.bfloat16, device=ds_engine.device) expected_hidden1 = create_tensor([ - [1, - 1, - 1, - 1, - 1], - [1, - 1, - 1, - 2, - 2], - [2, - 2, - 2, - 2, - 2], + [1, 1, 1, 1, 1], + [1, 1, 1, 2, 2], + [2, 2, 2, 2, 2], ]) expected_hidden2 = create_tensor([ - [2, - 2, - 2, - 2, - 2], - [2, - 2, - 2, - 8, - 8], - [8, - 8, - 8, - 8, - 8], + [2, 2, 2, 2, 2], + [2, 2, 2, 8, 8], + [8, 8, 8, 8, 8], ]) - expected_yhat = create_tensor([[6, - 6, - 6, - 6, - 6], - [6, - 6, - 6, - 48, - 48], - [48, - 48, - 48, - 48, - 48]]) + expected_yhat = create_tensor([[6, 6, 6, 6, 6], [6, 6, 6, 48, 48], [48, 48, 48, 48, 48]]) expected_loss = create_tensor([ - [5, - 5, - 5, - 5, - 5], - [5, - 5, - 5, - 47, - 47], - [47, - 47, - 47, - 47, - 47], + [5, 5, 5, 5, 5], + [5, 5, 5, 47, 47], + [47, 47, 47, 47, 47], ]) for train_iter in range(3): _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE}) activations = ds_engine( - x=torch.ones((m, - n), - dtype=torch.bfloat16, - device=ds_engine.device), - y=torch.ones((m, - n), - dtype=torch.bfloat16, - device=ds_engine.device), + x=torch.ones((m, n), dtype=torch.bfloat16, device=ds_engine.device), + y=torch.ones((m, n), dtype=torch.bfloat16, device=ds_engine.device), use_module_trace=train_iter > 0, param_prefetching=prefetching and train_iter > 0, ) @@ -1112,7 +917,8 @@ class TestZero3ParamPartitioningBaseBF16(DistributedTest): # check the gradients grad_partitions = ds_engine.optimizer.get_fp32_grad_partitions() - assert set(grad_partitions.keys()) == {0}, f"should have one parameter group but got {len(grad_partitions)}" + assert set(grad_partitions.keys()) == {0 + }, f"should have one parameter group but got {len(grad_partitions)}" assert set(grad_partitions[0].keys()) == {0, 1, 2} dloss_wrt_layer1 = grad_partitions[0][0] dloss_wrt_layer2 = grad_partitions[0][1] @@ -1129,31 +935,21 @@ class TestZero3ParamPartitioningBaseBF16(DistributedTest): grad_multiplier = 1 if zero_grad else (train_iter + 1) if dist.get_rank() == 0: - assert torch.allclose( - dloss_wrt_layer3.to(get_accelerator().device_name()), - grad_multiplier * create_tensor([2] * 8).to(expected_grad_dtype)) - assert torch.allclose( - dloss_wrt_layer2.to(get_accelerator().device_name()), - grad_multiplier * create_tensor([3 * 1] * 8).to(expected_grad_dtype)) - assert torch.allclose( - dloss_wrt_layer1.to(get_accelerator().device_name()), - grad_multiplier * - create_tensor([3 * 2 * 1] * 8).to(expected_grad_dtype)) + assert torch.allclose(dloss_wrt_layer3.to(get_accelerator().device_name()), + grad_multiplier * create_tensor([2] * 8).to(expected_grad_dtype)) + assert torch.allclose(dloss_wrt_layer2.to(get_accelerator().device_name()), + grad_multiplier * create_tensor([3 * 1] * 8).to(expected_grad_dtype)) + assert torch.allclose(dloss_wrt_layer1.to(get_accelerator().device_name()), + grad_multiplier * create_tensor([3 * 2 * 1] * 8).to(expected_grad_dtype)) elif dist.get_rank() == 1: # parameters dont split evenly across ranks so rank 1 has a zero-padded # partition - assert torch.allclose( - dloss_wrt_layer3.to(get_accelerator().device_name()), - grad_multiplier * - create_tensor(([8] * 7) + [0]).to(expected_grad_dtype)) - assert torch.allclose( - dloss_wrt_layer2.to(get_accelerator().device_name()), - grad_multiplier * - create_tensor(([6 * 2] * 7) + [0]).to(expected_grad_dtype)) - assert torch.allclose( - dloss_wrt_layer1.to(get_accelerator().device_name()), - grad_multiplier * - create_tensor(([6 * 4 * 1] * 7) + [0]).to(expected_grad_dtype)) + 
assert torch.allclose(dloss_wrt_layer3.to(get_accelerator().device_name()), + grad_multiplier * create_tensor(([8] * 7) + [0]).to(expected_grad_dtype)) + assert torch.allclose(dloss_wrt_layer2.to(get_accelerator().device_name()), + grad_multiplier * create_tensor(([6 * 2] * 7) + [0]).to(expected_grad_dtype)) + assert torch.allclose(dloss_wrt_layer1.to(get_accelerator().device_name()), + grad_multiplier * create_tensor(([6 * 4 * 1] * 7) + [0]).to(expected_grad_dtype)) else: raise RuntimeError("test has world size of two") @@ -1193,13 +989,8 @@ class TestZeroOffloadStage1(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(model=model, - model_parameters=model.parameters(), - config=config_dict) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) dist.barrier() for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -1231,6 +1022,7 @@ class TestZero3DictFwd(DistributedTest): hidden_dim = 10 class MyModel(torch.nn.Module): + def __init__(self, hidden_dim): super(MyModel, self).__init__() self.l1 = torch.nn.Linear(hidden_dim, hidden_dim) @@ -1252,13 +1044,8 @@ class TestZero3DictFwd(DistributedTest): with deepspeed.zero.Init(): model = MyModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(model=model, - model_parameters=model.parameters(), - config=config_dict) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) dist.barrier() for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -1302,10 +1089,7 @@ class TestZeroAdamOptimizerStepCount(DistributedTest): model, optimizer, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=16, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device) for i, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -1350,6 +1134,7 @@ class TestZeroFrozenWeights(DistributedTest): hidden_dim = 10 class MyModel(torch.nn.Module): + def __init__(self, hidden_dim): super(MyModel, self).__init__() self.l1 = torch.nn.Linear(hidden_dim, hidden_dim) @@ -1372,13 +1157,8 @@ class TestZeroFrozenWeights(DistributedTest): with deepspeed.zero.Init(config_dict_or_path=config_dict): model = MyModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(model=model, - model_parameters=model.parameters(), - config=config_dict) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) dist.barrier() for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -1415,13 +1195,9 @@ class TestZeroOffloadOptim(DistributedTest): if 
force_ds_optim:
             with pytest.raises(ZeRORuntimeException):
-                model, _, _, _ = deepspeed.initialize(model=model,
-                                                      optimizer=optimizer,
-                                                      config=config_dict)
+                model, _, _, _ = deepspeed.initialize(model=model, optimizer=optimizer, config=config_dict)
         else:
-            model, _, _, _ = deepspeed.initialize(model=model,
-                                                  optimizer=optimizer,
-                                                  config=config_dict)
+            model, _, _, _ = deepspeed.initialize(model=model, optimizer=optimizer, config=config_dict)


 @pytest.mark.parametrize('training', [True, False])
diff --git a/tests/unit/runtime/zero/test_zero_config.py b/tests/unit/runtime/zero/test_zero_config.py
index 84852ec2e..4520d29a2 100644
--- a/tests/unit/runtime/zero/test_zero_config.py
+++ b/tests/unit/runtime/zero/test_zero_config.py
@@ -55,20 +55,11 @@ def test_zero_offload_optimizer_config_pipeline():
     config = DeepSpeedZeroOffloadOptimizerConfig()
     assert config.pipeline == False

-    config = DeepSpeedZeroOffloadOptimizerConfig(**{
-        "pipeline_read": True,
-        "pipeline_write": False
-    })
+    config = DeepSpeedZeroOffloadOptimizerConfig(**{"pipeline_read": True, "pipeline_write": False})
     assert config.pipeline == True

-    config = DeepSpeedZeroOffloadOptimizerConfig(**{
-        "pipeline_read": False,
-        "pipeline_write": True
-    })
+    config = DeepSpeedZeroOffloadOptimizerConfig(**{"pipeline_read": False, "pipeline_write": True})
     assert config.pipeline == True

-    config = DeepSpeedZeroOffloadOptimizerConfig(**{
-        "pipeline_read": True,
-        "pipeline_write": True
-    })
+    config = DeepSpeedZeroOffloadOptimizerConfig(**{"pipeline_read": True, "pipeline_write": True})
     assert config.pipeline == True
diff --git a/tests/unit/runtime/zero/test_zero_context.py b/tests/unit/runtime/zero/test_zero_context.py
index a88db4488..92658d71a 100644
--- a/tests/unit/runtime/zero/test_zero_context.py
+++ b/tests/unit/runtime/zero/test_zero_context.py
@@ -14,6 +14,7 @@ from utils import setup_serial_env

 # Test that no sub-class or super-class is missed
 class ConvX(torch.nn.Conv1d):
+
     def __init__(self, *args):
         super().__init__(*args)
         # This would not be partitioned before bugfix 5ca8167
@@ -24,6 +25,7 @@ class ConvX(torch.nn.Conv1d):


 class ConvNet(torch.nn.Module):
+
     def __init__(self):
         super().__init__()
         self.conv1 = ConvX(1, 3, 4)
@@ -61,6 +63,7 @@ class TestZeroGatheredParametersFree(DistributedTest):
         hidden_dim = 10

         class MyModel(torch.nn.Module):
+
             def __init__(self, hidden_dim):
                 super(MyModel, self).__init__()
                 self.l1 = torch.nn.Linear(hidden_dim, hidden_dim)
@@ -126,9 +129,9 @@ class TestSerialContext(DistributedTest):
         args = SimpleNamespace(local_rank=0)
         net = SimpleModel(hidden_dim=4)
         engine, _, _, _ = deepspeed.initialize(args=args,
-                                                config=config_dict,
-                                                model=net,
-                                                model_parameters=net.parameters())
+                                               config=config_dict,
+                                               model=net,
+                                               model_parameters=net.parameters())
         assert engine.tput_timer.batch_size == train_micro_batch_size_per_gpu * gradient_accumulation_steps

         assert not engine.tput_timer.initialized
@@ -167,11 +170,9 @@ class TestSerialContext(DistributedTest):
         assert engine.tput_timer.total_elapsed_time == 0

         # calling start()/stop() to increment the step counter until start_step
-        while engine.tput_timer.micro_step_count < (gradient_accumulation_steps *
-                                                    engine.tput_timer.start_step):
+        while engine.tput_timer.micro_step_count < (gradient_accumulation_steps * engine.tput_timer.start_step):
             engine.tput_timer.start()
-            global_step = (engine.tput_timer.micro_step_count +
-                           1) % gradient_accumulation_steps == 0
+            global_step = (engine.tput_timer.micro_step_count + 1) % gradient_accumulation_steps == 0
             engine.tput_timer.stop(global_step=global_step)
         assert engine.tput_timer.global_step_count == engine.tput_timer.start_step
         assert engine.tput_timer.total_elapsed_time == 0
@@ -182,20 +183,20 @@ class TestSerialContext(DistributedTest):
             current_duration = engine.tput_timer.step_elapsed_time
             total_duration = engine.tput_timer.total_elapsed_time

-            global_step = (engine.tput_timer.micro_step_count +
-                           1) % gradient_accumulation_steps == 0
+            global_step = (engine.tput_timer.micro_step_count + 1) % gradient_accumulation_steps == 0
             engine.tput_timer.stop(global_step=global_step)
             duration = engine.tput_timer.end_time - engine.tput_timer.start_time
             # step elapsed time is reset after gradient accumulation steps
             assert engine.tput_timer.step_elapsed_time == (
-                0 if engine.tput_timer.global_step_count != engine.tput_timer.start_step
-                else current_duration + duration)
+                0 if engine.tput_timer.global_step_count != engine.tput_timer.start_step else current_duration +
+                duration)
             assert engine.tput_timer.total_elapsed_time == total_duration + duration

     def test_ext_param_getattr(self):
         setup_serial_env()

         class ExtLinear(torch.nn.Module):
+
             def __init__(self, dim=16):
                 super().__init__()
                 self.dim = dim
@@ -214,9 +215,9 @@ class TestSerialContext(DistributedTest):

         args = SimpleNamespace(local_rank=0)
         engine, optim, _, _ = deepspeed.initialize(args=args,
-                                                    model=net,
-                                                    model_parameters=net.parameters(),
-                                                    config=config)
+                                                   model=net,
+                                                   model_parameters=net.parameters(),
+                                                   config=config)

         with deepspeed.zero.GatheredParameters(net.linear1.weight):
             assert net.linear1.weight.numel() == net.dim**2
diff --git a/tests/unit/runtime/zero/test_zero_context_ancestry.py b/tests/unit/runtime/zero/test_zero_context_ancestry.py
index 38ae52490..7f3e2bc96 100644
--- a/tests/unit/runtime/zero/test_zero_context_ancestry.py
+++ b/tests/unit/runtime/zero/test_zero_context_ancestry.py
@@ -31,32 +31,30 @@ config = {
 # test that sub-classes get params that aren't prematurely partitioned and thus requiring gathering
 # fixed by https://github.com/microsoft/DeepSpeed/pull/1202
 class GrandPa(torch.nn.Module):
+
     def __init__(self, *args):
         super().__init__(*args)
         self.param_grandpa = torch.nn.Parameter(torch.ones(5))
-        self.param_grandpa.data = (self.param_grandpa.data +
-                                   1).data  # test param is not yet partitioned
+        self.param_grandpa.data = (self.param_grandpa.data + 1).data  # test param is not yet partitioned


 class Pa(GrandPa):
+
     def __init__(self, *args):
         super().__init__(*args)
         self.param_pa = torch.nn.Parameter(torch.ones(5))
-        self.param_pa.data = (self.param_pa.data +
-                              1).data  # test param is not yet partitioned
-        self.param_grandpa.data = (self.param_grandpa.data +
-                                   1).data  # test param is not yet partitioned
+        self.param_pa.data = (self.param_pa.data + 1).data  # test param is not yet partitioned
+        self.param_grandpa.data = (self.param_grandpa.data + 1).data  # test param is not yet partitioned


 class Son(Pa):
+
     def __init__(self):
         super().__init__()
         self.param = torch.nn.Parameter(torch.ones(5))
         self.param.data = (self.param.data + 1).data  # test param is not yet partitioned
-        self.param_pa.data = (self.param_pa.data +
-                              1).data  # test param is not yet partitioned
-        self.param_grandpa.data = (self.param_grandpa.data +
-                                   1).data  # test param is not yet partitioned
+        self.param_pa.data = (self.param_pa.data + 1).data  # test param is not yet partitioned
+        self.param_grandpa.data = (self.param_grandpa.data + 1).data  # test param is not yet partitioned


 class TestSerialParamInit(DistributedTest):
@@ -98,6 +96,7 @@ class TestDSInitWZinit(DistributedTest):
         }

         class Model(torch.nn.Module):
+
             def __init__(self):
                 super(Model, self).__init__()
                 self.linear = torch.nn.Linear(4, 4)
diff --git a/tests/unit/runtime/zero/test_zero_context_return.py b/tests/unit/runtime/zero/test_zero_context_return.py
index 68329cb88..c4e7e86c6 100644
--- a/tests/unit/runtime/zero/test_zero_context_return.py
+++ b/tests/unit/runtime/zero/test_zero_context_return.py
@@ -11,6 +11,7 @@ from unit.common import DistributedTest


 class DanglingBias(torch.nn.Linear):
+
     def forward(self, *inputs):
         out = super().forward(*inputs)
         # return the bias to trigger a dangling external param
@@ -19,18 +20,21 @@ class DataClass:
     """Just wraps data in an object.
     """
+
     def __init__(self, out=None, bias=None):
         self.out = out
         self.bias = bias


 class DanglingBiasClass(DanglingBias):
+
     def forward(self, *inputs):
         out, bias = super().forward(*inputs)
         return DataClass(out=out, bias=bias)


 class DanglingAttention(torch.nn.Linear):
+
     def __init__(self, dim=16, return_obj=False):
         super().__init__(dim, dim)
         self.dim = dim
@@ -56,6 +60,7 @@ class DanglingAttention(torch.nn.Linear):


 class ModelContainer(torch.nn.Module):
+
     def __init__(self, dim=16, return_obj=False):
         super().__init__()
         self.dim = dim
@@ -70,6 +75,7 @@ class ModelContainer(torch.nn.Module):


 class DanglingExt(torch.nn.Module):
+
     def __init__(self, dim=16):
         super().__init__()
         self.dim = dim
@@ -86,6 +92,7 @@ class DanglingExt(torch.nn.Module):


 class ModelContainerVariableOutputType(ModelContainer):
+
     def __init__(self, dim=16, output_type=dict):
         super().__init__()
         self.output_type = output_type
@@ -129,10 +136,7 @@ class TestReturnParam(DistributedTest):
         net = DanglingExt()
         args = SimpleNamespace(local_rank=0)
-        engine, _, _, _ = deepspeed.initialize(args=args,
-                                               model=net,
-                                               model_parameters=net.parameters(),
-                                               config=config)
+        engine, _, _, _ = deepspeed.initialize(args=args, model=net, model_parameters=net.parameters(), config=config)

         for _ in range(5):
             input = torch.rand(net.dim).to(engine.device).half()
@@ -148,10 +152,7 @@ class TestReturnParam(DistributedTest):
         net = ModelContainer(return_obj=True)
         args = SimpleNamespace(local_rank=0)
-        engine, _, _, _ = deepspeed.initialize(args=args,
-                                               model=net,
-                                               model_parameters=net.parameters(),
-                                               config=config)
+        engine, _, _, _ = deepspeed.initialize(args=args, model=net, model_parameters=net.parameters(), config=config)

         for _ in range(5):
             input = torch.rand(net.dim).to(engine.device).half()
@@ -169,10 +170,7 @@ class TestReturnParam(DistributedTest):
         net = ModelContainerVariableOutputType(output_type=output_type)
         args = SimpleNamespace(local_rank=0)
-        engine, _, _, _ = deepspeed.initialize(args=args,
-                                               model=net,
-                                               model_parameters=net.parameters(),
-                                               config=config)
+        engine, _, _, _ = deepspeed.initialize(args=args, model=net, model_parameters=net.parameters(), config=config)

         for _ in range(1):
             input = torch.rand(net.dim).to(engine.device).half()
diff --git a/tests/unit/runtime/zero/test_zero_tensor_fragment.py b/tests/unit/runtime/zero/test_zero_tensor_fragment.py
index 20caf05dd..77db33591 100644
--- a/tests/unit/runtime/zero/test_zero_tensor_fragment.py
+++ b/tests/unit/runtime/zero/test_zero_tensor_fragment.py
@@ -28,18 +28,15 @@ def validate_full_tensors(model):


 class MyModel(torch.nn.Module):
+
     def __init__(self, hidden_dim, frozen_weights):
         super(MyModel, self).__init__()
         self.act = torch.nn.ReLU()
         self.cel = torch.nn.CrossEntropyLoss()
-        self.linears = torch.nn.ModuleList([
-            torch.nn.Linear(hidden_dim,
-                            1),
-            torch.nn.Linear(1,
-                            1),
-            torch.nn.Linear(1,
-                            hidden_dim)
-        ])
+        self.linears = torch.nn.ModuleList(
+            [torch.nn.Linear(hidden_dim, 1),
+             torch.nn.Linear(1, 1),
+             torch.nn.Linear(1, hidden_dim)])
         if frozen_weights:
             self.linears[0].weight.requires_grad = False
             self.linears[0].bias.requires_grad = False
@@ -54,9 +51,7 @@ class MyModel(torch.nn.Module):


 def run_fragmented_model(model, config_dict, hidden_dim, dtype):
-    model, _, _, _ = deepspeed.initialize(model=model,
-                                          model_parameters=model.parameters(),
-                                          config=config_dict)
+    model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict)
     data_loader = random_dataloader(model=model,
                                     total_samples=10,
                                     hidden_dim=hidden_dim,
@@ -77,11 +72,7 @@ class TestTensorFragment(DistributedTest):
     world_size = 2

     @pytest.mark.parametrize('zero_stage', [1, 2, 3])
-    @pytest.mark.parametrize(
-        'offload_device',
-        [OffloadDeviceEnum.none,
-         OffloadDeviceEnum.cpu,
-         OffloadDeviceEnum.nvme])
+    @pytest.mark.parametrize('offload_device', [OffloadDeviceEnum.none, OffloadDeviceEnum.cpu, OffloadDeviceEnum.nvme])
     def test_zero_fragments(self, tmpdir, zero_stage, offload_device, frozen_weights):
         if offload_device == OffloadDeviceEnum.nvme:
             if zero_stage != 3:
@@ -108,9 +99,7 @@ class TestTensorFragment(DistributedTest):
         }

         if offload_device == OffloadDeviceEnum.cpu:
-            config_dict["zero_optimization"]["offload_optimizer"] = {
-                "device": offload_device
-            }
+            config_dict["zero_optimization"]["offload_optimizer"] = {"device": offload_device}
         elif offload_device == OffloadDeviceEnum.nvme:
             config_dict["zero_optimization"]["offload_optimizer"] = {
                 "device": offload_device,
diff --git a/tests/unit/runtime/zero/test_zero_tiled.py b/tests/unit/runtime/zero/test_zero_tiled.py
index 5858b5936..8eb44d2ba 100644
--- a/tests/unit/runtime/zero/test_zero_tiled.py
+++ b/tests/unit/runtime/zero/test_zero_tiled.py
@@ -120,6 +120,7 @@ class LinearWrapper(torch.nn.Linear):

     Megatron-LM optionally delays the bias addition to fuse with a proceeding kernel.
     """
+
     def forward(self, input):
         out = super().forward(input)
         return out, self.bias
diff --git a/tests/unit/simple_model.py b/tests/unit/simple_model.py
index 481aae0bf..58170c4b3 100644
--- a/tests/unit/simple_model.py
+++ b/tests/unit/simple_model.py
@@ -13,11 +13,10 @@ import deepspeed.comm as dist


 class SimpleModel(torch.nn.Module):
+
     def __init__(self, hidden_dim, empty_grad=False, nlayers=1):
         super(SimpleModel, self).__init__()
-        self.linears = torch.nn.ModuleList(
-            [torch.nn.Linear(hidden_dim,
-                             hidden_dim) for i in range(nlayers)])
+        self.linears = torch.nn.ModuleList([torch.nn.Linear(hidden_dim, hidden_dim) for i in range(nlayers)])
         if empty_grad:
             self.linear2 = torch.nn.Linear(hidden_dim, hidden_dim)
         self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
@@ -33,6 +32,7 @@ class SimpleModel(torch.nn.Module):


 class Curriculum_SimpleModel(SimpleModel):
+
     def __init__(self, hidden_dim, empty_grad=False):
         super(Curriculum_SimpleModel, self).__init__(hidden_dim, empty_grad)

@@ -43,6 +43,7 @@ class Curriculum_SimpleModel(SimpleModel):


 class SimpleMoEModel(torch.nn.Module):
+
     def __init__(self, hidden_dim, num_experts=4, ep_size=1, use_residual=False):
         super(SimpleMoEModel, self).__init__()
         self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
@@ -72,6 +73,7 @@ class SimpleMoEModel(torch.nn.Module):


 class SimplePRMoEModel(torch.nn.Module):
+
     def __init__(self, hidden_dim, num_experts=2, ep_size=1, use_residual=False):
         super(SimplePRMoEModel, self).__init__()
         self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
@@ -102,6 +104,7 @@ class SimplePRMoEModel(torch.nn.Module):


 class UnusedParametersModel(SimpleModel):
+
     def __init__(self, hidden_dim, empty_grad=False):
         super().__init__(hidden_dim, empty_grad)

@@ -109,21 +112,19 @@ class UnusedParametersModel(SimpleModel):


 class LinearStack(torch.nn.Module):
+
     def __init__(self, input_dim=128, hidden_dim=128, output_dim=128, num_layers=4):
         super().__init__()
         self.input_dim = input_dim
         self.output_dim = output_dim
         self.hidden_dim = hidden_dim
-        self.input_layer = torch.nn.Linear(in_features=self.input_dim,
-                                           out_features=self.hidden_dim)
+        self.input_layer = torch.nn.Linear(in_features=self.input_dim, out_features=self.hidden_dim)

         self.layers = torch.nn.ModuleList([
-            torch.nn.Linear(in_features=self.hidden_dim,
-                            out_features=self.hidden_dim,
-                            bias=False) for x in range(num_layers)
+            torch.nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim, bias=False)
+            for x in range(num_layers)
         ])

-        self.output_layer = torch.nn.Linear(in_features=self.hidden_dim,
-                                            out_features=self.output_dim)
+        self.output_layer = torch.nn.Linear(in_features=self.hidden_dim, out_features=self.output_dim)

         self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
@@ -136,12 +137,8 @@ class LinearStack(torch.nn.Module):


 class LinearStackPipe(PipelineModule):
-    def __init__(self,
-                 input_dim=128,
-                 hidden_dim=128,
-                 output_dim=128,
-                 num_layers=4,
-                 **kwargs):
+
+    def __init__(self, input_dim=128, hidden_dim=128, output_dim=128, num_layers=4, **kwargs):
         self.input_dim = input_dim
         self.output_dim = output_dim
         self.hidden_dim = hidden_dim
@@ -150,11 +147,7 @@ class LinearStackPipe(PipelineModule):
         layers = []
         layers.append(LayerSpec(torch.nn.Linear, self.input_dim, self.hidden_dim))
         for x in range(self.num_layers):
-            layers.append(
-                LayerSpec(torch.nn.Linear,
-                          self.hidden_dim,
-                          self.hidden_dim,
-                          bias=False))
+            layers.append(LayerSpec(torch.nn.Linear, self.hidden_dim, self.hidden_dim, bias=False))
         layers.append(lambda x: x)
         layers.append(LayerSpec(torch.nn.Linear, self.hidden_dim, self.output_dim))
@@ -162,6 +155,7 @@ class LinearStackPipe(PipelineModule):


 class SimpleOptimizer(torch.optim.Optimizer):
+
     def __init__(self, params, lr=0.11072018):
         defaults = dict(lr=lr)
         super(SimpleOptimizer, self).__init__(params, defaults)
@@ -185,6 +179,7 @@ class SimpleOptimizer(torch.optim.Optimizer):


 class HybridStateOptimizer(torch.optim.Optimizer):
+
     def __init__(self, params, lr=0.11072018):
         defaults = dict(lr=lr)
         super(HybridStateOptimizer, self).__init__(params, defaults)
@@ -216,6 +211,7 @@ class HybridStateOptimizer(torch.optim.Optimizer):


 class PLD_SimpleModel(SimpleModel):
+
     def __init__(self, hidden_dim, empty_grad=False):
         super(PLD_SimpleModel, self).__init__(hidden_dim, empty_grad)

@@ -228,9 +224,7 @@ class PLD_SimpleModel(SimpleModel):

 def random_dataset(total_samples, hidden_dim, device, dtype=torch.half):
     train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=dtype)
-    train_label = torch.empty(total_samples,
-                              dtype=torch.long,
-                              device=device).random_(hidden_dim)
+    train_label = torch.empty(total_samples, dtype=torch.long, device=device).random_(hidden_dim)
     train_dataset = torch.utils.data.TensorDataset(train_data, train_label)
     return train_dataset
@@ -242,21 +236,10 @@ def random_dataloader(model, total_samples, hidden_dim, device, dtype=torch.half
     train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
     return train_loader


-def sequence_dataloader(model,
-                        total_samples,
-                        hidden_dim,
-                        device,
-                        seq_len: int = 32,
-                        dtype=torch.half):
+def sequence_dataloader(model, total_samples, hidden_dim, device, seq_len: int = 32, dtype=torch.half):
     batch_size = model.train_micro_batch_size_per_gpu()
-    train_data = torch.randn(total_samples,
-                             seq_len,
-                             hidden_dim,
-                             device=device,
-                             dtype=dtype)
-    train_label = torch.empty(total_samples,
-                              dtype=torch.long,
-                              device=device).random_(hidden_dim)
+    train_data = torch.randn(total_samples, seq_len, hidden_dim, device=device, dtype=dtype)
+    train_label = torch.empty(total_samples, dtype=torch.long, device=device).random_(hidden_dim)
     train_dataset = torch.utils.data.TensorDataset(train_data, train_label)
     train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
     return train_loader
diff --git a/tests/unit/util.py b/tests/unit/util.py
index 01aa66005..c7d2c431e 100644
--- a/tests/unit/util.py
+++ b/tests/unit/util.py
@@ -49,10 +49,8 @@ def bf16_required_version_check(accelerator_check=True):
     else:
         accelerator_pass = True

-    if (TORCH_MAJOR > 1 or
-        (TORCH_MAJOR == 1 and TORCH_MINOR >= 10)) and (CUDA_MAJOR >= 11) and (
-            NCCL_MAJOR > 2 or
-            (NCCL_MAJOR == 2 and NCCL_MINOR >= 10)) and accelerator_pass:
+    if (TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10)) and (CUDA_MAJOR >= 11) and (
+            NCCL_MAJOR > 2 or (NCCL_MAJOR == 2 and NCCL_MINOR >= 10)) and accelerator_pass:
         return True
     else:
         return False
diff --git a/tests/unit/utils/test_groups.py b/tests/unit/utils/test_groups.py
index 06b391e2e..f683b4b3c 100644
--- a/tests/unit/utils/test_groups.py
+++ b/tests/unit/utils/test_groups.py
@@ -14,42 +14,22 @@ def test_get_expert_parallel_ranks():
     expert_parallel_group = [0,2,4,6], [8,10,12,14] [1,3,5,7], [9,11,13,15]
     expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14], [1,9],[3,11],[5,13],[7,15]
     """
-    expert_parallel_groups, expert_data_parallel_groups = _get_expert_parallel_ranks(
-        world_size=16, model_parallel_size_=2, expert_parallel_size_=4
-    )
+    expert_parallel_groups, expert_data_parallel_groups = _get_expert_parallel_ranks(world_size=16,
+                                                                                     model_parallel_size_=2,
+                                                                                     expert_parallel_size_=4)
     assert expert_parallel_groups == [
-        [0,
-         2,
-         4,
-         6],
-        [8,
-         10,
-         12,
-         14],
-        [1,
-         3,
-         5,
-         7],
-        [9,
-         11,
-         13,
-         15],
+        [0, 2, 4, 6],
+        [8, 10, 12, 14],
+        [1, 3, 5, 7],
+        [9, 11, 13, 15],
     ]
     assert expert_data_parallel_groups == [
-        [0,
-         8],
-        [2,
-         10],
-        [4,
-         12],
-        [6,
-         14],
-        [1,
-         9],
-        [3,
-         11],
-        [5,
-         13],
-        [7,
-         15],
+        [0, 8],
+        [2, 10],
+        [4, 12],
+        [6, 14],
+        [1, 9],
+        [3, 11],
+        [5, 13],
+        [7, 15],
     ]
diff --git a/tests/unit/utils/test_init_on_device.py b/tests/unit/utils/test_init_on_device.py
index 25d102fd0..5ab7eef03 100644
--- a/tests/unit/utils/test_init_on_device.py
+++ b/tests/unit/utils/test_init_on_device.py
@@ -14,8 +14,7 @@ class TestOnDevice(DistributedTest):
     world_size = 1

     def test_on_device(self, device):
-        if device == "meta" and pkg_version.parse(
-            torch.__version__) < pkg_version.parse("1.10"):
+        if device == "meta" and pkg_version.parse(torch.__version__) < pkg_version.parse("1.10"):
            pytest.skip("meta tensors only became stable after torch 1.10")

         with OnDevice(dtype=torch.half, device=device):