From 1c03d1b1bb9228739ac879213098b10fa29fc00e Mon Sep 17 00:00:00 2001
From: Yuanyuan Chen
Date: Sun, 17 Aug 2025 02:22:19 +0800
Subject: [PATCH] Fix invalid f-strings (#7457)

Fix invalid f-strings detected by ruff.

---------

Signed-off-by: cyy
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Olatunji Ruwase
Co-authored-by: Michael Wyatt
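For context, a minimal before/after sketch of the pattern being fixed
(illustrative only, not code taken from this diff): ruff's F541 rule flags
f-strings that contain no placeholders, and the fix is simply to drop the
`f` prefix. The diff below also rewrites `not x in y` membership tests as
`x not in y` (pycodestyle's E713) in passing.

    # Hypothetical example of the lint findings, not code from this repo.
    name = "world"
    print(f"hello")         # F541: f-string without any placeholders
    print("hello")          # fixed: plain string, identical behavior
    print(f"hello {name}")  # has a placeholder, so the f prefix stays

    keys = {"a": 1}
    if not "b" in keys:     # E713: test for membership should be 'not in'
        pass
    if "b" not in keys:     # fixed form
        pass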
files changed, 231 insertions(+), 232 deletions(-) diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py index cae1fa038..9d82eb590 100644 --- a/accelerator/hpu_accelerator.py +++ b/accelerator/hpu_accelerator.py @@ -27,7 +27,7 @@ class HPU_Accelerator(DeepSpeedAccelerator): torch.utils.deterministic.fill_uninitialized_memory = False except ImportError as e: raise ValueError( - f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.") + "HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.") self.fp16_supported = None diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py index 7deaec1e6..bb79823a8 100644 --- a/accelerator/real_accelerator.py +++ b/accelerator/real_accelerator.py @@ -64,13 +64,13 @@ def get_accelerator(): assert ipex._C._has_xpu(), "XPU_Accelerator requires an intel_extension_for_pytorch that supports XPU." except ImportError as e: raise ValueError( - f"XPU_Accelerator requires intel_extension_for_pytorch, which is not installed on this system.") + "XPU_Accelerator requires intel_extension_for_pytorch, which is not installed on this system.") elif accelerator_name == "xpu.external": try: from intel_extension_for_deepspeed import XPU_Accelerator # noqa: F401 # type: ignore except ImportError as e: raise ValueError( - f"XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system." + "XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system." ) elif accelerator_name == "cpu": pass @@ -78,13 +78,13 @@ def get_accelerator(): try: import torch_npu # noqa: F401 # type: ignore except ImportError as e: - raise ValueError(f"NPU_Accelerator requires torch_npu, which is not installed on this system.") + raise ValueError("NPU_Accelerator requires torch_npu, which is not installed on this system.") pass elif accelerator_name == "sdaa": try: import torch_sdaa # noqa: F401 # type: ignore except ImportError as e: - raise ValueError(f"SDAA_Accelerator requires torch_sdaa, which is not installed on this system.") + raise ValueError("SDAA_Accelerator requires torch_sdaa, which is not installed on this system.") pass elif accelerator_name == "mps": try: @@ -93,18 +93,18 @@ def get_accelerator(): # should use torch.mps.is_available() if it exists someday but this is used as proxy torch.mps.current_allocated_memory() except (RuntimeError, ImportError) as e: - raise ValueError(f"MPS_Accelerator requires torch.mps, which is not installed on this system.") + raise ValueError("MPS_Accelerator requires torch.mps, which is not installed on this system.") elif accelerator_name == "hpu": try: import habana_frameworks.torch.hpu # noqa: F401 except ImportError as e: raise ValueError( - f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.") + "HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.") elif accelerator_name == "mlu": try: import torch_mlu # noqa: F401 except ImportError as e: - raise ValueError(f"MLU_Accelerator requires torch_mlu, which is not installed on this system.") + raise ValueError("MLU_Accelerator requires torch_mlu, which is not installed on this system.") elif accelerator_name not in SUPPORTED_ACCELERATOR_LIST: raise ValueError(f'DS_ACCELERATOR must be one of {SUPPORTED_ACCELERATOR_LIST}. 
' f'Value "{accelerator_name}" is not supported') diff --git a/csrc/aio/py_test/ds_aio_args.py b/csrc/aio/py_test/ds_aio_args.py index 17c33cdd3..840bac8d5 100644 --- a/csrc/aio/py_test/ds_aio_args.py +++ b/csrc/aio/py_test/ds_aio_args.py @@ -70,10 +70,10 @@ def validate_args(args): error_messages = [] if args.folder is not None and len(args.folder_to_device_mapping) > 0: - error_messages.append(f'--folder and --folder_to_device_mapping cannot be specified together.') + error_messages.append('--folder and --folder_to_device_mapping cannot be specified together.') no_error = False elif args.folder is None and len(args.folder_to_device_mapping) == 0: - error_messages.append(f'At least one of --folder or --folder_to_device_mapping must be specified.') + error_messages.append('At least one of --folder or --folder_to_device_mapping must be specified.') no_error = False # Validate --folder @@ -102,7 +102,7 @@ def validate_args(args): print(f'Found {len(error_messages)} validation error(s)') # Validate --gpu, --use_gds if args.use_gds and not args.gpu: - error_messages.append(f'--gpu must be set to transfer with --use_gds') + error_messages.append('--gpu must be set to transfer with --use_gds') no_error = False if not no_error: @@ -201,7 +201,7 @@ def get_validated_args(): args = refine_args(args) if not validate_args(args): quit() - print(f'Successful validation of command line arguments') + print('Successful validation of command line arguments') args.total_loops = args.warmup_loops + args.loops peer_tag = 'gpu' if args.gpu else 'process' args.mapping_dict = _get_mapping_dict(args) diff --git a/csrc/aio/py_test/ds_aio_basic.py b/csrc/aio/py_test/ds_aio_basic.py index 1c1b36de6..6003bcbf2 100755 --- a/csrc/aio/py_test/ds_aio_basic.py +++ b/csrc/aio/py_test/ds_aio_basic.py @@ -54,7 +54,7 @@ class AIOBasic_Engine(object): task_log(tid, f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}') - task_log(tid, f'created deepspeed aio basic engine') + task_log(tid, 'created deepspeed aio basic engine') ctxt = {} ctxt[FILE] = filename diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py index 24844d78a..aeb8f0862 100755 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -80,7 +80,7 @@ class AIOHandle_Engine(object): io_parallel = args.io_parallel if args.io_parallel else 1 handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, not args.sequential_requests, io_parallel) - task_log(tid, f'created deepspeed aio handle engine') + task_log(tid, 'created deepspeed aio handle engine') bounce_buffer = None if args.gpu: diff --git a/csrc/aio/py_test/io_engine.py b/csrc/aio/py_test/io_engine.py index 0a4dc31d5..b62628fe5 100644 --- a/csrc/aio/py_test/io_engine.py +++ b/csrc/aio/py_test/io_engine.py @@ -76,7 +76,7 @@ def io_engine_tasklet(pool_params): task_barrier(aio_barrier, num_processes) # Run pre task - task_log(tid, f'running pre-task') + task_log(tid, 'running pre-task') io_engine = schedule["pre"]((args, tid)) task_barrier(aio_barrier, num_processes) @@ -91,7 +91,7 @@ def io_engine_tasklet(pool_params): io_engine.ctxt["main_task_sec"].append(stop_time - start_time) # Run post task - task_log(tid, f'running post-task') + task_log(tid, 'running post-task') schedule["post"]((args, tid, io_engine)) task_barrier(aio_barrier, num_processes) diff --git a/csrc/aio/py_test/test_ds_aio.py b/csrc/aio/py_test/test_ds_aio.py index c7e2c995c..32aa74611 100755 --- 
a/csrc/aio/py_test/test_ds_aio.py +++ b/csrc/aio/py_test/test_ds_aio.py @@ -12,7 +12,7 @@ from io_engine import io_engine_multiprocessing def main(): - print(f'Testing deepspeed_aio python frontend') + print('Testing deepspeed_aio python frontend') args = get_validated_args() mp.set_start_method('spawn', force=True) diff --git a/csrc/aio/py_test/torch_fastio_engine.py b/csrc/aio/py_test/torch_fastio_engine.py index 825938c29..e16ac4c04 100644 --- a/csrc/aio/py_test/torch_fastio_engine.py +++ b/csrc/aio/py_test/torch_fastio_engine.py @@ -15,7 +15,7 @@ from deepspeed.io import FastFileWriter class Torch_FastIO_Engine(object): def __init__(self, args, tid, read_op): - assert read_op is False, f'Read operation is not currently supported' + assert read_op is False, 'Read operation is not currently supported' self.ctxt = self._create_context(args, tid, read_op) self.zipfile_serialization = not args.torch_legacy_save @@ -69,7 +69,7 @@ class Torch_FastIO_Engine(object): fast_io_buffer = create_page_locked_tensor(args.fast_io_size, args.use_accelerator_pin_memory, aio_handle) - task_log(tid, f'created torch_fastio engine') + task_log(tid, 'created torch_fastio engine') ctxt = {} ctxt[FILE] = filename diff --git a/csrc/aio/py_test/torch_io.py b/csrc/aio/py_test/torch_io.py index 0f2857de3..1177f4672 100644 --- a/csrc/aio/py_test/torch_io.py +++ b/csrc/aio/py_test/torch_io.py @@ -54,7 +54,7 @@ class TorchIO_Engine(object): f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}', force=True) - task_log(tid, f'created torch_io engine') + task_log(tid, 'created torch_io engine') ctxt = {} ctxt[FILE] = filename diff --git a/deepspeed/autotuning/autotuner.py b/deepspeed/autotuning/autotuner.py index dd96ab0bc..00b22342b 100755 --- a/deepspeed/autotuning/autotuner.py +++ b/deepspeed/autotuning/autotuner.py @@ -145,7 +145,7 @@ class Autotuner: f"{best_exp['name']} is the optimal setup after tuning. The exp result is at {best_exp['result_dir']}." ) else: - logger.info(f"No optimal setup is found. Please check that experiments were run successfully.") + logger.info("No optimal setup is found. Please check that experiments were run successfully.") tuning_duration = datetime.timedelta(seconds=(time.time() - self.start_time)) logger.info(f"Tuning completed in {tuning_duration}") @@ -410,7 +410,7 @@ class Autotuner: self.start_time = time.time() if self.fast_enabled(): - logger.info(f"Fast mode is enabled. Tuning micro batch size only.") + logger.info("Fast mode is enabled. 
Tuning micro batch size only.") # model info profile run with DEFAULT_MIN_MEM_CONFIG model_info = self.model_info_profile_run() @@ -1110,4 +1110,4 @@ class Autotuner: logger.info(f"Done running with the optimal DeepSpeed configuration using {self.optimal_cmd}") else: - logger.info(f"No optimal DeepSpeed configuration found by autotuning.") + logger.info("No optimal DeepSpeed configuration found by autotuning.") diff --git a/deepspeed/checkpoint/deepspeed_checkpoint.py b/deepspeed/checkpoint/deepspeed_checkpoint.py index 46450cb9e..3f97ec067 100644 --- a/deepspeed/checkpoint/deepspeed_checkpoint.py +++ b/deepspeed/checkpoint/deepspeed_checkpoint.py @@ -94,14 +94,14 @@ class DeepSpeedCheckpoint(object): return self.dp_degree != self.zero_checkpoint.get_src_dp_degree() def show_2d_mapping(self): - print(f'reshaped 2d map ---- begin') + print('reshaped 2d map ---- begin') for i in range(self.pp_degree): for j in range(self.tp_degree): file_list = self.get_2d_parallel_files(pp_index=i, tp_index=j) print(f'[{i}, {j}] = {file_list}') - print(f'reshaped 2d map ---- end') + print('reshaped 2d map ---- end') def show_tp_embedding_map(self): self._dump_mapping(self.tp_to_embedding_map, 'tp_to_embedding_layers') @@ -137,7 +137,7 @@ class DeepSpeedCheckpoint(object): return self.layer_keys[self.final_layer_norm_idx] def get_iteration(self): - if not ITERATION_KEY in self.global_state: + if ITERATION_KEY not in self.global_state: sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False) self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0) @@ -157,7 +157,7 @@ class DeepSpeedCheckpoint(object): return self.tp_to_embedding_map[tp_index] def _get_checkpoint_value(self, key): - if not key in self.global_state: + if key not in self.global_state: sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False) self.global_state[key] = sd.get(key, None) @@ -254,7 +254,7 @@ class DeepSpeedCheckpoint(object): layer_file_partitions = partition_data(layer_files, self.tp_degree) for tp_index in range(self.tp_degree): map_key = (tp_index, pp_index) - if not map_key in file_map.keys(): + if map_key not in file_map.keys(): file_map[map_key] = [] file_map[map_key].append(layer_file_partitions[tp_index]) @@ -286,7 +286,7 @@ class DeepSpeedCheckpoint(object): def _merge_state_dicts(self, sd_list): merged_sd = {} for key in sd_list[0].keys(): - if not key in SEQUENTIAL_LAYERS: + if key not in SEQUENTIAL_LAYERS: cat_dim = LAYER_CONCAT_DIM.get(key, 0) merged_sd[key] = torch.cat([sd[key] for sd in sd_list], dim=cat_dim) else: diff --git a/deepspeed/checkpoint/ds_to_universal.py b/deepspeed/checkpoint/ds_to_universal.py index f7b75eee6..2c8cb280d 100755 --- a/deepspeed/checkpoint/ds_to_universal.py +++ b/deepspeed/checkpoint/ds_to_universal.py @@ -269,7 +269,7 @@ def merge_tp_slices(ds_checkpoint, dir, slice_dir, tp_degree, name_and_shape): step_merged = _merge_zero_shards(slice_base_path, "step", tp_degree, shape) if step_merged: - _save_checkpoint(os.path.join(param_base_path, f"step.pt"), step_merged[0]) + _save_checkpoint(os.path.join(param_base_path, "step.pt"), step_merged[0]) for state in ("fp32", "exp_avg", "exp_avg_sq"): slices = _merge_zero_shards(slice_base_path, state, tp_degree, shape) @@ -415,7 +415,7 @@ def _save_optimizer_state(args, ds_checkpoint): output_sd = {k: v for k, v in optim_sd.items() if k not in sharded_states} output_sd[PARAM_GROUPS] = optim_sd[BASE_OPTIMIZER_STATE][PARAM_GROUPS] zero_output_folder = 
os.path.join(args.output_folder, "zero") - output_file_path = os.path.join(zero_output_folder, f"optimizer_state.pt") + output_file_path = os.path.join(zero_output_folder, "optimizer_state.pt") _save_checkpoint(output_file_path, output_sd) @@ -424,7 +424,7 @@ def _save_optimizer_state_stage3(args, optim_files): output_sd = sd[OPTIMIZER_STATE_DICT] output_sd[PARAM_GROUPS] = output_sd[OPTIMIZER_STATE_DICT][PARAM_GROUPS] zero_output_folder = os.path.join(args.output_folder, "zero") - output_file_path = os.path.join(zero_output_folder, f"optimizer_state.pt") + output_file_path = os.path.join(zero_output_folder, "optimizer_state.pt") _save_checkpoint(output_file_path, output_sd) @@ -467,7 +467,7 @@ def _check_for_required_state(ds_checkpoint): def main(args): - print(f'Convert DeepSpeed Checkpoint to Universal Checkpoint') + print('Convert DeepSpeed Checkpoint to Universal Checkpoint') print(f'Converting DeepSpeed checkpoint in {args.input_folder} to Universal checkpoint in {args.output_folder}') diff --git a/deepspeed/checkpoint/reshape_meg_2d.py b/deepspeed/checkpoint/reshape_meg_2d.py index 3bff87f43..cef8fa5cc 100644 --- a/deepspeed/checkpoint/reshape_meg_2d.py +++ b/deepspeed/checkpoint/reshape_meg_2d.py @@ -24,7 +24,7 @@ class meg_2d_parallel_map(object): assert type(data) is list key = self._make_key(pp_index, tp_index) - if not key in self.map.keys(): + if key not in self.map.keys(): self.map[key] = [] self.map[key] += data @@ -84,14 +84,14 @@ def reshape_meg_2d_parallel(old_pp_degree, old_tp_degree, new_pp_degree, new_tp_ old_2d_map = meg_2d_parallel_map(old_pp_degree, old_tp_degree) old_2d_map.simple_init() if verbose: - old_2d_map.print_data(f'original_2d_map:') + old_2d_map.print_data('original_2d_map:') if old_tp_degree != new_tp_degree: new_tp_map = _reshape_tp_dimension(old_2d_map, new_tp_degree) else: new_tp_map = old_2d_map if verbose: - new_tp_map.print_data(f'after_tp_reshape:') + new_tp_map.print_data('after_tp_reshape:') if old_pp_degree != new_pp_degree: final_map = _reshape_pp_dimension(new_tp_map, new_pp_degree) @@ -99,7 +99,7 @@ def reshape_meg_2d_parallel(old_pp_degree, old_tp_degree, new_pp_degree, new_tp_ final_map = new_tp_map if verbose: - final_map.print_data(f'final_2d_map:') + final_map.print_data('final_2d_map:') return final_map @@ -159,7 +159,7 @@ def get_mpu_ranks(tp_size=1, pp_size=1, dp_size=1, virtual_pp_size=None): ranks = [data_parallel_group_ranks[i] for data_parallel_group_ranks in all_dp_group_ranks] all_pp_group_ranks.append(list(ranks)) - print(f"PP", all_pp_group_ranks) + print("PP", all_pp_group_ranks) # Build the tensor model-parallel groups. 
all_tp_group_ranks = [] @@ -167,7 +167,7 @@ def get_mpu_ranks(tp_size=1, pp_size=1, dp_size=1, virtual_pp_size=None): ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size) all_tp_group_ranks.append(list(ranks)) - print(f"TP", all_tp_group_ranks) + print("TP", all_tp_group_ranks) return all_tp_group_ranks, all_pp_group_ranks, all_dp_group_ranks diff --git a/deepspeed/compile/fx.py b/deepspeed/compile/fx.py index 3506aef70..7770b80f3 100644 --- a/deepspeed/compile/fx.py +++ b/deepspeed/compile/fx.py @@ -115,7 +115,7 @@ def add_free_activations(graph_id: int, graph: Graph, activation_node_names: Lis def _should_free(node: Node) -> bool: if not hasattr(node, "meta"): return False - if not "tensor_meta" in node.meta: + if "tensor_meta" not in node.meta: return False return True diff --git a/deepspeed/compile/passes/offload_activation.py b/deepspeed/compile/passes/offload_activation.py index dc62184d0..1b4bf615e 100644 --- a/deepspeed/compile/passes/offload_activation.py +++ b/deepspeed/compile/passes/offload_activation.py @@ -34,7 +34,7 @@ def get_random_id() -> int: def _should_offload(node: Node) -> bool: if not hasattr(node, "meta"): return False - if not "tensor_meta" in node.meta: + if "tensor_meta" not in node.meta: return False return True diff --git a/deepspeed/compile/passes/offload_parameters.py b/deepspeed/compile/passes/offload_parameters.py index 29468f497..3abb3e8d8 100644 --- a/deepspeed/compile/passes/offload_parameters.py +++ b/deepspeed/compile/passes/offload_parameters.py @@ -34,7 +34,7 @@ def add_reload_parameter(graph_id: int, gm: GraphModule, node: Node, ds_id: int) new_node = gm.graph.create_node('call_function', torch.ops.dc.reload_parameter.default, args, {}, - name=f"reload_parameter") + name="reload_parameter") return new_node diff --git a/deepspeed/compression/config.py b/deepspeed/compression/config.py index e1fa5ef4b..0fab1032f 100644 --- a/deepspeed/compression/config.py +++ b/deepspeed/compression/config.py @@ -249,7 +249,7 @@ def get_sparse_pruning_shared_parameters(param_dict): output[SPARSE_PRUNING_DENSE_RATIO] = get_scalar_param(sub_param_dict, SPARSE_PRUNING_DENSE_RATIO, SPARSE_PRUNING_DENSE_RATIO_DEFAULT) assert output[SPARSE_PRUNING_DENSE_RATIO] > 0 and output[ - SPARSE_PRUNING_DENSE_RATIO] < 1, f"Invalid dense_ratio value. Must be less than 1" + SPARSE_PRUNING_DENSE_RATIO] < 1, "Invalid dense_ratio value. 
Must be less than 1" output[SPARSE_PRUNING_SCHEDULE_OFFSET_STRIDE] = get_scalar_param( sub_param_dict, SPARSE_PRUNING_SCHEDULE_OFFSET_STRIDE, SPARSE_PRUNING_SCHEDULE_OFFSET_STRIDE_DEFAULT) output[SPARSE_PRUNING_EXCLUDED_MODULES] = get_list_param(sub_param_dict, SPARSE_PRUNING_EXCLUDED_MODULES, @@ -258,7 +258,7 @@ def get_sparse_pruning_shared_parameters(param_dict): SPARSE_PRUNING_SCHEDULE_OFFSET_END, output[SPARSE_PRUNING_SCHEDULE_OFFSET]) assert output[SPARSE_PRUNING_SCHEDULE_OFFSET] <= output[ - SPARSE_PRUNING_SCHEDULE_OFFSET_END], f"Invalid schedule_offset and schedule_offset_end values" + SPARSE_PRUNING_SCHEDULE_OFFSET_END], "Invalid schedule_offset and schedule_offset_end values" else: output[SPARSE_PRUNING_ENABLED] = SPARSE_PRUNING_ENABLED_DEFAULT output[SPARSE_PRUNING_METHOD] = SPARSE_PRUNING_METHOD_DEFAULT diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py index 37e33b1e8..20932d318 100644 --- a/deepspeed/env_report.py +++ b/deepspeed/env_report.py @@ -91,7 +91,7 @@ def installed_cann_version(): import re ascend_path = installed_cann_path() if ascend_path is None: - return f"CANN_HOME does not exist, unable to compile NPU op(s)" + return "CANN_HOME does not exist, unable to compile NPU op(s)" cann_version = "" for dirpath, _, filenames in os.walk(os.path.realpath(ascend_path)): if cann_version: diff --git a/deepspeed/launcher/launch.py b/deepspeed/launcher/launch.py index 2abfcae80..d23eaa04b 100755 --- a/deepspeed/launcher/launch.py +++ b/deepspeed/launcher/launch.py @@ -188,7 +188,7 @@ def main(): if not is_torch_elastic_compatible(): if args.enable_elastic_training: - logger.info(f"Disabling elastic training support as \ + logger.info("Disabling elastic training support as \ PyTorch version should be greater than 1.11.x") args.enable_elastic_training = False diff --git a/deepspeed/launcher/launcher_helper.py b/deepspeed/launcher/launcher_helper.py index 05ce14bcc..b7d65f896 100644 --- a/deepspeed/launcher/launcher_helper.py +++ b/deepspeed/launcher/launcher_helper.py @@ -59,9 +59,9 @@ def env_mapping(env, rank_name_list=None, local_rank_name_list=None): if rank == None: rank = env.get(rank_name) elif rank != env.get(rank_name): - raise EnvironmentError(f"rank number doesn't match!") + raise EnvironmentError("rank number doesn't match!") if rank == None: - raise EnvironmentError(f"rank number is not in current env!") + raise EnvironmentError("rank number is not in current env!") env['RANK'] = rank local_rank = None @@ -70,9 +70,9 @@ def env_mapping(env, rank_name_list=None, local_rank_name_list=None): if local_rank == None: local_rank = env.get(local_rank_name) elif local_rank != env.get(local_rank_name): - raise EnvironmentError(f"local_rank number doesn't match!") + raise EnvironmentError("local_rank number doesn't match!") if local_rank == None: - raise EnvironmentError(f"rank number is not in current env!") + raise EnvironmentError("rank number is not in current env!") env['LOCAL_RANK'] = local_rank return env diff --git a/deepspeed/linear/quantization.py b/deepspeed/linear/quantization.py index 2023601be..beabd4f93 100644 --- a/deepspeed/linear/quantization.py +++ b/deepspeed/linear/quantization.py @@ -42,7 +42,7 @@ class QuantizedParameter(nn.Parameter): quantizer: Quantizer = None, ): if requires_grad: - raise ValueError(f"requires_grad=True is not supported with QuantizedParameter") + raise ValueError("requires_grad=True is not supported with QuantizedParameter") if data is None: data = torch.empty(0) self = torch.Tensor._make_subclass(cls, data, 
requires_grad) diff --git a/deepspeed/model_implementations/transformers/ds_transformer.py b/deepspeed/model_implementations/transformers/ds_transformer.py index b9d10761f..db6359701 100644 --- a/deepspeed/model_implementations/transformers/ds_transformer.py +++ b/deepspeed/model_implementations/transformers/ds_transformer.py @@ -56,7 +56,7 @@ class DeepSpeedTransformerInference(nn.Module): if DeepSpeedTransformerInference.layer_id == 1: log_dist(f"DeepSpeed-Inference config: {self.config.__dict__}", [0]) if deepspeed.HAS_TRITON and self.config.use_triton: - log_dist(f"Injecting Triton kernels ...", [0]) + log_dist("Injecting Triton kernels ...", [0]) if self.config.bigscience_bloom: self.attention = BloomSelfAttention(self.config, mp_group, quantize_scales, quantize_groups, merge_count) diff --git a/deepspeed/module_inject/fusedqkv_utils.py b/deepspeed/module_inject/fusedqkv_utils.py index 0609c6001..757dfc9ab 100644 --- a/deepspeed/module_inject/fusedqkv_utils.py +++ b/deepspeed/module_inject/fusedqkv_utils.py @@ -150,8 +150,8 @@ def prepare_tp_fused_qkvw(module, src, mp_size, gpu_index): module_name = max(module_name_matches, key=len) fused_type = fused_type_dict[module_name] return _transpose_fused_qkvw(src, mp_size, fused_type, module) - warning_once(f"Unrecognized fusedkqv weight type, default to using bloom type," - f"please check in prepare_tp_fused_qkvw() to avoid potential calculation errors") + warning_once("Unrecognized fusedkqv weight type, default to using bloom type," + "please check in prepare_tp_fused_qkvw() to avoid potential calculation errors") return _bloom_type_transpose(src, mp_size) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 96a34af85..26752cfa4 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -497,7 +497,7 @@ def replace_transformer_layer(orig_layer_impl, model, checkpoint_dict, config, m if dist.is_initialized(): dist.barrier() transformer_name = get_transformer_name(replaced_module) - non_tp_ckpt_name = f'non-tp.pt' + non_tp_ckpt_name = 'non-tp.pt' ckpt_files = [non_tp_ckpt_name] os.makedirs(config.save_mp_checkpoint_path, exist_ok=True) diff --git a/deepspeed/moe/sharded_moe.py b/deepspeed/moe/sharded_moe.py index 091baa05c..14db1c9d9 100644 --- a/deepspeed/moe/sharded_moe.py +++ b/deepspeed/moe/sharded_moe.py @@ -496,7 +496,7 @@ class TopKGate(Module): self.top2_2nd_expert_sampling = top2_2nd_expert_sampling def _set_ep_group(self, ep_group): - assert self.ep_group is None, f'Attempting to override an existing ep_group' + assert self.ep_group is None, 'Attempting to override an existing ep_group' self.ep_group = ep_group def forward(self, diff --git a/deepspeed/nvme/ds_aio_args.py b/deepspeed/nvme/ds_aio_args.py index 95165d98e..210f21b7c 100644 --- a/deepspeed/nvme/ds_aio_args.py +++ b/deepspeed/nvme/ds_aio_args.py @@ -70,10 +70,10 @@ def validate_args(args): error_messages = [] if args.folder is not None and len(args.folder_to_device_mapping) > 0: - error_messages.append(f'--folder and --folder_to_device_mapping cannot be specified together.') + error_messages.append('--folder and --folder_to_device_mapping cannot be specified together.') no_error = False elif args.folder is None and len(args.folder_to_device_mapping) == 0: - error_messages.append(f'At least one of --folder or --folder_to_device_mapping must be specified.') + error_messages.append('At least one of --folder or --folder_to_device_mapping must be specified.') no_error = False # 
Validate --folder @@ -102,7 +102,7 @@ def validate_args(args): print(f'Found {len(error_messages)} validation error(s)') # Validate --gpu, --use_gds if args.use_gds and not args.gpu: - error_messages.append(f'--gpu must be set to transfer with --use_gds') + error_messages.append('--gpu must be set to transfer with --use_gds') no_error = False if not no_error: @@ -201,7 +201,7 @@ def get_validated_args(): args = refine_args(args) if not validate_args(args): quit() - print(f'Successful validation of command line arguments') + print('Successful validation of command line arguments') args.total_loops = args.warmup_loops + args.loops peer_tag = 'gpu' if args.gpu else 'process' args.mapping_dict = _get_mapping_dict(args) diff --git a/deepspeed/nvme/ds_aio_basic.py b/deepspeed/nvme/ds_aio_basic.py index 7be7bcd1c..a640d5626 100755 --- a/deepspeed/nvme/ds_aio_basic.py +++ b/deepspeed/nvme/ds_aio_basic.py @@ -54,7 +54,7 @@ class AIOBasic_Engine(object): task_log(tid, f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}') - task_log(tid, f'created deepspeed aio basic engine') + task_log(tid, 'created deepspeed aio basic engine') ctxt = {} ctxt[FILE] = filename diff --git a/deepspeed/nvme/ds_aio_handle.py b/deepspeed/nvme/ds_aio_handle.py index efc157078..19edd04a7 100755 --- a/deepspeed/nvme/ds_aio_handle.py +++ b/deepspeed/nvme/ds_aio_handle.py @@ -95,7 +95,7 @@ class AIOHandle_Engine(object): else: handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, not args.sequential_requests, io_parallel) - task_log(tid, f'Created DeepNVMe handle engine') + task_log(tid, 'Created DeepNVMe handle engine') bounce_buffer = None if args.gpu: diff --git a/deepspeed/nvme/io_engine.py b/deepspeed/nvme/io_engine.py index aea66ec55..33a7c035c 100644 --- a/deepspeed/nvme/io_engine.py +++ b/deepspeed/nvme/io_engine.py @@ -76,7 +76,7 @@ def io_engine_tasklet(pool_params): task_barrier(aio_barrier, num_processes) # Run pre task - task_log(tid, f'running pre-task') + task_log(tid, 'running pre-task') io_engine = schedule["pre"]((args, tid)) task_barrier(aio_barrier, num_processes) @@ -91,7 +91,7 @@ def io_engine_tasklet(pool_params): io_engine.ctxt["main_task_sec"].append(stop_time - start_time) # Run post task - task_log(tid, f'running post-task') + task_log(tid, 'running post-task') schedule["post"]((args, tid, io_engine)) task_barrier(aio_barrier, num_processes) diff --git a/deepspeed/nvme/parse_nvme_stats.py b/deepspeed/nvme/parse_nvme_stats.py index 09c79ada5..f051ee164 100755 --- a/deepspeed/nvme/parse_nvme_stats.py +++ b/deepspeed/nvme/parse_nvme_stats.py @@ -101,7 +101,7 @@ def get_metric(file, metric): def validate_args(args): - if not args.metric in PERF_METRICS: + if args.metric not in PERF_METRICS: print(f'{args.metric} is not a valid performance metrics') return False diff --git a/deepspeed/nvme/test_ds_aio.py b/deepspeed/nvme/test_ds_aio.py index a71ad3ee9..3347da182 100755 --- a/deepspeed/nvme/test_ds_aio.py +++ b/deepspeed/nvme/test_ds_aio.py @@ -12,7 +12,7 @@ from .io_engine import io_engine_multiprocessing def ds_io_main(): - print(f'Testing DeepNVMe python frontend') + print('Testing DeepNVMe python frontend') args = get_validated_args() mp.set_start_method('spawn', force=True) diff --git a/deepspeed/nvme/torch_fastio_engine.py b/deepspeed/nvme/torch_fastio_engine.py index fd4918d84..8929e1758 100644 --- a/deepspeed/nvme/torch_fastio_engine.py +++ b/deepspeed/nvme/torch_fastio_engine.py @@ -15,7 +15,7 @@ from 
deepspeed.io import FastFileWriter class Torch_FastIO_Engine(object): def __init__(self, args, tid, read_op): - assert read_op is False, f'Read operation is not currently supported' + assert read_op is False, 'Read operation is not currently supported' self.ctxt = self._create_context(args, tid, read_op) self.zipfile_serialization = not args.torch_legacy_save @@ -69,7 +69,7 @@ class Torch_FastIO_Engine(object): fast_io_buffer = create_page_locked_tensor(args.fast_io_size, args.use_accelerator_pin_memory, aio_handle) - task_log(tid, f'created torch_fastio engine') + task_log(tid, 'created torch_fastio engine') ctxt = {} ctxt[FILE] = filename diff --git a/deepspeed/nvme/torch_io.py b/deepspeed/nvme/torch_io.py index 3371771c0..04d653544 100644 --- a/deepspeed/nvme/torch_io.py +++ b/deepspeed/nvme/torch_io.py @@ -54,7 +54,7 @@ class TorchIO_Engine(object): f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}', force=True) - task_log(tid, f'created torch_io engine') + task_log(tid, 'created torch_io engine') ctxt = {} ctxt[FILE] = filename diff --git a/deepspeed/ops/fp_quantizer/quantize.py b/deepspeed/ops/fp_quantizer/quantize.py index 47b3b08c7..71fe96267 100644 --- a/deepspeed/ops/fp_quantizer/quantize.py +++ b/deepspeed/ops/fp_quantizer/quantize.py @@ -126,7 +126,7 @@ class FP_Quantize(Quantizer): if scale is not None: assert input_q.numel() == fp_out.numel(), \ - f'[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!' + '[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!' input_q = torch.cat([input_q.reshape(-1, self.group_size), scale], dim=-1).contiguous() fp_quant_module.dequantize(fp_out, input_q, self.group_size, q_mantisa_bits, q_bits - q_mantisa_bits - 1) return fp_out @@ -159,7 +159,7 @@ class FP_Quantize(Quantizer): if scale is not None: assert input_q.numel() == fp_out.numel(), \ - f'[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!' + '[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!' input_q = torch.cat([input_q.reshape(-1, self.group_size), scale], dim=-1).contiguous() fp_quant_module.selective_dequantize(fp_out, input_q, indexes, self.group_size, q_mantisa_bits, diff --git a/deepspeed/ops/sparse_attention/sparsity_config.py b/deepspeed/ops/sparse_attention/sparsity_config.py index 1f59c4b46..b5d9be073 100644 --- a/deepspeed/ops/sparse_attention/sparsity_config.py +++ b/deepspeed/ops/sparse_attention/sparsity_config.py @@ -142,7 +142,7 @@ class FixedSparsityConfig(SparsityConfig): if (num_different_global_patterns > 1 and not different_layout_per_head): raise ValueError( - f'Number of different layouts cannot be more than one when you have set a single layout for all heads! Set different_layout_per_head to True.' + 'Number of different layouts cannot be more than one when you have set a single layout for all heads! Set different_layout_per_head to True.' 
) if (num_different_global_patterns > (num_local_blocks // num_global_blocks)): raise ValueError( diff --git a/deepspeed/ops/transformer/inference/triton/attention.py b/deepspeed/ops/transformer/inference/triton/attention.py index 6845d91b0..023ef767a 100644 --- a/deepspeed/ops/transformer/inference/triton/attention.py +++ b/deepspeed/ops/transformer/inference/triton/attention.py @@ -103,7 +103,7 @@ class TritonSelfAttention(nn.Module): # triton autotune table update for score/context matmul if triton_autotune: - print(f"running triton autotune for regular attention kernel") + print("running triton autotune for regular attention kernel") __class__._triton_autotune(2, self.config.max_out_tokens, self.head_size, self.config.hidden_size, self.triangular_masking, self.scale) diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py index 08c4b8193..fae0148ba 100644 --- a/deepspeed/runtime/activation_checkpointing/checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -1122,7 +1122,7 @@ def configure( #print configuration only once see_memory_usage("After configuration", force=False) if dist.get_rank() == 0: - logger.info(f"Activation Checkpointing Information") + logger.info("Activation Checkpointing Information") logger.info(f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {CPU_CHECKPOINT}") logger.info(f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers") logger.info(f"----Synchronization {SYNCHRONIZE}") diff --git a/deepspeed/runtime/bf16_optimizer.py b/deepspeed/runtime/bf16_optimizer.py index cd00e9e43..06a8d01e5 100644 --- a/deepspeed/runtime/bf16_optimizer.py +++ b/deepspeed/runtime/bf16_optimizer.py @@ -54,7 +54,7 @@ class BF16_Optimizer(ZeROOptimizer): self.param_names = param_names self.using_real_optimizer = not isinstance(self.optimizer, DummyOptim) - assert bfloat16_config.enabled, f"BF16Optimizer: requires bfloat16 to be enabled" + assert bfloat16_config.enabled, "BF16Optimizer: requires bfloat16 to be enabled" assert grad_acc_dtype in [torch.float32, torch.bfloat16 ], f"BF16Optimizer: Unsupported gradient accumulation data type: {grad_acc_dtype}" self.grad_acc_dtype = grad_acc_dtype @@ -504,13 +504,13 @@ class BF16_Optimizer(ZeROOptimizer): current_rank_sd = state_dict_list[dp_rank] ckpt_version = current_rank_sd.get(DS_VERSION, False) - assert ckpt_version, f"Empty ds_version in checkpoint, not clear how to proceed" + assert ckpt_version, "Empty ds_version in checkpoint, not clear how to proceed" ckpt_version = pkg_version.parse(ckpt_version) self.clip_grad = current_rank_sd.get(CLIP_GRAD, self.clip_grad) if load_optimizer_states: - print(f"_load_legacy_checkpoint current_rank_sd[BASE_OPTIMIZER_STATE]") + print("_load_legacy_checkpoint current_rank_sd[BASE_OPTIMIZER_STATE]") self.optimizer.load_state_dict(current_rank_sd[BASE_OPTIMIZER_STATE]) if load_from_fp32_weights: diff --git a/deepspeed/runtime/checkpoint_engine/decoupled_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/decoupled_checkpoint_engine.py index d6c0ea888..1459e5bfc 100644 --- a/deepspeed/runtime/checkpoint_engine/decoupled_checkpoint_engine.py +++ b/deepspeed/runtime/checkpoint_engine/decoupled_checkpoint_engine.py @@ -40,7 +40,7 @@ class CheckpointSize(object): def init_decoupled_checkpoint(config_params, dp_writer_config, save_event, save_queue, optimize_dp_state): checkpoint_engine = FastCheckpointEngine(config_params, 
dp_writer_config, optimize_dp_state) - print(f'Created FastCheckpointEngine for Decoupled Checkpointing') + print('Created FastCheckpointEngine for Decoupled Checkpointing') save_path_list = [] while True: (save_info, event_type) = save_queue.get() diff --git a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py index 485e21411..9e18c8c24 100644 --- a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py +++ b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py @@ -43,7 +43,7 @@ class NebulaCheckpointEngine(CheckpointEngine): self.checkpoint = torch_nebula.Checkpoint(info.tag, -2) def save(self, state_dict, path: str): - log_dist(f"[Nebula] Create dummy files for loading.") + log_dist("[Nebula] Create dummy files for loading.") torch.save("", path) tag = _get_tag_from_path(path) @@ -84,7 +84,7 @@ class NebulaCheckpointEngine(CheckpointEngine): checkpoint = torch_nebula.get_latest_checkpoint(persist_path=self.nebula_load_path) if checkpoint is None or (checkpoint is not None and checkpoint.tag == ''): logger.info( - f"Unable to find latest checkpoint from Nebula tier3, try to get latest checkpoint again from nebula tier1 path!" + "Unable to find latest checkpoint from Nebula tier3, try to get latest checkpoint again from nebula tier1 path!" ) # nebula tier1 latest checkpoint = torch_nebula.get_latest_checkpoint() @@ -103,6 +103,6 @@ class NebulaCheckpointEngine(CheckpointEngine): logger.info(f"[Nebula] all files for {tag} are saved in tier1. It is ready to start persisting") commit_rls = self.checkpoint.commit() if not commit_rls: - logger.error(f"[Nebula] failed to commit the checkpoint, please check the log.") + logger.error("[Nebula] failed to commit the checkpoint, please check the log.") return False return commit_rls diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py index 2ae6921c3..2fbbf4c0f 100755 --- a/deepspeed/runtime/config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -169,7 +169,7 @@ class ScientificNotationEncoder(json.JSONEncoder): x = [f'\n{prefix}"{k}": {self.iterencode(v, level=level)}' for k, v in o.items()] return "{" + ", ".join(x) + f"\n{prefix_close}" + "}" elif isinstance(o, collections.abc.Sequence) and not isinstance(o, str): - return f"[{ f', '.join(map(self.iterencode, o)) }]" + return f"[{ ', '.join(map(self.iterencode, o)) }]" return "\n, ".join(super().iterencode(o, _one_shot)) diff --git a/deepspeed/runtime/data_pipeline/curriculum_scheduler.py b/deepspeed/runtime/data_pipeline/curriculum_scheduler.py index 23d747957..296cc7fcd 100644 --- a/deepspeed/runtime/data_pipeline/curriculum_scheduler.py +++ b/deepspeed/runtime/data_pipeline/curriculum_scheduler.py @@ -73,7 +73,7 @@ class CurriculumScheduler(object): f"Curriculum learning with fixed_root schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_ROOT_DEGREE}'" if config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP] % 8 != 0: logger.warning( - f'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.' + 'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. 
Disregard this warning if this is unrelated to your metric/hardware.' ) self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] elif config[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_LINEAR: @@ -91,7 +91,7 @@ class CurriculumScheduler(object): f"Curriculum learning with fixed_linear schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP}'" if config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP] % 8 != 0: logger.warning( - f'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.' + 'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.' ) self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] elif config[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_CUSTOM: diff --git a/deepspeed/runtime/eigenvalue.py b/deepspeed/runtime/eigenvalue.py index a82d8b1d5..a4609c5eb 100755 --- a/deepspeed/runtime/eigenvalue.py +++ b/deepspeed/runtime/eigenvalue.py @@ -107,7 +107,7 @@ class Eigenvalue(object): # Disable eigenvalue if the model doesn't support second order gradients computation, # e.g. when enabling DS transformer kernel. if len(grads) == 0 or len(params) == 0: - log_dist(f'The model does NOT support eigenvalue computation.', ranks=[0], level=logging.WARNING) + log_dist('The model does NOT support eigenvalue computation.', ranks=[0], level=logging.WARNING) return [] i = 0 diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index ea8dfed93..af55e48dc 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -266,7 +266,7 @@ class DeepSpeedEngine(Module): self._do_sanity_check() if self.autotp_size() > 1: self._configure_tensor_parallel(model, self.tensor_parallel_config()) - see_memory_usage(f"DeepSpeed Engine: After args sanity test", force=self.memory_breakdown()) + see_memory_usage("DeepSpeed Engine: After args sanity test", force=self.memory_breakdown()) if mpu is not None: if self.elasticity_enabled(): if not self.is_elastic_model_parallel_supported(): @@ -280,7 +280,7 @@ class DeepSpeedEngine(Module): self.monitor = MonitorMaster(self._config.monitor_config) see_memory_usage( - f"DeepSpeed Engine: Before configure distributed model", + "DeepSpeed Engine: Before configure distributed model", force=self.memory_breakdown(), ) @@ -298,7 +298,7 @@ class DeepSpeedEngine(Module): self._get_model_parameters() - see_memory_usage(f"DeepSpeed Engine: After configure distributed model") + see_memory_usage("DeepSpeed Engine: After configure distributed model") # Configure wall clock timers self.timers = SynchronizedWallClockTimer() @@ -507,7 +507,7 @@ class DeepSpeedEngine(Module): broadcast_and_check(args, bcast_rank, bcast_group) broadcast_and_check(kwargs, bcast_rank, bcast_group) - logger.info(f":The Dataloader has passed the TP group consistency check.") + logger.info(":The Dataloader has passed the TP group consistency check.") self.first_dataloader_check.remove() self.first_dataloader_check = self.module.register_forward_pre_hook(check_dataloader_inputs_same_across_ranks, @@ -577,7 +577,7 @@ class 
DeepSpeedEngine(Module): """ if train_batch_size % (self.train_micro_batch_size_per_gpu() * self.dp_world_size) != 0: #print(f'{train_batch_size=} {self.train_micro_batch_size_per_gpu()=} {self.dp_world_size=}') - raise ValueError(f'Train batch size must be divisible by micro-batch data parallelism') + raise ValueError('Train batch size must be divisible by micro-batch data parallelism') new_gas = train_batch_size // (self.train_micro_batch_size_per_gpu() * self.dp_world_size) # overwrite config self._config.train_batch_size = train_batch_size @@ -736,7 +736,7 @@ class DeepSpeedEngine(Module): if random_ltd_config[RANDOM_LTD_LAYER_TOKEN_LR_SCHEDULE][RANDOM_LTD_LAYER_TOKEN_LR_ENABLED]: assert self.client_lr_scheduler is None - raise ValueError(f'not yet support') + raise ValueError('not yet support') #self.lr_scheduler = lr_schedules.WarmupLayerTokenDecayLR(self.optimizer, self.random_ltd_scheduler) def get_data_parallel_rank(self): @@ -1534,21 +1534,21 @@ class DeepSpeedEngine(Module): optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters) if not self.fp16_enabled(): - logger.warning(f"Currently the convergence of 1-bit Adam is only verified under FP16") + logger.warning("Currently the convergence of 1-bit Adam is only verified under FP16") elif self.optimizer_name() == ZERO_ONE_ADAM_OPTIMIZER: assert not self.zero_optimization(), "0/1 Adam is not compatible with ZeRO" from deepspeed.runtime.fp16.onebit.zoadam import ZeroOneAdam optimizer = ZeroOneAdam(model_parameters, self, **optimizer_parameters) if not self.fp16_enabled(): - logger.warning(f'Currently the convergence of 0/1 Adam is only verified under FP16') + logger.warning('Currently the convergence of 0/1 Adam is only verified under FP16') elif self.optimizer_name() == ONEBIT_LAMB_OPTIMIZER: assert not self.zero_optimization(), "1bit-Lamb is not compatible with ZeRO" from deepspeed.runtime.fp16.onebit.lamb import OnebitLamb optimizer = OnebitLamb(model_parameters, self, **optimizer_parameters) if not self.fp16_enabled(): - logger.warning(f"Currently the convergence of 1-bit Lamb is only verified under FP16") + logger.warning("Currently the convergence of 1-bit Lamb is only verified under FP16") elif self.optimizer_name() == LION_OPTIMIZER: if self.zero_use_cpu_optimizer(): from deepspeed.ops.lion import DeepSpeedCPULion @@ -1560,19 +1560,19 @@ class DeepSpeedEngine(Module): try: from mup import MuAdam except ImportError: - logger.error(f"Install mup to use MuAdam optimizer") + logger.error("Install mup to use MuAdam optimizer") optimizer = MuAdam(model_parameters, **optimizer_parameters) elif self.optimizer_name() == MUADAMW_OPTIMIZER: try: from mup import MuAdamW except ImportError: - logger.error(f"Install mup to use MuAdamW optimizer") + logger.error("Install mup to use MuAdamW optimizer") optimizer = MuAdamW(model_parameters, **optimizer_parameters) elif self.optimizer_name() == MUSGD_OPTIMIZER: try: from mup import MuSGD except ImportError: - logger.error(f"Install mup to use MuSGD optimizer") + logger.error("Install mup to use MuSGD optimizer") optimizer = MuSGD(model_parameters, **optimizer_parameters) else: torch_optimizer = getattr(torch.optim, self.optimizer_name()) @@ -1630,7 +1630,7 @@ class DeepSpeedEngine(Module): if isinstance(optimizer, fused_opts) \ or self.optimizer_name() in [ONEBIT_ADAM_OPTIMIZER, ZERO_ONE_ADAM_OPTIMIZER]: if self.dynamic_loss_scale(): - log_dist(f'Creating fp16 optimizer with dynamic loss scale', ranks=[0]) + log_dist('Creating fp16 optimizer with dynamic loss scale', ranks=[0]) 
timers = self.timers if self.wall_clock_breakdown() else NoopTimer() optimizer = FP16_Optimizer( optimizer, @@ -1658,7 +1658,7 @@ class DeepSpeedEngine(Module): has_moe_layers=self.has_moe_layers, ) else: - log_dist(f'Creating fp16 unfused optimizer with dynamic loss scale', ranks=[0]) + log_dist('Creating fp16 unfused optimizer with dynamic loss scale', ranks=[0]) optimizer = FP16_UnfusedOptimizer( optimizer, deepspeed=self, @@ -2214,7 +2214,7 @@ class DeepSpeedEngine(Module): if self.is_gradient_accumulation_boundary(): if self.global_rank == 0: self.summary_events = [( - f"Train/Samples/train_loss", + "Train/Samples/train_loss", self.losses.item(), self.global_samples, )] @@ -2274,7 +2274,7 @@ class DeepSpeedEngine(Module): assert not self.zero_optimization_partition_gradients(), \ f"no_sync context manager is incompatible with gradient partitioning logic of ZeRO stage {self.zero_optimization_stage()}" - assert not self.inside_no_sync_ctxt, f"no_sync context manager reentry is unsupported" + assert not self.inside_no_sync_ctxt, "no_sync context manager reentry is unsupported" self.inside_no_sync_ctxt = True try: @@ -2456,7 +2456,7 @@ class DeepSpeedEngine(Module): if (self.eigenvalue_enabled() and (self.gas_boundary_ctr % self.eigenvalue_gas_boundary_resolution() == 0) and self.quantizer.any_precision_switch()): - log_dist(f"computing eigenvalue...", ranks=[0]) + log_dist("computing eigenvalue...", ranks=[0]) self.block_eigenvalue = self.eigenvalue.compute_eigenvalue(self.module, self.device, self.optimizer.cur_scale) @@ -2482,11 +2482,11 @@ class DeepSpeedEngine(Module): if self.monitor.enabled: if self.is_gradient_accumulation_boundary(): if self.global_rank == 0: - self.summary_events = [(f"Train/Samples/lr", self.get_lr()[0], self.global_samples)] + self.summary_events = [("Train/Samples/lr", self.get_lr()[0], self.global_samples)] if self.fp16_enabled() and hasattr(self.optimizer, "cur_scale"): self.summary_events.append(( - f"Train/Samples/loss_scale", + "Train/Samples/loss_scale", self.optimizer.cur_scale, self.global_samples, )) @@ -2578,27 +2578,27 @@ class DeepSpeedEngine(Module): if self.global_rank == 0: self.summary_events = [ ( - f"Train/Samples/elapsed_time_ms_forward", + "Train/Samples/elapsed_time_ms_forward", self.timers(FORWARD_GLOBAL_TIMER).elapsed(reset=False), self.global_samples, ), ( - f"Train/Samples/elapsed_time_ms_backward", + "Train/Samples/elapsed_time_ms_backward", self.timers(BACKWARD_GLOBAL_TIMER).elapsed(reset=False), self.global_samples, ), ( - f"Train/Samples/elapsed_time_ms_backward_inner", + "Train/Samples/elapsed_time_ms_backward_inner", self.timers(BACKWARD_INNER_GLOBAL_TIMER).elapsed(reset=False), self.global_samples, ), ( - f"Train/Samples/elapsed_time_ms_backward_allreduce", + "Train/Samples/elapsed_time_ms_backward_allreduce", self.timers(BACKWARD_REDUCE_GLOBAL_TIMER).elapsed(reset=False), self.global_samples, ), ( - f"Train/Samples/elapsed_time_ms_step", + "Train/Samples/elapsed_time_ms_step", self.timers(STEP_GLOBAL_TIMER).elapsed(reset=False), self.global_samples, ), @@ -3239,7 +3239,7 @@ class DeepSpeedEngine(Module): if load_optimizer_states: deepspeed_states.append('optimizer') - client_state = {key: value for key, value in checkpoint.items() if not key in deepspeed_states} + client_state = {key: value for key, value in checkpoint.items() if key not in deepspeed_states} if optim_checkpoint is not None: client_state['optimizer'] = optim_checkpoint['optimizer'] @@ -3739,7 +3739,7 @@ class DeepSpeedEngine(Module): numel += param.ds_numel if 
hasattr(param, "ds_numel") else param.numel() shape = param.ds_shape if hasattr(param, "ds_shape") else param.shape if param not in self.param_names: - raise ValueError(f"failed to find optimizer param in named params") + raise ValueError("failed to find optimizer param in named params") name = self.param_names[param] param_shapes[name] = shape diff --git a/deepspeed/runtime/fp16/loss_scaler.py b/deepspeed/runtime/fp16/loss_scaler.py index 3f5ed019f..78c012cfa 100755 --- a/deepspeed/runtime/fp16/loss_scaler.py +++ b/deepspeed/runtime/fp16/loss_scaler.py @@ -210,7 +210,7 @@ class DynamicLossScaler(LossScalerBase): # we still create a scaler for other dtypes (fp32, bf16) which does not perform any scaling. def CreateLossScaler(dtype, static_loss_scale, dynamic_scaling, dynamic_loss_args): if dtype == torch.half and dynamic_scaling: - assert dynamic_loss_args is not None, f"Dynamic loss scaling parameters must be defined." + assert dynamic_loss_args is not None, "Dynamic loss scaling parameters must be defined." return DynamicLossScaler(dtype=dtype, **dynamic_loss_args) loss_scale_value = static_loss_scale if dtype == torch.half else 1.0 diff --git a/deepspeed/runtime/lr_schedules.py b/deepspeed/runtime/lr_schedules.py index bb0866e0d..0ff74695b 100755 --- a/deepspeed/runtime/lr_schedules.py +++ b/deepspeed/runtime/lr_schedules.py @@ -209,7 +209,7 @@ def get_config_from_args(args): if not hasattr(args, LR_SCHEDULE) or args.lr_schedule is None: return None, '--{} not specified on command line'.format(LR_SCHEDULE) - if not args.lr_schedule in VALID_LR_SCHEDULES: + if args.lr_schedule not in VALID_LR_SCHEDULES: return None, '{} is not supported LR schedule'.format(args.lr_schedule) config = {} @@ -227,16 +227,16 @@ def get_config_from_args(args): def get_lr_from_config(config): - if not 'type' in config: + if 'type' not in config: return None, 'LR schedule type not defined in config' - if not 'params' in config: + if 'params' not in config: return None, 'LR schedule params not defined in config' lr_schedule = config['type'] lr_params = config['params'] - if not lr_schedule in VALID_LR_SCHEDULES: + if lr_schedule not in VALID_LR_SCHEDULES: return None, '{} is not a valid LR schedule'.format(lr_schedule) if lr_schedule == LR_RANGE_TEST: diff --git a/deepspeed/runtime/model_checkpointing/data_parallel_writer_factory.py b/deepspeed/runtime/model_checkpointing/data_parallel_writer_factory.py index 15ed235e4..0a9740a38 100644 --- a/deepspeed/runtime/model_checkpointing/data_parallel_writer_factory.py +++ b/deepspeed/runtime/model_checkpointing/data_parallel_writer_factory.py @@ -57,7 +57,7 @@ class DataParallelWriterFactory(object): return self._create_config(1, 0) if dp_rank == 0 else None assert self._uni_parallel_info.pure_dp, \ - f'3D parallelism is not yet supported for data parallel checkpointing.' + '3D parallelism is not yet supported for data parallel checkpointing.' if self._parallel_unit == CheckpointDataParallel.REPLICA or ep_info.ep_world_size == 1: return self._get_parallel_write_for_ddp(ep_info.dp_world_size, ep_info.dp_rank) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index efeda9e7d..3ca4a3b49 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -217,7 +217,7 @@ class PipelineEngine(DeepSpeedEngine): # set activation_checkpoint_func to non_reentrant_checkpoint func. 
self.module.activation_checkpoint_func = ds_checkpointing.non_reentrant_checkpoint if self.grid.get_global_rank() == 0: - logger.info(f'CONFIG: activation_checkpoint_func=non_reentrant_checkpoint') + logger.info('CONFIG: activation_checkpoint_func=non_reentrant_checkpoint') if self.module.activation_checkpoint_interval > 0: self.module._precompute_checkpointable_values() @@ -359,7 +359,7 @@ class PipelineEngine(DeepSpeedEngine): The arithmetic mean of the losses computed this batch. """ if not torch._C.is_grad_enabled(): - raise RuntimeError(f'train_batch() requires gradients enabled. Use eval_batch() instead.') + raise RuntimeError('train_batch() requires gradients enabled. Use eval_batch() instead.') # Curriculum learning could change activation shape if self.curriculum_enabled_legacy(): @@ -408,8 +408,8 @@ class PipelineEngine(DeepSpeedEngine): # Monitoring if self.global_rank == 0 and self.monitor.enabled: - self.summary_events = [(f'Train/Samples/train_loss', self.agg_train_loss.mean().item(), - self.global_samples)] + self.summary_events = [('Train/Samples/train_loss', self.agg_train_loss.mean().item(), self.global_samples) + ] self.monitor.write_events(self.summary_events) if self.steps_per_print() is not None and self.wall_clock_breakdown( @@ -498,7 +498,7 @@ class PipelineEngine(DeepSpeedEngine): eval_output = self._bcast_pipe_scalar(eval_output) if self.global_rank == 0 and self.monitor.enabled: - self.summary_events = [(f'Train/Samples/eval_loss', eval_output.mean().item(), self.global_samples)] + self.summary_events = [('Train/Samples/eval_loss', eval_output.mean().item(), self.global_samples)] self.monitor.write_events(self.summary_events) # Restore the training iterator @@ -1220,10 +1220,9 @@ class PipelineEngine(DeepSpeedEngine): self._force_grad_boundary = False if self.global_rank == 0 and self.monitor.enabled: - self.summary_events = [(f'Train/Samples/lr', self.get_lr()[0], self.global_samples)] + self.summary_events = [('Train/Samples/lr', self.get_lr()[0], self.global_samples)] if self.fp16_enabled() and hasattr(self.optimizer, 'cur_scale'): - self.summary_events.append( - (f'Train/Samples/loss_scale', self.optimizer.cur_scale, self.global_samples)) + self.summary_events.append(('Train/Samples/loss_scale', self.optimizer.cur_scale, self.global_samples)) self.monitor.write_events(self.summary_events) if self.wall_clock_breakdown(): diff --git a/deepspeed/runtime/sequence_parallel/ulysses_sp.py b/deepspeed/runtime/sequence_parallel/ulysses_sp.py index ae41a3d8b..bf7eba7e7 100644 --- a/deepspeed/runtime/sequence_parallel/ulysses_sp.py +++ b/deepspeed/runtime/sequence_parallel/ulysses_sp.py @@ -1266,7 +1266,7 @@ class UlyssesSPFwdLossBwdWithLogits: def sp_fwd_loss_bwd(self, batch) -> torch.Tensor: - see_memory_usage(f"entered sp_fwd_loss_bwd", force=True) + see_memory_usage("entered sp_fwd_loss_bwd", force=True) # ensure shapes are correct if not (batch["input_ids"].shape == batch["position_ids"].shape == batch["labels"].shape): diff --git a/deepspeed/runtime/swap_tensor/optimizer_utils.py b/deepspeed/runtime/swap_tensor/optimizer_utils.py index 6729fd28b..f23443d05 100644 --- a/deepspeed/runtime/swap_tensor/optimizer_utils.py +++ b/deepspeed/runtime/swap_tensor/optimizer_utils.py @@ -102,7 +102,7 @@ class OptimizerStateSwapInfo(object): def get_or_create_gradient_paths(self, offsets, lengths): gradient_paths = [] for offset, length in zip(offsets, lengths): - if not offset in self.swapped_gradients.keys(): + if offset not in self.swapped_gradients.keys(): path = 
os.path.join(self.swap_folder, f'{self.param_id}_gradient_{offset}_{length}.tensor.swp') self.swapped_gradients[offset] = FlattenedTensorSwapInfo(path, length, offset) @@ -233,7 +233,7 @@ class OptimizerSwapper(object): self.timer_names.update(gradient_swapper.get_timer_names()) def _swap_out_gradients(self, parameter, gradient_offsets, gradient_tensors, gradient_swapper): - if not OptimizerSwapper.parameter_id(parameter) in self.swap_params_info.keys(): + if OptimizerSwapper.parameter_id(parameter) not in self.swap_params_info.keys(): return swap_info = self.swap_params_info[OptimizerSwapper.parameter_id(parameter)] @@ -471,7 +471,7 @@ class OptimizerSwapper(object): ) def _get_state_tensors(self, parameter): - if not parameter in self.optimizer.state: + if parameter not in self.optimizer.state: return [] tensor_list = [] @@ -490,7 +490,7 @@ class OptimizerSwapper(object): def _create_param_swap_info(self, parameter, numel): param_id = OptimizerSwapper.parameter_id(parameter) - assert not param_id in self.swap_params_info + assert param_id not in self.swap_params_info self.swap_params_info[param_id] = OptimizerStateSwapInfo(parameter=parameter, numel=numel, diff --git a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py index 3e00a1860..06a030146 100644 --- a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py +++ b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py @@ -399,8 +399,8 @@ class AsyncPartitionedParameterSwapper(object): self.partitioned_swap_pool = SwapBufferPool([self.partitioned_swap_buffer]) def swap_out_partitioned_params(self, dst_fp16_params, src_fp32_params): - assert self.partitioned_swap_buffer is not None, f'partitioned swap buffers for fp16 params not initialized' - assert self.partitioned_swap_pool is not None, f'partitioned swap pool for fp16 params not initialized' + assert self.partitioned_swap_buffer is not None, 'partitioned swap buffers for fp16 params not initialized' + assert self.partitioned_swap_pool is not None, 'partitioned swap pool for fp16 params not initialized' assert len(dst_fp16_params) == len(src_fp32_params), \ f'mismatch in number of fp16 params {len(dst_fp16_params)} and fp32 params {len(src_fp32_params)}' diff --git a/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py index 1ff570ed3..17d7a655c 100644 --- a/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py +++ b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py @@ -213,7 +213,7 @@ class PipelinedOptimizerSwapper(OptimizerSwapper): count=required_buffer_count, dtype=parameter.dtype) assert allocated_buffers is not None, \ - f"PipelinedOptimizerSwapper ran out of swap buffers, try increasing 'buffer_count'" + "PipelinedOptimizerSwapper ran out of swap buffers, try increasing 'buffer_count'" state_buffers = allocated_buffers[:num_swap_tensors] param_info.set_swap_buffers(state_buffers, aligned_numel) diff --git a/deepspeed/runtime/swap_tensor/utils.py b/deepspeed/runtime/swap_tensor/utils.py index 60a400438..3cfe95c13 100644 --- a/deepspeed/runtime/swap_tensor/utils.py +++ b/deepspeed/runtime/swap_tensor/utils.py @@ -30,7 +30,7 @@ def swap_out_tensors(swap_handle, tensor_buffers, swap_paths): def print_object(obj, name, exclude_list=[]): logger.info('{}:'.format(name)) for arg in sorted(vars(obj)): - if not arg in exclude_list: + if arg not in exclude_list: dots = '.' 
* (29 - len(arg)) logger.info(' {} {} {}'.format(arg, dots, getattr(obj, arg))) @@ -55,7 +55,7 @@ class SwapBuffer(object): def allocate_tensor(self, swap_path, numel, aligned_numel): assert self.has_space(aligned_numel) - assert not self.offset in self.swap_tensors + assert self.offset not in self.swap_tensors allocate_offset = self.offset swap_tensor = self.buffer.narrow(0, allocate_offset, aligned_numel) diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index fa65b9041..f36411c4a 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -846,7 +846,7 @@ def get_global_norm_of_tensors(input_tensors, norm_type=2, mpu=None, use_graph=F Total norm of the tensors (viewed as a single vector). """ assert isinstance(input_tensors, Iterable), f'expected Iterable type not {type(input_tensors)}' - assert all([torch.is_tensor(t) for t in input_tensors]), f'expected list of only tensors' + assert all([torch.is_tensor(t) for t in input_tensors]), 'expected list of only tensors' norm_type = float(norm_type) all_norms = [] diff --git a/deepspeed/runtime/zero/contiguous_memory_allocator.py b/deepspeed/runtime/zero/contiguous_memory_allocator.py index 35b3d5c7d..3e3b11b68 100644 --- a/deepspeed/runtime/zero/contiguous_memory_allocator.py +++ b/deepspeed/runtime/zero/contiguous_memory_allocator.py @@ -85,7 +85,7 @@ class ContiguousMemoryAllocator(object): assert tensor_id in self.tensor_map.keys(), "No such tensor allocated by the allocator." assert tensor.numel() >= numel, "Assert tensor buffer does is not large enough" - assert not tensor_id in self.id_to_params.keys(), "This tensor has already been assigned to a param" + assert tensor_id not in self.id_to_params.keys(), "This tensor has already been assigned to a param" self.id_to_params[tensor_id] = [param] diff --git a/deepspeed/runtime/zero/mics.py b/deepspeed/runtime/zero/mics.py index 628bf86a6..92b129f5d 100755 --- a/deepspeed/runtime/zero/mics.py +++ b/deepspeed/runtime/zero/mics.py @@ -47,7 +47,7 @@ class MiCS_AllGatherCoalescedHandle(AllGatherCoalescedHandle): instrument_w_nvtx(self.allgather_handle.wait)() except (ValueError, RuntimeError) as e: log_dist( - f"WARNING: Runtime Error while waiting the collective all-gather, possibly due to the _IllegalWork", + "WARNING: Runtime Error while waiting on the collective all-gather, possibly due to the _IllegalWork", ranks=[0]) log_dist(f"Error message: {e}", ranks=[0]) @@ -158,7 +158,7 @@ class MiCS_Init(Init): if sequence_data_parallel_group is not None: logger.warning( - f"sequence_data_parallel_group' is deprecated and will be removed. Use 'data_parallel_group' instead.") + "'sequence_data_parallel_group' is deprecated and will be removed. Use 'data_parallel_group' instead.") if data_parallel_group is not None: raise ValueError( "Both 'data_parallel_group' and 'sequence_data_parallel_group' were specified. Please provide only one of these arguments."
@@ -339,7 +339,7 @@ class MiCS_Offload(DeepSpeedZeRoOffload): """ overload the parent class function for convert the parameters """ - log_dist(f'Convert to zero parameters from MiCS Offload manager', ranks=[0]) + log_dist('Convert to zero parameters from MiCS Offload manager', ranks=[0]) non_zero_params = [p for p in module.parameters() if not is_zero_param(p)] if non_zero_params: zero_params = [p for p in module.parameters() if is_zero_param(p)] diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 14286b1da..b65e89d99 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -1020,7 +1020,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): if sequence_data_parallel_group is not None: logger.warning( - f"sequence_data_parallel_group' is deprecated and will be removed. Use 'data_parallel_group' instead.") + "'sequence_data_parallel_group' is deprecated and will be removed. Use 'data_parallel_group' instead.") if data_parallel_group is not None: raise ValueError( "Both 'data_parallel_group' and 'sequence_data_parallel_group' were specified. Please provide only one of these arguments." diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 1757f3c66..a9c28d898 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -459,7 +459,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.offloaded_states: Set[OffloadDeviceEnum] = set() if dist.get_rank(group=self.dp_process_group) == 0: - see_memory_usage(f"After initializing ZeRO optimizer", force=True) + see_memory_usage("After initializing ZeRO optimizer", force=True) def destroy(self): self.parameter_offload.destroy() @@ -551,7 +551,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): dist.barrier() if dist.get_rank() == 0: - logger.info(f"optimizer state initialized") + logger.info("optimizer state initialized") # IPG if self.contiguous_gradients: @@ -647,7 +647,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): nvme_swap_folder = os.path.join(offload_optimizer_config.nvme_path, 'zero_stage_3') os.makedirs(nvme_swap_folder, exist_ok=True) if dist.get_rank() == 0: - logger.info(f'Tensor Swapping: Adding optimizer tensors') + logger.info('Tensor Swapping: Adding optimizer tensors') swapper_type = PipelinedOptimizerSwapper if offload_optimizer_config.pipeline else PartitionedOptimizerSwapper @@ -797,7 +797,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): largest_partition_numel = [t.ds_numel for t in sub_group] max_partition_numel = total_elements - assert len(largest_partition_numel) > 0, f'Unexpected that largest partition is empty' + assert len(largest_partition_numel) > 0, 'Unexpected that largest partition is empty' self.fp16_groups[0][0].nvme_swapper.reserve_partitioned_swap_space(largest_partition_numel) def _get_parameter_partitions(self) -> List[Tensor]: @@ -1142,10 +1142,10 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): @instrument_w_nvtx def independent_gradient_partition_epilogue(self): - self.report_ipg_memory_usage(f"In ipg_epilogue before reduce_ipg_grads", 0) + self.report_ipg_memory_usage("In ipg_epilogue before reduce_ipg_grads", 0) for comm_dtype in sort_dtypes(self.ipg_buckets.keys()): self.__reduce_and_partition_ipg_grads(comm_dtype) - self.report_ipg_memory_usage(f"In ipg_epilogue after reduce_ipg_grads", 0) + self.report_ipg_memory_usage("In ipg_epilogue after reduce_ipg_grads", 0) if not
get_accelerator().resolves_data_dependency(): self.reduce_and_partition_stream.synchronize() @@ -1173,7 +1173,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.independent_gradient_partition_epilogue() def create_reduce_and_remove_grad_hooks(self): - print_rank_0(f'[Begin] Create gradient reduction hooks') + print_rank_0('[Begin] Create gradient reduction hooks') self.leaf_parameters = defaultdict(list) for i, param_group in enumerate(self.fp16_groups): for param in param_group: @@ -1256,7 +1256,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self._leaf_module_hooks.append(leaf_module.register_forward_pre_hook(wrapper_pre_hook(leaf_parameters))) self._leaf_module_hooks.append(leaf_module.register_forward_hook(wrapper_post_hook())) - print_rank_0(f'[End] Create gradient reduction hooks') + print_rank_0('[End] Create gradient reduction hooks') def get_param_id(self, param): return OptimizerSwapper.parameter_id(param) @@ -1426,7 +1426,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.grad_position[param_id] = [int(i), int(current_offset), int(num_elements)] #print(f"param id {param_id} i:{i}, ds_tensor {num_elements} numel {param.numel()}") current_offset += num_elements - see_memory_usage(f"After Set Grad positions", force=False) + see_memory_usage("After Set Grad positions", force=False) def _constant_buffered_norm2(self, input, buffer_size=250000000): norm = None @@ -1515,7 +1515,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.norm_for_param_grads[self.get_param_id(param)] = self._constant_buffered_norm2(grad_buffer) if self._swappable_optimizer_subgroup(i): - if not i in offload_fp32_gradients.keys(): + if i not in offload_fp32_gradients.keys(): offload_fp32_gradients[i] = [] offload_fp32_offsets[i] = [] @@ -1560,7 +1560,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): """ if not self.zero_quantized_nontrainable_weights: print_rank_0( - f"Warning: quantize_nontrainable_params() called with zero_quantized_nontrainable_weights disabled, return without doing anything", + "Warning: quantize_nontrainable_params() called with zero_quantized_nontrainable_weights disabled, returning without doing anything", force=True) return quantizer_module = CUDAQuantizer() @@ -1881,8 +1881,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): def _pre_step(self): self.micro_step_id = 0 - print_rank_0(f"Inside Step function") - see_memory_usage(f"In step before checking overflow", force=False) + print_rank_0("Inside Step function") + see_memory_usage("In step before checking overflow", force=False) print_rank_0("Finished Tracing at Beginning of Step") self._get_param_coordinator().hierarchy = 0 @@ -2084,7 +2084,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.timers.log(timer_names) see_memory_usage('After zero_optimizer step', force=False) - print_rank_0(f"------------------Finishing Step-----------------------") + print_rank_0("------------------Finishing Step-----------------------") @instrument_w_nvtx def _reassign_or_swap_out_partitioned_parameters(self, sub_group_id): @@ -2296,7 +2296,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if self.swap_optimizer: self.optimizer_swapper.pre_backward() - see_memory_usage(f"Before backward", force=False) + see_memory_usage("Before backward", force=False) if self.custom_loss_scaler: scaled_loss = self.external_loss_scale * loss @@ -2486,7 +2486,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if not param.requires_grad: return - assert hasattr(param, "ds_tensor"), f" The parameter
does not contain the partitioned copy of the tensor." + assert hasattr(param, "ds_tensor"), " The parameter does not contain the partitioned copy of the tensor." assert value.numel() == param.ds_tensor.numel( ), f" Number of elements do not match: {value.numel()} != {param.ds_tensor.ds_numel}" @@ -2961,7 +2961,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.empty_partition_cache() - assert self.optimizer.__class__ == deepspeed.ops.adam.fused_adam.FusedAdam, f"Offloading is supported only for DeepSpeed FusedAdam." + assert self.optimizer.__class__ == deepspeed.ops.adam.fused_adam.FusedAdam, "Offloading is supported only for DeepSpeed FusedAdam." def needs_offload(target): # return True diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index f8610bdaa..301a14aae 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -602,10 +602,10 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): see_memory_usage("After initializing optimizer states", force=True) if dist.get_rank() == 0: - logger.info(f"optimizer state initialized") + logger.info("optimizer state initialized") if dist.get_rank(group=self.dp_process_group) == 0: - see_memory_usage(f"After initializing ZeRO optimizer", force=True) + see_memory_usage("After initializing ZeRO optimizer", force=True) self._link_all_hp_params() self._hp_optimizer_states_linked = False @@ -722,7 +722,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): for i, tensor in enumerate(tensor_list): j = i % num_partitions - if not j in partition_tensors: + if j not in partition_tensors: partition_tensors[j] = [] partition_tensors[j].append((i, tensor)) @@ -828,9 +828,9 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): i, param_group, partition_id) def independent_gradient_partition_epilogue(self): - self.report_ipg_memory_usage(f"In ipg_epilogue before reduce_ipg_grads", 0) + self.report_ipg_memory_usage("In ipg_epilogue before reduce_ipg_grads", 0) self.reduce_ipg_grads() - self.report_ipg_memory_usage(f"In ipg_epilogue after reduce_ipg_grads", 0) + self.report_ipg_memory_usage("In ipg_epilogue after reduce_ipg_grads", 0) # if dist.get_rank() == 0: # logger.info("Params already reduced %s", self.params_already_reduced) @@ -846,7 +846,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): if self.cpu_offload is False: for i, _ in enumerate(self.bit16_groups): - if not i in self.averaged_gradients or self.averaged_gradients[i] is None: + if i not in self.averaged_gradients or self.averaged_gradients[i] is None: self.averaged_gradients[i] = self.get_flat_partition( self.params_in_partition[i], self.first_offset[i], @@ -871,7 +871,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # All gradients required by the step # are in self.averaged_gradients self.zero_grad(set_to_none=True) - see_memory_usage(f"End ipg_epilogue") + see_memory_usage("End ipg_epilogue") # resets all partition to no reduced # sets remaining grads to the total number of grads in each partition @@ -1958,7 +1958,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): """ self.micro_step_id = INITIAL_MICRO_STEP_ID - see_memory_usage(f"In step before checking overflow") + see_memory_usage("In step before checking overflow") # First compute norm for all group so we know if there is overflow if self.check_grad_overflow: @@ -2448,7 +2448,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.clip_grad = sd.get(CLIP_GRAD, self.clip_grad) ckpt_version = sd.get(DS_VERSION, False) - assert ckpt_version, f"Empty ds_version in 
checkpoint, not clear how to proceed" + assert ckpt_version, "Empty ds_version in checkpoint, not clear how to proceed" ckpt_version = pkg_version.parse(ckpt_version) # zero stage 1 mode diff --git a/deepspeed/utils/tensor_fragment.py b/deepspeed/utils/tensor_fragment.py index 305f8d56f..1947ec3d8 100644 --- a/deepspeed/utils/tensor_fragment.py +++ b/deepspeed/utils/tensor_fragment.py @@ -263,7 +263,7 @@ def safe_get_local_grad(param): Returns: Union[torch.Tensor, None]: A tensor on accelerator device """ - assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters' return param._z3_optimizer.get_local_fp32_grad_for_param(param) @@ -277,7 +277,7 @@ def safe_set_local_grad(param, value): param (``torch.nn.Parameter``): A model parameter. value (``torch.Tensor``): New value of local gradient partition. """ - assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters' param._z3_optimizer.set_local_grad_for_param(value, param) @@ -290,7 +290,7 @@ def safe_get_local_fp32_param(param): Returns: Union[torch.Tensor, None]: A tensor on accelerator device """ - assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters' return param._z3_optimizer.get_local_fp32_param(param) @@ -304,7 +304,7 @@ def safe_get_local_optimizer_state(param, optim_state_key): Returns: Union[torch.Tensor, None]: A tensor on accelerator device """ - assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters' return param._z3_optimizer.get_local_fp32_param(param, optim_state_key) @@ -316,7 +316,7 @@ def safe_set_local_optimizer_state(param, value, optim_state_key): value (``torch.Tensor``): New value of local optimizer state partition. optim_state_key (``string``): Key value of optimizer state (e.g., `exp_avg` in Adam optimizer). """ - assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters' param._z3_optimizer.set_local_hp_param(value, param, optim_state_key) @@ -327,7 +327,7 @@ def safe_set_local_fp32_param(param, value): param (``torch.nn.Parameter``): A model parameter. value (``torch.Tensor``): New value of local parameter partition. 
""" - assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters' param._z3_optimizer.set_local_hp_param(value, param) diff --git a/deepspeed/utils/timer.py b/deepspeed/utils/timer.py index 64ae8ac0e..772b099ea 100755 --- a/deepspeed/utils/timer.py +++ b/deepspeed/utils/timer.py @@ -142,7 +142,7 @@ class SynchronizedWallClockTimer: def log(self, names, normalizer=1.0, reset=True, memory_breakdown=False, ranks=None): """Log a group of timers.""" assert normalizer > 0.0 - string = f"time (ms)" + string = "time (ms)" for name in names: if name in self.timers: elapsed_time = (self.timers[name].elapsed(reset=reset) / normalizer) diff --git a/deepspeed/utils/zero_to_fp32.py b/deepspeed/utils/zero_to_fp32.py index 0e759146c..5995d6e6f 100755 --- a/deepspeed/utils/zero_to_fp32.py +++ b/deepspeed/utils/zero_to_fp32.py @@ -155,7 +155,7 @@ def parse_optim_states(files, ds_checkpoint_dir): state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) state_dicts.append(state_dict) - if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: raise ValueError(f"{files[0]} is not a zero checkpoint") zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] @@ -709,10 +709,10 @@ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. """ - logger.info(f"Extracting fp32 weights") + logger.info("Extracting fp32 weights") state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) - logger.info(f"Overwriting model with fp32 weights") + logger.info("Overwriting model with fp32 weights") model = model.cpu() model.load_state_dict(state_dict, strict=False) diff --git a/op_builder/fp_quantizer.py b/op_builder/fp_quantizer.py index 2b962ac2c..5ccc35ac2 100644 --- a/op_builder/fp_quantizer.py +++ b/op_builder/fp_quantizer.py @@ -50,7 +50,7 @@ class FPQuantizerBuilder(CUDAOpBuilder): except ImportError: if verbose: self.warning( - f"please install triton==2.3.0, 2.3.1 or 3.0.0 if you want to use the FP Quantizer Kernels") + "please install triton==2.3.0, 2.3.1 or 3.0.0 if you want to use the FP Quantizer Kernels") return False # triton 2.3.{0,1} and 3.0.0 are ok. 
diff --git a/op_builder/sparse_attn.py b/op_builder/sparse_attn.py index b8e5e8a6f..05d8ed375 100644 --- a/op_builder/sparse_attn.py +++ b/op_builder/sparse_attn.py @@ -42,7 +42,7 @@ class SparseAttnBuilder(OpBuilder): import torch except ImportError: if verbose: - self.warning(f"unable to import torch, please install it first") + self.warning("unable to import torch, please install it first") return False # torch-cpu will not have a cuda version @@ -70,7 +70,7 @@ class SparseAttnBuilder(OpBuilder): # auto-install of triton is broken on some systems, reverting to manual install for now # see this issue: https://github.com/deepspeedai/DeepSpeed/issues/1710 if verbose: - self.warning(f"please install triton==1.0.0 if you want to use sparse attention") + self.warning("please install triton==1.0.0 if you want to use sparse attention") return False if pkg_version: diff --git a/tests/benchmarks/DS4Sci_EvoformerAttention_bench.py b/tests/benchmarks/DS4Sci_EvoformerAttention_bench.py index e3d8825f5..f85a7e4b6 100644 --- a/tests/benchmarks/DS4Sci_EvoformerAttention_bench.py +++ b/tests/benchmarks/DS4Sci_EvoformerAttention_bench.py @@ -99,7 +99,7 @@ def benchmark(): with cuda_timer(baseline_bw): ref_out.backward(d_out) - print(f"batch size\tours (FW)\tbaseline (FW)\tours (BW)\tbaseline (BW)") + print("batch size\tours (FW)\tbaseline (FW)\tours (BW)\tbaseline (BW)") for i in range(len(ours_fw)): print(f"{i+1}\t{ours_fw[i]}\t{baseline_fw[i]}\t{ours_bw[i]}\t{baseline_bw[i]}") diff --git a/tests/small_model_debugging/stage3_test.py b/tests/small_model_debugging/stage3_test.py index 3a92d31f1..5bd8e728c 100644 --- a/tests/small_model_debugging/stage3_test.py +++ b/tests/small_model_debugging/stage3_test.py @@ -15,9 +15,9 @@ import deepspeed class VerboseLinear(torch.nn.Linear): def __init__(self, **kwargs): - print(f'Begin VerboseLinear.__init__') + print('Begin VerboseLinear.__init__') super().__init__(**kwargs) - print(f'End VerboseLinear.__init__') + print('End VerboseLinear.__init__') class LinearStack(torch.nn.Module): diff --git a/tests/unit/model_parallelism/test_autotp_training.py b/tests/unit/model_parallelism/test_autotp_training.py index cba285b06..af458857d 100644 --- a/tests/unit/model_parallelism/test_autotp_training.py +++ b/tests/unit/model_parallelism/test_autotp_training.py @@ -24,7 +24,7 @@ from deepspeed.runtime.utils import is_model_parallel_parameter def skip_on_device(): if get_accelerator().device_name() == 'xpu': - pytest.skip(f"XPU requires a higher version for test") + pytest.skip("XPU requires a higher version for test") class SequentialLinearModel(torch.nn.Module): @@ -449,9 +449,9 @@ class TestSave(DistributedTest): base_state_dict = base_model.state_dict() if dist.get_rank() == 0: # we should consider the case when zero3 is used in the future. - assert compare_state_dicts(base_state_dict, tp_state_dict), f"State_dict is not the same!" + assert compare_state_dicts(base_state_dict, tp_state_dict), "State_dict is not the same!" 
else: - assert tp_state_dict is None, f"noly rank0 should have the state_dict" + assert tp_state_dict is None, "only rank0 should have the state_dict" def test_ckpt_save(self, tmpdir, tp_size: int, zero_stage: int): skip_on_device() diff --git a/tests/unit/moe/test_moe.py b/tests/unit/moe/test_moe.py index 8c7c6a165..fc7faceae 100644 --- a/tests/unit/moe/test_moe.py +++ b/tests/unit/moe/test_moe.py @@ -288,7 +288,7 @@ class TestExpertWeightGradWithZero(DistributedTest): """ rank = int(deepspeed.comm.get_rank()) ep_state_dict = dict() - dst_sub_key = f"deepspeed_moe.experts.deepspeed_experts.0" + dst_sub_key = "deepspeed_moe.experts.deepspeed_experts.0" src_sub_key = f"deepspeed_moe.experts.deepspeed_experts.{rank}" for moe_layer in ["moe_1", "moe_2"]: for mlp_in_moe in [0, 1]: diff --git a/tests/unit/runtime/half_precision/onebit/test_onebit.py b/tests/unit/runtime/half_precision/onebit/test_onebit.py index e52398c83..014c29bae 100644 --- a/tests/unit/runtime/half_precision/onebit/test_onebit.py +++ b/tests/unit/runtime/half_precision/onebit/test_onebit.py @@ -149,7 +149,7 @@ class TestOneBitAdamExpAvgMask(DistributedTest): v["exp_avg"], v["exp_avg"].mul_(mask1.to(device=v["exp_avg"].device)), atol=1e-07, - ), f"Momentum mask is not working properly" + ), "Momentum mask is not working properly" class TestOneBitAdamCheckpointing(DistributedTest): @@ -241,11 +241,11 @@ class TestOneBitAdamCheckpointing(DistributedTest): assert optimizer_1.optimizer.adam_freeze_key is True mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device) assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, - atol=1e-07), f"Incorrect momentum mask" + atol=1e-07), "Incorrect momentum mask" save_folder = os.path.join(tmpdir, "saved_checkpoint") model_1.save_checkpoint(save_folder, tag=None) assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, - atol=1e-07), f"Momentum mask should not change after saving checkpoint" + atol=1e-07), "Momentum mask should not change after saving checkpoint" model_2, optimizer_2, _, _ = deepspeed.initialize( config=config_dict, @@ -255,7 +255,7 @@ class TestOneBitAdamCheckpointing(DistributedTest): # Test whether momentum mask stays the same after loading checkpoint mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device) assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, - atol=1e-07), f"Incorrect momentum mask" + atol=1e-07), "Incorrect momentum mask" model_2.load_checkpoint( save_folder, tag=None, @@ -263,11 +263,11 @@ load_optimizer_states=True, load_lr_scheduler_states=True, ) assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, - atol=1e-07), f"Momentum mask should not change after loading checkpoint" + atol=1e-07), "Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset for v in optimizer_2.state.values(): - assert "worker_error" not in v, f"Incorrect worker error" - assert "server_error" not in v, f"Incorrect server error" + assert "worker_error" not in v, "Incorrect worker error" + assert "server_error" not in v, "Incorrect server error" assert optimizer_2.optimizer.adam_freeze_key is True model_3, optimizer_3, _, _ = deepspeed.initialize( config=config_dict, @@ -287,7 +287,7 @@ class TestOneBitAdamCheckpointing(DistributedTest): model_3.step() assert optimizer_3.optimizer.adam_freeze_key is True # Test whether momentum mask stays the same after loading checkpoint - assert ("exp_avg_mask" not in
optimizer_3.param_groups[0]), f"Incorrect momentum mask" + assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), "Incorrect momentum mask" model_3.load_checkpoint( save_folder, tag=None, @@ -295,11 +295,11 @@ class TestOneBitAdamCheckpointing(DistributedTest): load_lr_scheduler_states=True, ) assert ("exp_avg_mask" - not in optimizer_3.param_groups[0]), f"Momentum mask should not change after loading checkpoint" + not in optimizer_3.param_groups[0]), "Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset for v in optimizer_3.state.values(): - assert "worker_error" not in v, f"Incorrect worker error" - assert "server_error" not in v, f"Incorrect server error" + assert "worker_error" not in v, "Incorrect worker error" + assert "server_error" not in v, "Incorrect server error" assert optimizer_3.optimizer.adam_freeze_key is False def test_overflow(self, tmpdir): @@ -518,7 +518,7 @@ class TestZeroOneAdamExpAvgMask(DistributedTest): v["exp_avg"], v["exp_avg"].mul_(mask1.to(device=v["exp_avg"].device)), atol=1e-07, - ), f"Momentum mask is not working properly" + ), "Momentum mask is not working properly" class TestZeroOneAdamCheckpointing(DistributedTest): @@ -614,11 +614,11 @@ class TestZeroOneAdamCheckpointing(DistributedTest): # Test whether momentum mask still exist after saving checkpoint mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device) assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, - atol=1e-07), f"Incorrect momentum mask" + atol=1e-07), "Incorrect momentum mask" save_folder = os.path.join(tmpdir, "saved_checkpoint") model_1.save_checkpoint(save_folder, tag=None) assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, - atol=1e-07), f"Momentum mask should not change after saving checkpoint" + atol=1e-07), "Momentum mask should not change after saving checkpoint" model_2, optimizer_2, _, _ = deepspeed.initialize( config=config_dict, @@ -628,7 +628,7 @@ class TestZeroOneAdamCheckpointing(DistributedTest): # Test whether momentum mask stays the same after loading checkpoint mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device) assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, - atol=1e-07), f"Incorrect momentum mask" + atol=1e-07), "Incorrect momentum mask" model_2.load_checkpoint( save_folder, tag=None, @@ -636,11 +636,11 @@ class TestZeroOneAdamCheckpointing(DistributedTest): load_lr_scheduler_states=True, ) assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, - atol=1e-07), f"Momentum mask should not change after loading checkpoint" + atol=1e-07), "Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset for v in optimizer_2.state.values(): - assert "worker_error" not in v, f"Incorrect worker error" - assert "server_error" not in v, f"Incorrect server error" + assert "worker_error" not in v, "Incorrect worker error" + assert "server_error" not in v, "Incorrect server error" model_3, optimizer_3, _, _ = deepspeed.initialize( config=config_dict, @@ -658,7 +658,7 @@ class TestZeroOneAdamCheckpointing(DistributedTest): model_3.backward(loss) model_3.step() # Test whether momentum mask stays the same after loading checkpoint - assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), f"Incorrect momentum mask" + assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), "Incorrect momentum mask" model_3.load_checkpoint( save_folder, tag=None, @@ -666,11 
+666,11 @@ class TestZeroOneAdamCheckpointing(DistributedTest): load_lr_scheduler_states=True, ) assert ("exp_avg_mask" - not in optimizer_3.param_groups[0]), f"Momentum mask should not change after loading checkpoint" + not in optimizer_3.param_groups[0]), "Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset for v in optimizer_3.state.values(): - assert "worker_error" not in v, f"Incorrect worker error" - assert "server_error" not in v, f"Incorrect server error" + assert "worker_error" not in v, "Incorrect worker error" + assert "server_error" not in v, "Incorrect server error" def test_overflow(self, tmpdir): if not get_accelerator().is_fp16_supported(): @@ -899,7 +899,7 @@ class TestOneBitLampExpAvgMask(DistributedTest): v["exp_avg"], v["exp_avg"].mul_(mask1.to(device=v["exp_avg"].device)), atol=1e-07, - ), f"Momentum mask is not working properly" + ), "Momentum mask is not working properly" class TestOneBitLambCheckpointing(DistributedTest): @@ -997,15 +997,15 @@ class TestOneBitLambCheckpointing(DistributedTest): assert optimizer_1.optimizer.lamb_freeze_key is True mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device) assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, - atol=1e-07), f"Incorrect momentum mask" + atol=1e-07), "Incorrect momentum mask" scaling_coeff_1 = [] for v in optimizer_1.state.values(): - assert "scaling_coeff" in v, f"Incorrect scaling_coeff" + assert "scaling_coeff" in v, "Incorrect scaling_coeff" scaling_coeff_1.append(v["scaling_coeff"]) save_folder = os.path.join(tmpdir, "saved_checkpoint") model_1.save_checkpoint(save_folder, tag=None) assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, - atol=1e-07), f"Momentum mask should not change after saving checkpoint" + atol=1e-07), "Momentum mask should not change after saving checkpoint" model_2, optimizer_2, _, _ = deepspeed.initialize( config=config_dict, @@ -1015,7 +1015,7 @@ class TestOneBitLambCheckpointing(DistributedTest): # Test whether momentum mask stays the same after loading checkpoint mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device) assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, - atol=1e-07), f"Incorrect momentum mask" + atol=1e-07), "Incorrect momentum mask" model_2.load_checkpoint( save_folder, tag=None, @@ -1023,16 +1023,16 @@ class TestOneBitLambCheckpointing(DistributedTest): load_lr_scheduler_states=True, ) assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, - atol=1e-07), f"Momentum mask should not change after loading checkpoint" + atol=1e-07), "Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset - assert len(optimizer_2.optimizer.worker_errors) == 0, f"Incorrect worker error" - assert len(optimizer_2.optimizer.server_errors) == 0, f"Incorrect server error" + assert len(optimizer_2.optimizer.worker_errors) == 0, "Incorrect worker error" + assert len(optimizer_2.optimizer.server_errors) == 0, "Incorrect server error" # Test whether scaling_coeffs is loaded correctly scaling_coeff_2 = [] for v in optimizer_2.state.values(): - assert "scaling_coeff" in v, f"Incorrect scaling_coeff" + assert "scaling_coeff" in v, "Incorrect scaling_coeff" scaling_coeff_2.append(v["scaling_coeff"]) - assert list(sorted(scaling_coeff_2)) == list(sorted(scaling_coeff_1)), f"Incorrect scaling_coeffs" + assert list(sorted(scaling_coeff_2)) == list(sorted(scaling_coeff_1)), 
"Incorrect scaling_coeffs" assert optimizer_2.optimizer.lamb_freeze_key is True model_3, optimizer_3, _, _ = deepspeed.initialize( @@ -1052,7 +1052,7 @@ class TestOneBitLambCheckpointing(DistributedTest): model_3.step() assert optimizer_3.optimizer.lamb_freeze_key is True # Test whether momentum mask stays the same after loading checkpoint - assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), f"Incorrect momentum mask" + assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), "Incorrect momentum mask" model_3.load_checkpoint( save_folder, tag=None, @@ -1060,15 +1060,15 @@ class TestOneBitLambCheckpointing(DistributedTest): load_lr_scheduler_states=True, ) assert ("exp_avg_mask" - not in optimizer_3.param_groups[0]), f"Momentum mask should not change after loading checkpoint" + not in optimizer_3.param_groups[0]), "Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset - assert len(optimizer_3.optimizer.worker_errors) == 0, f"Incorrect worker error" - assert len(optimizer_3.optimizer.server_errors) == 0, f"Incorrect server error" + assert len(optimizer_3.optimizer.worker_errors) == 0, "Incorrect worker error" + assert len(optimizer_3.optimizer.server_errors) == 0, "Incorrect server error" # Test whether scaling_coeffs, lamb_coeff_freeze, last_factor are reset for v in optimizer_3.state.values(): - assert v["lamb_coeff_freeze"] == 0.0, f"Incorrect lamb_coeff_freeze" - assert v["last_factor"] == 1.0, f"Incorrect last_factor" - assert "scaling_coeff" not in v, f"Incorrect scaling_coeff" + assert v["lamb_coeff_freeze"] == 0.0, "Incorrect lamb_coeff_freeze" + assert v["last_factor"] == 1.0, "Incorrect last_factor" + assert "scaling_coeff" not in v, "Incorrect scaling_coeff" assert optimizer_3.optimizer.lamb_freeze_key is False def test_overflow(self, tmpdir): diff --git a/tests/unit/runtime/test_data_efficiency.py b/tests/unit/runtime/test_data_efficiency.py index a52ca2982..61a91c9ab 100644 --- a/tests/unit/runtime/test_data_efficiency.py +++ b/tests/unit/runtime/test_data_efficiency.py @@ -236,4 +236,4 @@ class TestLegacyCurriculumScheduler(DistributedTest): model.step() if n + 1 in ground_truths: true_seqlen = ground_truths[n + 1] - assert seqlen == true_seqlen, f"Incorrect curriculum schedule" + assert seqlen == true_seqlen, "Incorrect curriculum schedule" diff --git a/tests/unit/runtime/test_multi_output_model.py b/tests/unit/runtime/test_multi_output_model.py index 270ac4206..ab1b37a1f 100644 --- a/tests/unit/runtime/test_multi_output_model.py +++ b/tests/unit/runtime/test_multi_output_model.py @@ -49,7 +49,7 @@ class TestTwoOutputModel(DistributedTest): targets=[1, 2]) for n, batch in enumerate(data_loader): assert len(batch) % 2 == 0, \ - f"multi_output_dataloader failed to return even number of data samples (input+target)" + "multi_output_dataloader failed to return even number of data samples (input+target)" midpoint = len(batch) // 2 inputs, targets = batch[:midpoint], batch[midpoint:] @@ -107,7 +107,7 @@ class TestThreeOutputModel(DistributedTest): targets=[1, 2, 3]) for n, batch in enumerate(data_loader): assert len(batch) % 2 == 0, \ - f"multi_output_dataloader failed to return even number of data samples (input+target)" + "multi_output_dataloader failed to return even number of data samples (input+target)" midpoint = len(batch) // 2 inputs, targets = batch[:midpoint], batch[midpoint:] diff --git a/tests/unit/runtime/zero/test_offload_states.py b/tests/unit/runtime/zero/test_offload_states.py index ef50efc3b..bf58d7a4b 
100644 --- a/tests/unit/runtime/zero/test_offload_states.py +++ b/tests/unit/runtime/zero/test_offload_states.py @@ -62,14 +62,14 @@ def run_model(model, param_groups, config_dict, hidden_dim, dtype, offloaded_sta pin_memory=pin_memory, non_blocking=non_blocking) alloc_after_offload = get_accelerator().memory_allocated() - assert alloc_after_offload < alloc_before_offload, f"Allocated memory should decrease after offload" + assert alloc_after_offload < alloc_before_offload, "Allocated memory should decrease after offload" validate_device(model, torch.device(offload_device.value), offloaded_states) # Reload states model.reload_states() assert alloc_after_offload < get_accelerator().memory_allocated( - ), f"Allocated memory should increase after offload back" + ), "Allocated memory should increase after offload back" # Verify restored states hp_param_restored = [safe_get_local_fp32_param(p) for p in model.parameters()] diff --git a/tests/unit/runtime/zero/test_zero_leaf_module.py b/tests/unit/runtime/zero/test_zero_leaf_module.py index 74c709883..6073a8c44 100644 --- a/tests/unit/runtime/zero/test_zero_leaf_module.py +++ b/tests/unit/runtime/zero/test_zero_leaf_module.py @@ -255,7 +255,7 @@ class TestZ3LeafOptimization(DistributedTest): loss, duration = bench_loss_and_time(config_dict) if dist.get_rank() == 0: - print(f"baseline exec time:", baseline_exec_time) + print("baseline exec time:", baseline_exec_time) print( f"finegrained optimziation exec time: {duration},granularity threshold:{module_granularity_threshold} " )