Fix invalid f-strings (#7457)

Fix invalid f-strings detected by ruff.
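
For context, ruff flags f-strings that contain no placeholders (its F541 check, inherited from pyflakes); the `f` prefix on such a literal does nothing, and dropping it is behavior-preserving. The sketch below is illustrative only and is not taken from this diff (the `report` helper and its messages are made up); the tree can be re-checked with `ruff check --select F541`.

    import logging

    logger = logging.getLogger(__name__)

    def report(result_dir: str) -> None:
        # Flagged by ruff (F541): the f-prefix is useless because the literal
        # has no placeholders.
        #   logger.info(f"No optimal setup is found.")
        # Fixed form: a plain string literal, identical at runtime.
        logger.info("No optimal setup is found.")
        # Kept as an f-string: this one actually interpolates a value.
        logger.info(f"The exp result is at {result_dir}.")

Most hunks below drop such redundant `f` prefixes; a few also rewrite `not x in y` as the more idiomatic `x not in y`.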

---------

Signed-off-by: cyy <cyyever@outlook.com>
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Olatunji Ruwase <tunji.ruwase@snowflake.com>
Co-authored-by: Michael Wyatt <michael.wyatt@snowflake.com>
Author: Yuanyuan Chen
Date: 2025-08-17 02:22:19 +08:00 (committed by GitHub)
parent 1d7b90adc4
commit 1c03d1b1bb
73 changed files with 231 additions and 232 deletions

View File

@@ -27,7 +27,7 @@ class HPU_Accelerator(DeepSpeedAccelerator):
            torch.utils.deterministic.fill_uninitialized_memory = False
        except ImportError as e:
            raise ValueError(
-               f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.")
+               "HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.")
        self.fp16_supported = None

View File

@@ -64,13 +64,13 @@ def get_accelerator():
                assert ipex._C._has_xpu(), "XPU_Accelerator requires an intel_extension_for_pytorch that supports XPU."
            except ImportError as e:
                raise ValueError(
-                   f"XPU_Accelerator requires intel_extension_for_pytorch, which is not installed on this system.")
+                   "XPU_Accelerator requires intel_extension_for_pytorch, which is not installed on this system.")
        elif accelerator_name == "xpu.external":
            try:
                from intel_extension_for_deepspeed import XPU_Accelerator  # noqa: F401 # type: ignore
            except ImportError as e:
                raise ValueError(
-                   f"XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system."
+                   "XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system."
                )
        elif accelerator_name == "cpu":
            pass
@@ -78,13 +78,13 @@ def get_accelerator():
            try:
                import torch_npu  # noqa: F401 # type: ignore
            except ImportError as e:
-               raise ValueError(f"NPU_Accelerator requires torch_npu, which is not installed on this system.")
+               raise ValueError("NPU_Accelerator requires torch_npu, which is not installed on this system.")
            pass
        elif accelerator_name == "sdaa":
            try:
                import torch_sdaa  # noqa: F401 # type: ignore
            except ImportError as e:
-               raise ValueError(f"SDAA_Accelerator requires torch_sdaa, which is not installed on this system.")
+               raise ValueError("SDAA_Accelerator requires torch_sdaa, which is not installed on this system.")
            pass
        elif accelerator_name == "mps":
            try:
@@ -93,18 +93,18 @@ def get_accelerator():
                # should use torch.mps.is_available() if it exists someday but this is used as proxy
                torch.mps.current_allocated_memory()
            except (RuntimeError, ImportError) as e:
-               raise ValueError(f"MPS_Accelerator requires torch.mps, which is not installed on this system.")
+               raise ValueError("MPS_Accelerator requires torch.mps, which is not installed on this system.")
        elif accelerator_name == "hpu":
            try:
                import habana_frameworks.torch.hpu  # noqa: F401
            except ImportError as e:
                raise ValueError(
-                   f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.")
+                   "HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.")
        elif accelerator_name == "mlu":
            try:
                import torch_mlu  # noqa: F401
            except ImportError as e:
-               raise ValueError(f"MLU_Accelerator requires torch_mlu, which is not installed on this system.")
+               raise ValueError("MLU_Accelerator requires torch_mlu, which is not installed on this system.")
        elif accelerator_name not in SUPPORTED_ACCELERATOR_LIST:
            raise ValueError(f'DS_ACCELERATOR must be one of {SUPPORTED_ACCELERATOR_LIST}. '
                             f'Value "{accelerator_name}" is not supported')

View File

@@ -70,10 +70,10 @@ def validate_args(args):
    error_messages = []
    if args.folder is not None and len(args.folder_to_device_mapping) > 0:
-       error_messages.append(f'--folder and --folder_to_device_mapping cannot be specified together.')
+       error_messages.append('--folder and --folder_to_device_mapping cannot be specified together.')
        no_error = False
    elif args.folder is None and len(args.folder_to_device_mapping) == 0:
-       error_messages.append(f'At least one of --folder or --folder_to_device_mapping must be specified.')
+       error_messages.append('At least one of --folder or --folder_to_device_mapping must be specified.')
        no_error = False
    # Validate --folder
@@ -102,7 +102,7 @@ def validate_args(args):
        print(f'Found {len(error_messages)} validation error(s)')
    # Validate --gpu, --use_gds
    if args.use_gds and not args.gpu:
-       error_messages.append(f'--gpu must be set to transfer with --use_gds')
+       error_messages.append('--gpu must be set to transfer with --use_gds')
        no_error = False
    if not no_error:
@@ -201,7 +201,7 @@ def get_validated_args():
    args = refine_args(args)
    if not validate_args(args):
        quit()
-   print(f'Successful validation of command line arguments')
+   print('Successful validation of command line arguments')
    args.total_loops = args.warmup_loops + args.loops
    peer_tag = 'gpu' if args.gpu else 'process'
    args.mapping_dict = _get_mapping_dict(args)

View File

@@ -54,7 +54,7 @@ class AIOBasic_Engine(object):
        task_log(tid,
                 f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}')
-       task_log(tid, f'created deepspeed aio basic engine')
+       task_log(tid, 'created deepspeed aio basic engine')
        ctxt = {}
        ctxt[FILE] = filename

View File

@@ -80,7 +80,7 @@ class AIOHandle_Engine(object):
        io_parallel = args.io_parallel if args.io_parallel else 1
        handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
                                                    not args.sequential_requests, io_parallel)
-       task_log(tid, f'created deepspeed aio handle engine')
+       task_log(tid, 'created deepspeed aio handle engine')
        bounce_buffer = None
        if args.gpu:

View File

@@ -76,7 +76,7 @@ def io_engine_tasklet(pool_params):
    task_barrier(aio_barrier, num_processes)
    # Run pre task
-   task_log(tid, f'running pre-task')
+   task_log(tid, 'running pre-task')
    io_engine = schedule["pre"]((args, tid))
    task_barrier(aio_barrier, num_processes)
@@ -91,7 +91,7 @@ def io_engine_tasklet(pool_params):
    io_engine.ctxt["main_task_sec"].append(stop_time - start_time)
    # Run post task
-   task_log(tid, f'running post-task')
+   task_log(tid, 'running post-task')
    schedule["post"]((args, tid, io_engine))
    task_barrier(aio_barrier, num_processes)

View File

@@ -12,7 +12,7 @@ from io_engine import io_engine_multiprocessing
def main():
-   print(f'Testing deepspeed_aio python frontend')
+   print('Testing deepspeed_aio python frontend')
    args = get_validated_args()
    mp.set_start_method('spawn', force=True)

View File

@@ -15,7 +15,7 @@ from deepspeed.io import FastFileWriter
class Torch_FastIO_Engine(object):
    def __init__(self, args, tid, read_op):
-       assert read_op is False, f'Read operation is not currently supported'
+       assert read_op is False, 'Read operation is not currently supported'
        self.ctxt = self._create_context(args, tid, read_op)
        self.zipfile_serialization = not args.torch_legacy_save
@@ -69,7 +69,7 @@ class Torch_FastIO_Engine(object):
        fast_io_buffer = create_page_locked_tensor(args.fast_io_size, args.use_accelerator_pin_memory, aio_handle)
-       task_log(tid, f'created torch_fastio engine')
+       task_log(tid, 'created torch_fastio engine')
        ctxt = {}
        ctxt[FILE] = filename

View File

@@ -54,7 +54,7 @@ class TorchIO_Engine(object):
                 f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}',
                 force=True)
-       task_log(tid, f'created torch_io engine')
+       task_log(tid, 'created torch_io engine')
        ctxt = {}
        ctxt[FILE] = filename

View File

@@ -145,7 +145,7 @@ class Autotuner:
                f"{best_exp['name']} is the optimal setup after tuning. The exp result is at {best_exp['result_dir']}."
            )
        else:
-           logger.info(f"No optimal setup is found. Please check that experiments were run successfully.")
+           logger.info("No optimal setup is found. Please check that experiments were run successfully.")
        tuning_duration = datetime.timedelta(seconds=(time.time() - self.start_time))
        logger.info(f"Tuning completed in {tuning_duration}")
@@ -410,7 +410,7 @@ class Autotuner:
        self.start_time = time.time()
        if self.fast_enabled():
-           logger.info(f"Fast mode is enabled. Tuning micro batch size only.")
+           logger.info("Fast mode is enabled. Tuning micro batch size only.")
        # model info profile run with DEFAULT_MIN_MEM_CONFIG
        model_info = self.model_info_profile_run()
@@ -1110,4 +1110,4 @@ class Autotuner:
            logger.info(f"Done running with the optimal DeepSpeed configuration using {self.optimal_cmd}")
        else:
-           logger.info(f"No optimal DeepSpeed configuration found by autotuning.")
+           logger.info("No optimal DeepSpeed configuration found by autotuning.")

View File

@@ -94,14 +94,14 @@ class DeepSpeedCheckpoint(object):
        return self.dp_degree != self.zero_checkpoint.get_src_dp_degree()
    def show_2d_mapping(self):
-       print(f'reshaped 2d map ---- begin')
+       print('reshaped 2d map ---- begin')
        for i in range(self.pp_degree):
            for j in range(self.tp_degree):
                file_list = self.get_2d_parallel_files(pp_index=i, tp_index=j)
                print(f'[{i}, {j}] = {file_list}')
-       print(f'reshaped 2d map ---- end')
+       print('reshaped 2d map ---- end')
    def show_tp_embedding_map(self):
        self._dump_mapping(self.tp_to_embedding_map, 'tp_to_embedding_layers')
@@ -137,7 +137,7 @@ class DeepSpeedCheckpoint(object):
        return self.layer_keys[self.final_layer_norm_idx]
    def get_iteration(self):
-       if not ITERATION_KEY in self.global_state:
+       if ITERATION_KEY not in self.global_state:
            sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False)
            self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0)
@@ -157,7 +157,7 @@ class DeepSpeedCheckpoint(object):
        return self.tp_to_embedding_map[tp_index]
    def _get_checkpoint_value(self, key):
-       if not key in self.global_state:
+       if key not in self.global_state:
            sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False)
            self.global_state[key] = sd.get(key, None)
@@ -254,7 +254,7 @@ class DeepSpeedCheckpoint(object):
            layer_file_partitions = partition_data(layer_files, self.tp_degree)
            for tp_index in range(self.tp_degree):
                map_key = (tp_index, pp_index)
-               if not map_key in file_map.keys():
+               if map_key not in file_map.keys():
                    file_map[map_key] = []
                file_map[map_key].append(layer_file_partitions[tp_index])
@@ -286,7 +286,7 @@ class DeepSpeedCheckpoint(object):
    def _merge_state_dicts(self, sd_list):
        merged_sd = {}
        for key in sd_list[0].keys():
-           if not key in SEQUENTIAL_LAYERS:
+           if key not in SEQUENTIAL_LAYERS:
                cat_dim = LAYER_CONCAT_DIM.get(key, 0)
                merged_sd[key] = torch.cat([sd[key] for sd in sd_list], dim=cat_dim)
            else:

View File

@@ -269,7 +269,7 @@ def merge_tp_slices(ds_checkpoint, dir, slice_dir, tp_degree, name_and_shape):
    step_merged = _merge_zero_shards(slice_base_path, "step", tp_degree, shape)
    if step_merged:
-       _save_checkpoint(os.path.join(param_base_path, f"step.pt"), step_merged[0])
+       _save_checkpoint(os.path.join(param_base_path, "step.pt"), step_merged[0])
    for state in ("fp32", "exp_avg", "exp_avg_sq"):
        slices = _merge_zero_shards(slice_base_path, state, tp_degree, shape)
@@ -415,7 +415,7 @@ def _save_optimizer_state(args, ds_checkpoint):
    output_sd = {k: v for k, v in optim_sd.items() if k not in sharded_states}
    output_sd[PARAM_GROUPS] = optim_sd[BASE_OPTIMIZER_STATE][PARAM_GROUPS]
    zero_output_folder = os.path.join(args.output_folder, "zero")
-   output_file_path = os.path.join(zero_output_folder, f"optimizer_state.pt")
+   output_file_path = os.path.join(zero_output_folder, "optimizer_state.pt")
    _save_checkpoint(output_file_path, output_sd)
@@ -424,7 +424,7 @@ def _save_optimizer_state_stage3(args, optim_files):
    output_sd = sd[OPTIMIZER_STATE_DICT]
    output_sd[PARAM_GROUPS] = output_sd[OPTIMIZER_STATE_DICT][PARAM_GROUPS]
    zero_output_folder = os.path.join(args.output_folder, "zero")
-   output_file_path = os.path.join(zero_output_folder, f"optimizer_state.pt")
+   output_file_path = os.path.join(zero_output_folder, "optimizer_state.pt")
    _save_checkpoint(output_file_path, output_sd)
@@ -467,7 +467,7 @@ def _check_for_required_state(ds_checkpoint):
def main(args):
-   print(f'Convert DeepSpeed Checkpoint to Universal Checkpoint')
+   print('Convert DeepSpeed Checkpoint to Universal Checkpoint')
    print(f'Converting DeepSpeed checkpoint in {args.input_folder} to Universal checkpoint in {args.output_folder}')

View File

@@ -24,7 +24,7 @@ class meg_2d_parallel_map(object):
        assert type(data) is list
        key = self._make_key(pp_index, tp_index)
-       if not key in self.map.keys():
+       if key not in self.map.keys():
            self.map[key] = []
        self.map[key] += data
@@ -84,14 +84,14 @@ def reshape_meg_2d_parallel(old_pp_degree, old_tp_degree, new_pp_degree, new_tp_
    old_2d_map = meg_2d_parallel_map(old_pp_degree, old_tp_degree)
    old_2d_map.simple_init()
    if verbose:
-       old_2d_map.print_data(f'original_2d_map:')
+       old_2d_map.print_data('original_2d_map:')
    if old_tp_degree != new_tp_degree:
        new_tp_map = _reshape_tp_dimension(old_2d_map, new_tp_degree)
    else:
        new_tp_map = old_2d_map
    if verbose:
-       new_tp_map.print_data(f'after_tp_reshape:')
+       new_tp_map.print_data('after_tp_reshape:')
    if old_pp_degree != new_pp_degree:
        final_map = _reshape_pp_dimension(new_tp_map, new_pp_degree)
@@ -99,7 +99,7 @@ def reshape_meg_2d_parallel(old_pp_degree, old_tp_degree, new_pp_degree, new_tp_
        final_map = new_tp_map
    if verbose:
-       final_map.print_data(f'final_2d_map:')
+       final_map.print_data('final_2d_map:')
    return final_map
@@ -159,7 +159,7 @@ def get_mpu_ranks(tp_size=1, pp_size=1, dp_size=1, virtual_pp_size=None):
        ranks = [data_parallel_group_ranks[i] for data_parallel_group_ranks in all_dp_group_ranks]
        all_pp_group_ranks.append(list(ranks))
-   print(f"PP", all_pp_group_ranks)
+   print("PP", all_pp_group_ranks)
    # Build the tensor model-parallel groups.
    all_tp_group_ranks = []
@@ -167,7 +167,7 @@ def get_mpu_ranks(tp_size=1, pp_size=1, dp_size=1, virtual_pp_size=None):
        ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
        all_tp_group_ranks.append(list(ranks))
-   print(f"TP", all_tp_group_ranks)
+   print("TP", all_tp_group_ranks)
    return all_tp_group_ranks, all_pp_group_ranks, all_dp_group_ranks

View File

@@ -115,7 +115,7 @@ def add_free_activations(graph_id: int, graph: Graph, activation_node_names: Lis
    def _should_free(node: Node) -> bool:
        if not hasattr(node, "meta"):
            return False
-       if not "tensor_meta" in node.meta:
+       if "tensor_meta" not in node.meta:
            return False
        return True

View File

@@ -34,7 +34,7 @@ def get_random_id() -> int:
def _should_offload(node: Node) -> bool:
    if not hasattr(node, "meta"):
        return False
-   if not "tensor_meta" in node.meta:
+   if "tensor_meta" not in node.meta:
        return False
    return True

View File

@@ -34,7 +34,7 @@ def add_reload_parameter(graph_id: int, gm: GraphModule, node: Node, ds_id: int)
    new_node = gm.graph.create_node('call_function',
                                    torch.ops.dc.reload_parameter.default,
                                    args, {},
-                                   name=f"reload_parameter")
+                                   name="reload_parameter")
    return new_node

View File

@@ -249,7 +249,7 @@ def get_sparse_pruning_shared_parameters(param_dict):
        output[SPARSE_PRUNING_DENSE_RATIO] = get_scalar_param(sub_param_dict, SPARSE_PRUNING_DENSE_RATIO,
                                                              SPARSE_PRUNING_DENSE_RATIO_DEFAULT)
        assert output[SPARSE_PRUNING_DENSE_RATIO] > 0 and output[
-           SPARSE_PRUNING_DENSE_RATIO] < 1, f"Invalid dense_ratio value. Must be less than 1"
+           SPARSE_PRUNING_DENSE_RATIO] < 1, "Invalid dense_ratio value. Must be less than 1"
        output[SPARSE_PRUNING_SCHEDULE_OFFSET_STRIDE] = get_scalar_param(
            sub_param_dict, SPARSE_PRUNING_SCHEDULE_OFFSET_STRIDE, SPARSE_PRUNING_SCHEDULE_OFFSET_STRIDE_DEFAULT)
        output[SPARSE_PRUNING_EXCLUDED_MODULES] = get_list_param(sub_param_dict, SPARSE_PRUNING_EXCLUDED_MODULES,
@@ -258,7 +258,7 @@ def get_sparse_pruning_shared_parameters(param_dict):
                                                               SPARSE_PRUNING_SCHEDULE_OFFSET_END,
                                                               output[SPARSE_PRUNING_SCHEDULE_OFFSET])
        assert output[SPARSE_PRUNING_SCHEDULE_OFFSET] <= output[
-           SPARSE_PRUNING_SCHEDULE_OFFSET_END], f"Invalid schedule_offset and schedule_offset_end values"
+           SPARSE_PRUNING_SCHEDULE_OFFSET_END], "Invalid schedule_offset and schedule_offset_end values"
    else:
        output[SPARSE_PRUNING_ENABLED] = SPARSE_PRUNING_ENABLED_DEFAULT
        output[SPARSE_PRUNING_METHOD] = SPARSE_PRUNING_METHOD_DEFAULT

View File

@@ -91,7 +91,7 @@ def installed_cann_version():
    import re
    ascend_path = installed_cann_path()
    if ascend_path is None:
-       return f"CANN_HOME does not exist, unable to compile NPU op(s)"
+       return "CANN_HOME does not exist, unable to compile NPU op(s)"
    cann_version = ""
    for dirpath, _, filenames in os.walk(os.path.realpath(ascend_path)):
        if cann_version:

View File

@@ -188,7 +188,7 @@ def main():
    if not is_torch_elastic_compatible():
        if args.enable_elastic_training:
-           logger.info(f"Disabling elastic training support as \
+           logger.info("Disabling elastic training support as \
                    PyTorch version should be greater than 1.11.x")
            args.enable_elastic_training = False

View File

@@ -59,9 +59,9 @@ def env_mapping(env, rank_name_list=None, local_rank_name_list=None):
        if rank == None:
            rank = env.get(rank_name)
        elif rank != env.get(rank_name):
-           raise EnvironmentError(f"rank number doesn't match!")
+           raise EnvironmentError("rank number doesn't match!")
    if rank == None:
-       raise EnvironmentError(f"rank number is not in current env!")
+       raise EnvironmentError("rank number is not in current env!")
    env['RANK'] = rank
    local_rank = None
@@ -70,9 +70,9 @@ def env_mapping(env, rank_name_list=None, local_rank_name_list=None):
        if local_rank == None:
            local_rank = env.get(local_rank_name)
        elif local_rank != env.get(local_rank_name):
-           raise EnvironmentError(f"local_rank number doesn't match!")
+           raise EnvironmentError("local_rank number doesn't match!")
    if local_rank == None:
-       raise EnvironmentError(f"rank number is not in current env!")
+       raise EnvironmentError("rank number is not in current env!")
    env['LOCAL_RANK'] = local_rank
    return env

View File

@@ -42,7 +42,7 @@ class QuantizedParameter(nn.Parameter):
                quantizer: Quantizer = None,
    ):
        if requires_grad:
-           raise ValueError(f"requires_grad=True is not supported with QuantizedParameter")
+           raise ValueError("requires_grad=True is not supported with QuantizedParameter")
        if data is None:
            data = torch.empty(0)
        self = torch.Tensor._make_subclass(cls, data, requires_grad)

View File

@@ -56,7 +56,7 @@ class DeepSpeedTransformerInference(nn.Module):
        if DeepSpeedTransformerInference.layer_id == 1:
            log_dist(f"DeepSpeed-Inference config: {self.config.__dict__}", [0])
            if deepspeed.HAS_TRITON and self.config.use_triton:
-               log_dist(f"Injecting Triton kernels ...", [0])
+               log_dist("Injecting Triton kernels ...", [0])
        if self.config.bigscience_bloom:
            self.attention = BloomSelfAttention(self.config, mp_group, quantize_scales, quantize_groups, merge_count)

View File

@@ -150,8 +150,8 @@ def prepare_tp_fused_qkvw(module, src, mp_size, gpu_index):
            module_name = max(module_name_matches, key=len)
            fused_type = fused_type_dict[module_name]
            return _transpose_fused_qkvw(src, mp_size, fused_type, module)
-   warning_once(f"Unrecognized fusedkqv weight type, default to using bloom type,"
-                f"please check in prepare_tp_fused_qkvw() to avoid potential calculation errors")
+   warning_once("Unrecognized fusedkqv weight type, default to using bloom type,"
+                "please check in prepare_tp_fused_qkvw() to avoid potential calculation errors")
    return _bloom_type_transpose(src, mp_size)

View File

@@ -497,7 +497,7 @@ def replace_transformer_layer(orig_layer_impl, model, checkpoint_dict, config, m
        if dist.is_initialized():
            dist.barrier()
        transformer_name = get_transformer_name(replaced_module)
-       non_tp_ckpt_name = f'non-tp.pt'
+       non_tp_ckpt_name = 'non-tp.pt'
        ckpt_files = [non_tp_ckpt_name]
        os.makedirs(config.save_mp_checkpoint_path, exist_ok=True)

View File

@@ -496,7 +496,7 @@ class TopKGate(Module):
        self.top2_2nd_expert_sampling = top2_2nd_expert_sampling
    def _set_ep_group(self, ep_group):
-       assert self.ep_group is None, f'Attempting to override an existing ep_group'
+       assert self.ep_group is None, 'Attempting to override an existing ep_group'
        self.ep_group = ep_group
    def forward(self,

View File

@@ -70,10 +70,10 @@ def validate_args(args):
    error_messages = []
    if args.folder is not None and len(args.folder_to_device_mapping) > 0:
-       error_messages.append(f'--folder and --folder_to_device_mapping cannot be specified together.')
+       error_messages.append('--folder and --folder_to_device_mapping cannot be specified together.')
        no_error = False
    elif args.folder is None and len(args.folder_to_device_mapping) == 0:
-       error_messages.append(f'At least one of --folder or --folder_to_device_mapping must be specified.')
+       error_messages.append('At least one of --folder or --folder_to_device_mapping must be specified.')
        no_error = False
    # Validate --folder
@@ -102,7 +102,7 @@ def validate_args(args):
        print(f'Found {len(error_messages)} validation error(s)')
    # Validate --gpu, --use_gds
    if args.use_gds and not args.gpu:
-       error_messages.append(f'--gpu must be set to transfer with --use_gds')
+       error_messages.append('--gpu must be set to transfer with --use_gds')
        no_error = False
    if not no_error:
@@ -201,7 +201,7 @@ def get_validated_args():
    args = refine_args(args)
    if not validate_args(args):
        quit()
-   print(f'Successful validation of command line arguments')
+   print('Successful validation of command line arguments')
    args.total_loops = args.warmup_loops + args.loops
    peer_tag = 'gpu' if args.gpu else 'process'
    args.mapping_dict = _get_mapping_dict(args)

View File

@@ -54,7 +54,7 @@ class AIOBasic_Engine(object):
        task_log(tid,
                 f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}')
-       task_log(tid, f'created deepspeed aio basic engine')
+       task_log(tid, 'created deepspeed aio basic engine')
        ctxt = {}
        ctxt[FILE] = filename

View File

@@ -95,7 +95,7 @@ class AIOHandle_Engine(object):
        else:
            handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
                                                        not args.sequential_requests, io_parallel)
-       task_log(tid, f'Created DeepNVMe handle engine')
+       task_log(tid, 'Created DeepNVMe handle engine')
        bounce_buffer = None
        if args.gpu:

View File

@@ -76,7 +76,7 @@ def io_engine_tasklet(pool_params):
    task_barrier(aio_barrier, num_processes)
    # Run pre task
-   task_log(tid, f'running pre-task')
+   task_log(tid, 'running pre-task')
    io_engine = schedule["pre"]((args, tid))
    task_barrier(aio_barrier, num_processes)
@@ -91,7 +91,7 @@ def io_engine_tasklet(pool_params):
    io_engine.ctxt["main_task_sec"].append(stop_time - start_time)
    # Run post task
-   task_log(tid, f'running post-task')
+   task_log(tid, 'running post-task')
    schedule["post"]((args, tid, io_engine))
    task_barrier(aio_barrier, num_processes)

View File

@@ -101,7 +101,7 @@ def get_metric(file, metric):
def validate_args(args):
-   if not args.metric in PERF_METRICS:
+   if args.metric not in PERF_METRICS:
        print(f'{args.metric} is not a valid performance metrics')
        return False

View File

@@ -12,7 +12,7 @@ from .io_engine import io_engine_multiprocessing
def ds_io_main():
-   print(f'Testing DeepNVMe python frontend')
+   print('Testing DeepNVMe python frontend')
    args = get_validated_args()
    mp.set_start_method('spawn', force=True)

View File

@@ -15,7 +15,7 @@ from deepspeed.io import FastFileWriter
class Torch_FastIO_Engine(object):
    def __init__(self, args, tid, read_op):
-       assert read_op is False, f'Read operation is not currently supported'
+       assert read_op is False, 'Read operation is not currently supported'
        self.ctxt = self._create_context(args, tid, read_op)
        self.zipfile_serialization = not args.torch_legacy_save
@@ -69,7 +69,7 @@ class Torch_FastIO_Engine(object):
        fast_io_buffer = create_page_locked_tensor(args.fast_io_size, args.use_accelerator_pin_memory, aio_handle)
-       task_log(tid, f'created torch_fastio engine')
+       task_log(tid, 'created torch_fastio engine')
        ctxt = {}
        ctxt[FILE] = filename

View File

@@ -54,7 +54,7 @@ class TorchIO_Engine(object):
                 f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}',
                 force=True)
-       task_log(tid, f'created torch_io engine')
+       task_log(tid, 'created torch_io engine')
        ctxt = {}
        ctxt[FILE] = filename

View File

@@ -126,7 +126,7 @@ class FP_Quantize(Quantizer):
        if scale is not None:
            assert input_q.numel() == fp_out.numel(), \
-               f'[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!'
+               '[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!'
            input_q = torch.cat([input_q.reshape(-1, self.group_size), scale], dim=-1).contiguous()
        fp_quant_module.dequantize(fp_out, input_q, self.group_size, q_mantisa_bits, q_bits - q_mantisa_bits - 1)
        return fp_out
@@ -159,7 +159,7 @@ class FP_Quantize(Quantizer):
        if scale is not None:
            assert input_q.numel() == fp_out.numel(), \
-               f'[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!'
+               '[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!'
            input_q = torch.cat([input_q.reshape(-1, self.group_size), scale], dim=-1).contiguous()
        fp_quant_module.selective_dequantize(fp_out, input_q, indexes, self.group_size, q_mantisa_bits,

View File

@@ -142,7 +142,7 @@ class FixedSparsityConfig(SparsityConfig):
        if (num_different_global_patterns > 1 and not different_layout_per_head):
            raise ValueError(
-               f'Number of different layouts cannot be more than one when you have set a single layout for all heads! Set different_layout_per_head to True.'
+               'Number of different layouts cannot be more than one when you have set a single layout for all heads! Set different_layout_per_head to True.'
            )
        if (num_different_global_patterns > (num_local_blocks // num_global_blocks)):
            raise ValueError(

View File

@@ -103,7 +103,7 @@ class TritonSelfAttention(nn.Module):
            # triton autotune table update for score/context matmul
            if triton_autotune:
-               print(f"running triton autotune for regular attention kernel")
+               print("running triton autotune for regular attention kernel")
                __class__._triton_autotune(2, self.config.max_out_tokens, self.head_size, self.config.hidden_size,
                                           self.triangular_masking, self.scale)

View File

@@ -1122,7 +1122,7 @@ def configure(
    #print configuration only once
    see_memory_usage("After configuration", force=False)
    if dist.get_rank() == 0:
-       logger.info(f"Activation Checkpointing Information")
+       logger.info("Activation Checkpointing Information")
        logger.info(f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {CPU_CHECKPOINT}")
        logger.info(f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers")
        logger.info(f"----Synchronization {SYNCHRONIZE}")

View File

@@ -54,7 +54,7 @@ class BF16_Optimizer(ZeROOptimizer):
        self.param_names = param_names
        self.using_real_optimizer = not isinstance(self.optimizer, DummyOptim)
-       assert bfloat16_config.enabled, f"BF16Optimizer: requires bfloat16 to be enabled"
+       assert bfloat16_config.enabled, "BF16Optimizer: requires bfloat16 to be enabled"
        assert grad_acc_dtype in [torch.float32, torch.bfloat16
                                  ], f"BF16Optimizer: Unsupported gradient accumulation data type: {grad_acc_dtype}"
        self.grad_acc_dtype = grad_acc_dtype
@@ -504,13 +504,13 @@ class BF16_Optimizer(ZeROOptimizer):
        current_rank_sd = state_dict_list[dp_rank]
        ckpt_version = current_rank_sd.get(DS_VERSION, False)
-       assert ckpt_version, f"Empty ds_version in checkpoint, not clear how to proceed"
+       assert ckpt_version, "Empty ds_version in checkpoint, not clear how to proceed"
        ckpt_version = pkg_version.parse(ckpt_version)
        self.clip_grad = current_rank_sd.get(CLIP_GRAD, self.clip_grad)
        if load_optimizer_states:
-           print(f"_load_legacy_checkpoint current_rank_sd[BASE_OPTIMIZER_STATE]")
+           print("_load_legacy_checkpoint current_rank_sd[BASE_OPTIMIZER_STATE]")
            self.optimizer.load_state_dict(current_rank_sd[BASE_OPTIMIZER_STATE])
        if load_from_fp32_weights:

View File

@@ -40,7 +40,7 @@ class CheckpointSize(object):
def init_decoupled_checkpoint(config_params, dp_writer_config, save_event, save_queue, optimize_dp_state):
    checkpoint_engine = FastCheckpointEngine(config_params, dp_writer_config, optimize_dp_state)
-   print(f'Created FastCheckpointEngine for Decoupled Checkpointing')
+   print('Created FastCheckpointEngine for Decoupled Checkpointing')
    save_path_list = []
    while True:
        (save_info, event_type) = save_queue.get()

View File

@@ -43,7 +43,7 @@ class NebulaCheckpointEngine(CheckpointEngine):
        self.checkpoint = torch_nebula.Checkpoint(info.tag, -2)
    def save(self, state_dict, path: str):
-       log_dist(f"[Nebula] Create dummy files for loading.")
+       log_dist("[Nebula] Create dummy files for loading.")
        torch.save("", path)
        tag = _get_tag_from_path(path)
@@ -84,7 +84,7 @@ class NebulaCheckpointEngine(CheckpointEngine):
        checkpoint = torch_nebula.get_latest_checkpoint(persist_path=self.nebula_load_path)
        if checkpoint is None or (checkpoint is not None and checkpoint.tag == ''):
            logger.info(
-               f"Unable to find latest checkpoint from Nebula tier3, try to get latest checkpoint again from nebula tier1 path!"
+               "Unable to find latest checkpoint from Nebula tier3, try to get latest checkpoint again from nebula tier1 path!"
            )
            # nebula tier1 latest
            checkpoint = torch_nebula.get_latest_checkpoint()
@@ -103,6 +103,6 @@ class NebulaCheckpointEngine(CheckpointEngine):
        logger.info(f"[Nebula] all files for {tag} are saved in tier1. It is ready to start persisting")
        commit_rls = self.checkpoint.commit()
        if not commit_rls:
-           logger.error(f"[Nebula] failed to commit the checkpoint, please check the log.")
+           logger.error("[Nebula] failed to commit the checkpoint, please check the log.")
            return False
        return commit_rls

View File

@@ -169,7 +169,7 @@ class ScientificNotationEncoder(json.JSONEncoder):
            x = [f'\n{prefix}"{k}": {self.iterencode(v, level=level)}' for k, v in o.items()]
            return "{" + ", ".join(x) + f"\n{prefix_close}" + "}"
        elif isinstance(o, collections.abc.Sequence) and not isinstance(o, str):
-           return f"[{ f', '.join(map(self.iterencode, o)) }]"
+           return f"[{ ', '.join(map(self.iterencode, o)) }]"
        return "\n, ".join(super().iterencode(o, _one_shot))

View File

@@ -73,7 +73,7 @@ class CurriculumScheduler(object):
                f"Curriculum learning with fixed_root schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_ROOT_DEGREE}'"
            if config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP] % 8 != 0:
                logger.warning(
-                   f'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.'
+                   'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.'
                )
            self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[CURRICULUM_LEARNING_SCHEDULE_CONFIG]
        elif config[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_LINEAR:
@@ -91,7 +91,7 @@ class CurriculumScheduler(object):
                f"Curriculum learning with fixed_linear schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP}'"
            if config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP] % 8 != 0:
                logger.warning(
-                   f'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.'
+                   'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.'
                )
            self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[CURRICULUM_LEARNING_SCHEDULE_CONFIG]
        elif config[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_CUSTOM:

View File

@@ -107,7 +107,7 @@ class Eigenvalue(object):
        # Disable eigenvalue if the model doesn't support second order gradients computation,
        # e.g. when enabling DS transformer kernel.
        if len(grads) == 0 or len(params) == 0:
-           log_dist(f'The model does NOT support eigenvalue computation.', ranks=[0], level=logging.WARNING)
+           log_dist('The model does NOT support eigenvalue computation.', ranks=[0], level=logging.WARNING)
            return []
        i = 0

View File

@@ -266,7 +266,7 @@ class DeepSpeedEngine(Module):
        self._do_sanity_check()
        if self.autotp_size() > 1:
            self._configure_tensor_parallel(model, self.tensor_parallel_config())
-       see_memory_usage(f"DeepSpeed Engine: After args sanity test", force=self.memory_breakdown())
+       see_memory_usage("DeepSpeed Engine: After args sanity test", force=self.memory_breakdown())
        if mpu is not None:
            if self.elasticity_enabled():
                if not self.is_elastic_model_parallel_supported():
@@ -280,7 +280,7 @@
        self.monitor = MonitorMaster(self._config.monitor_config)
        see_memory_usage(
-           f"DeepSpeed Engine: Before configure distributed model",
+           "DeepSpeed Engine: Before configure distributed model",
            force=self.memory_breakdown(),
        )
@@ -298,7 +298,7 @@
        self._get_model_parameters()
-       see_memory_usage(f"DeepSpeed Engine: After configure distributed model")
+       see_memory_usage("DeepSpeed Engine: After configure distributed model")
        # Configure wall clock timers
        self.timers = SynchronizedWallClockTimer()
@@ -507,7 +507,7 @@
            broadcast_and_check(args, bcast_rank, bcast_group)
            broadcast_and_check(kwargs, bcast_rank, bcast_group)
-           logger.info(f":The Dataloader has passed the TP group consistency check.")
+           logger.info(":The Dataloader has passed the TP group consistency check.")
            self.first_dataloader_check.remove()
        self.first_dataloader_check = self.module.register_forward_pre_hook(check_dataloader_inputs_same_across_ranks,
@@ -577,7 +577,7 @@
        """
        if train_batch_size % (self.train_micro_batch_size_per_gpu() * self.dp_world_size) != 0:
            #print(f'{train_batch_size=} {self.train_micro_batch_size_per_gpu()=} {self.dp_world_size=}')
-           raise ValueError(f'Train batch size must be divisible by micro-batch data parallelism')
+           raise ValueError('Train batch size must be divisible by micro-batch data parallelism')
        new_gas = train_batch_size // (self.train_micro_batch_size_per_gpu() * self.dp_world_size)
        # overwrite config
        self._config.train_batch_size = train_batch_size
@@ -736,7 +736,7 @@
            if random_ltd_config[RANDOM_LTD_LAYER_TOKEN_LR_SCHEDULE][RANDOM_LTD_LAYER_TOKEN_LR_ENABLED]:
                assert self.client_lr_scheduler is None
-               raise ValueError(f'not yet support')
+               raise ValueError('not yet support')
                #self.lr_scheduler = lr_schedules.WarmupLayerTokenDecayLR(self.optimizer, self.random_ltd_scheduler)
    def get_data_parallel_rank(self):
@@ -1534,21 +1534,21 @@
            optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters)
            if not self.fp16_enabled():
-               logger.warning(f"Currently the convergence of 1-bit Adam is only verified under FP16")
+               logger.warning("Currently the convergence of 1-bit Adam is only verified under FP16")
        elif self.optimizer_name() == ZERO_ONE_ADAM_OPTIMIZER:
            assert not self.zero_optimization(), "0/1 Adam is not compatible with ZeRO"
            from deepspeed.runtime.fp16.onebit.zoadam import ZeroOneAdam
            optimizer = ZeroOneAdam(model_parameters, self, **optimizer_parameters)
            if not self.fp16_enabled():
-               logger.warning(f'Currently the convergence of 0/1 Adam is only verified under FP16')
+               logger.warning('Currently the convergence of 0/1 Adam is only verified under FP16')
        elif self.optimizer_name() == ONEBIT_LAMB_OPTIMIZER:
            assert not self.zero_optimization(), "1bit-Lamb is not compatible with ZeRO"
            from deepspeed.runtime.fp16.onebit.lamb import OnebitLamb
            optimizer = OnebitLamb(model_parameters, self, **optimizer_parameters)
            if not self.fp16_enabled():
-               logger.warning(f"Currently the convergence of 1-bit Lamb is only verified under FP16")
+               logger.warning("Currently the convergence of 1-bit Lamb is only verified under FP16")
        elif self.optimizer_name() == LION_OPTIMIZER:
            if self.zero_use_cpu_optimizer():
                from deepspeed.ops.lion import DeepSpeedCPULion
@@ -1560,19 +1560,19 @@
            try:
                from mup import MuAdam
            except ImportError:
-               logger.error(f"Install mup to use MuAdam optimizer")
+               logger.error("Install mup to use MuAdam optimizer")
            optimizer = MuAdam(model_parameters, **optimizer_parameters)
        elif self.optimizer_name() == MUADAMW_OPTIMIZER:
            try:
                from mup import MuAdamW
            except ImportError:
-               logger.error(f"Install mup to use MuAdamW optimizer")
+               logger.error("Install mup to use MuAdamW optimizer")
            optimizer = MuAdamW(model_parameters, **optimizer_parameters)
        elif self.optimizer_name() == MUSGD_OPTIMIZER:
            try:
                from mup import MuSGD
            except ImportError:
-               logger.error(f"Install mup to use MuSGD optimizer")
+               logger.error("Install mup to use MuSGD optimizer")
            optimizer = MuSGD(model_parameters, **optimizer_parameters)
        else:
            torch_optimizer = getattr(torch.optim, self.optimizer_name())
@@ -1630,7 +1630,7 @@
        if isinstance(optimizer, fused_opts) \
                or self.optimizer_name() in [ONEBIT_ADAM_OPTIMIZER, ZERO_ONE_ADAM_OPTIMIZER]:
            if self.dynamic_loss_scale():
-               log_dist(f'Creating fp16 optimizer with dynamic loss scale', ranks=[0])
+               log_dist('Creating fp16 optimizer with dynamic loss scale', ranks=[0])
                timers = self.timers if self.wall_clock_breakdown() else NoopTimer()
                optimizer = FP16_Optimizer(
                    optimizer,
@@ -1658,7 +1658,7 @@
                has_moe_layers=self.has_moe_layers,
            )
        else:
-           log_dist(f'Creating fp16 unfused optimizer with dynamic loss scale', ranks=[0])
+           log_dist('Creating fp16 unfused optimizer with dynamic loss scale', ranks=[0])
            optimizer = FP16_UnfusedOptimizer(
                optimizer,
                deepspeed=self,
@@ -2214,7 +2214,7 @@
            if self.is_gradient_accumulation_boundary():
                if self.global_rank == 0:
                    self.summary_events = [(
-                       f"Train/Samples/train_loss",
+                       "Train/Samples/train_loss",
                        self.losses.item(),
                        self.global_samples,
                    )]
@@ -2274,7 +2274,7 @@
        assert not self.zero_optimization_partition_gradients(), \
            f"no_sync context manager is incompatible with gradient partitioning logic of ZeRO stage {self.zero_optimization_stage()}"
-       assert not self.inside_no_sync_ctxt, f"no_sync context manager reentry is unsupported"
+       assert not self.inside_no_sync_ctxt, "no_sync context manager reentry is unsupported"
        self.inside_no_sync_ctxt = True
        try:
@@ -2456,7 +2456,7 @@
        if (self.eigenvalue_enabled() and (self.gas_boundary_ctr % self.eigenvalue_gas_boundary_resolution() == 0)
                and self.quantizer.any_precision_switch()):
-           log_dist(f"computing eigenvalue...", ranks=[0])
+           log_dist("computing eigenvalue...", ranks=[0])
            self.block_eigenvalue = self.eigenvalue.compute_eigenvalue(self.module, self.device,
                                                                       self.optimizer.cur_scale)
@@ -2482,11 +2482,11 @@
        if self.monitor.enabled:
            if self.is_gradient_accumulation_boundary():
                if self.global_rank == 0:
-                   self.summary_events = [(f"Train/Samples/lr", self.get_lr()[0], self.global_samples)]
+                   self.summary_events = [("Train/Samples/lr", self.get_lr()[0], self.global_samples)]
                    if self.fp16_enabled() and hasattr(self.optimizer, "cur_scale"):
                        self.summary_events.append((
-                           f"Train/Samples/loss_scale",
+                           "Train/Samples/loss_scale",
                            self.optimizer.cur_scale,
                            self.global_samples,
                        ))
@@ -2578,27 +2578,27 @@
            if self.global_rank == 0:
                self.summary_events = [
                    (
-                       f"Train/Samples/elapsed_time_ms_forward",
+                       "Train/Samples/elapsed_time_ms_forward",
                        self.timers(FORWARD_GLOBAL_TIMER).elapsed(reset=False),
                        self.global_samples,
                    ),
                    (
-                       f"Train/Samples/elapsed_time_ms_backward",
+                       "Train/Samples/elapsed_time_ms_backward",
                        self.timers(BACKWARD_GLOBAL_TIMER).elapsed(reset=False),
                        self.global_samples,
                    ),
                    (
-                       f"Train/Samples/elapsed_time_ms_backward_inner",
+                       "Train/Samples/elapsed_time_ms_backward_inner",
                        self.timers(BACKWARD_INNER_GLOBAL_TIMER).elapsed(reset=False),
                        self.global_samples,
                    ),
                    (
-                       f"Train/Samples/elapsed_time_ms_backward_allreduce",
+                       "Train/Samples/elapsed_time_ms_backward_allreduce",
                        self.timers(BACKWARD_REDUCE_GLOBAL_TIMER).elapsed(reset=False),
                        self.global_samples,
                    ),
                    (
-                       f"Train/Samples/elapsed_time_ms_step",
+                       "Train/Samples/elapsed_time_ms_step",
                        self.timers(STEP_GLOBAL_TIMER).elapsed(reset=False),
                        self.global_samples,
                    ),
@@ -3239,7 +3239,7 @@
        if load_optimizer_states:
            deepspeed_states.append('optimizer')
-       client_state = {key: value for key, value in checkpoint.items() if not key in deepspeed_states}
+       client_state = {key: value for key, value in checkpoint.items() if key not in deepspeed_states}
        if optim_checkpoint is not None:
            client_state['optimizer'] = optim_checkpoint['optimizer']
@@ -3739,7 +3739,7 @@
            numel += param.ds_numel if hasattr(param, "ds_numel") else param.numel()
            shape = param.ds_shape if hasattr(param, "ds_shape") else param.shape
            if param not in self.param_names:
-               raise ValueError(f"failed to find optimizer param in named params")
+               raise ValueError("failed to find optimizer param in named params")
name = self.param_names[param] name = self.param_names[param]
param_shapes[name] = shape param_shapes[name] = shape

View File

@ -210,7 +210,7 @@ class DynamicLossScaler(LossScalerBase):
# we still create a scaler for other dtypes (fp32, bf16) which does not perform any scaling. # we still create a scaler for other dtypes (fp32, bf16) which does not perform any scaling.
def CreateLossScaler(dtype, static_loss_scale, dynamic_scaling, dynamic_loss_args): def CreateLossScaler(dtype, static_loss_scale, dynamic_scaling, dynamic_loss_args):
if dtype == torch.half and dynamic_scaling: if dtype == torch.half and dynamic_scaling:
assert dynamic_loss_args is not None, f"Dynamic loss scaling parameters must be defined." assert dynamic_loss_args is not None, "Dynamic loss scaling parameters must be defined."
return DynamicLossScaler(dtype=dtype, **dynamic_loss_args) return DynamicLossScaler(dtype=dtype, **dynamic_loss_args)
loss_scale_value = static_loss_scale if dtype == torch.half else 1.0 loss_scale_value = static_loss_scale if dtype == torch.half else 1.0

View File

@ -209,7 +209,7 @@ def get_config_from_args(args):
if not hasattr(args, LR_SCHEDULE) or args.lr_schedule is None: if not hasattr(args, LR_SCHEDULE) or args.lr_schedule is None:
return None, '--{} not specified on command line'.format(LR_SCHEDULE) return None, '--{} not specified on command line'.format(LR_SCHEDULE)
if not args.lr_schedule in VALID_LR_SCHEDULES: if args.lr_schedule not in VALID_LR_SCHEDULES:
return None, '{} is not supported LR schedule'.format(args.lr_schedule) return None, '{} is not supported LR schedule'.format(args.lr_schedule)
config = {} config = {}
@ -227,16 +227,16 @@ def get_config_from_args(args):
def get_lr_from_config(config): def get_lr_from_config(config):
if not 'type' in config: if 'type' not in config:
return None, 'LR schedule type not defined in config' return None, 'LR schedule type not defined in config'
if not 'params' in config: if 'params' not in config:
return None, 'LR schedule params not defined in config' return None, 'LR schedule params not defined in config'
lr_schedule = config['type'] lr_schedule = config['type']
lr_params = config['params'] lr_params = config['params']
if not lr_schedule in VALID_LR_SCHEDULES: if lr_schedule not in VALID_LR_SCHEDULES:
return None, '{} is not a valid LR schedule'.format(lr_schedule) return None, '{} is not a valid LR schedule'.format(lr_schedule)
if lr_schedule == LR_RANGE_TEST: if lr_schedule == LR_RANGE_TEST:

View File

@ -57,7 +57,7 @@ class DataParallelWriterFactory(object):
return self._create_config(1, 0) if dp_rank == 0 else None return self._create_config(1, 0) if dp_rank == 0 else None
assert self._uni_parallel_info.pure_dp, \ assert self._uni_parallel_info.pure_dp, \
f'3D parallelism is not yet supported for data parallel checkpointing.' '3D parallelism is not yet supported for data parallel checkpointing.'
if self._parallel_unit == CheckpointDataParallel.REPLICA or ep_info.ep_world_size == 1: if self._parallel_unit == CheckpointDataParallel.REPLICA or ep_info.ep_world_size == 1:
return self._get_parallel_write_for_ddp(ep_info.dp_world_size, ep_info.dp_rank) return self._get_parallel_write_for_ddp(ep_info.dp_world_size, ep_info.dp_rank)

View File

@ -217,7 +217,7 @@ class PipelineEngine(DeepSpeedEngine):
# set activation_checkpoint_func to non_reentrant_checkpoint func. # set activation_checkpoint_func to non_reentrant_checkpoint func.
self.module.activation_checkpoint_func = ds_checkpointing.non_reentrant_checkpoint self.module.activation_checkpoint_func = ds_checkpointing.non_reentrant_checkpoint
if self.grid.get_global_rank() == 0: if self.grid.get_global_rank() == 0:
logger.info(f'CONFIG: activation_checkpoint_func=non_reentrant_checkpoint') logger.info('CONFIG: activation_checkpoint_func=non_reentrant_checkpoint')
if self.module.activation_checkpoint_interval > 0: if self.module.activation_checkpoint_interval > 0:
self.module._precompute_checkpointable_values() self.module._precompute_checkpointable_values()
@ -359,7 +359,7 @@ class PipelineEngine(DeepSpeedEngine):
The arithmetic mean of the losses computed this batch. The arithmetic mean of the losses computed this batch.
""" """
if not torch._C.is_grad_enabled(): if not torch._C.is_grad_enabled():
raise RuntimeError(f'train_batch() requires gradients enabled. Use eval_batch() instead.') raise RuntimeError('train_batch() requires gradients enabled. Use eval_batch() instead.')
# Curriculum learning could change activation shape # Curriculum learning could change activation shape
if self.curriculum_enabled_legacy(): if self.curriculum_enabled_legacy():
@ -408,8 +408,8 @@ class PipelineEngine(DeepSpeedEngine):
# Monitoring # Monitoring
if self.global_rank == 0 and self.monitor.enabled: if self.global_rank == 0 and self.monitor.enabled:
self.summary_events = [(f'Train/Samples/train_loss', self.agg_train_loss.mean().item(), self.global_samples)] self.summary_events = [('Train/Samples/train_loss', self.agg_train_loss.mean().item(), self.global_samples)]
self.monitor.write_events(self.summary_events) self.monitor.write_events(self.summary_events)
if self.steps_per_print() is not None and self.wall_clock_breakdown( if self.steps_per_print() is not None and self.wall_clock_breakdown(
@ -498,7 +498,7 @@ class PipelineEngine(DeepSpeedEngine):
eval_output = self._bcast_pipe_scalar(eval_output) eval_output = self._bcast_pipe_scalar(eval_output)
if self.global_rank == 0 and self.monitor.enabled: if self.global_rank == 0 and self.monitor.enabled:
self.summary_events = [(f'Train/Samples/eval_loss', eval_output.mean().item(), self.global_samples)] self.summary_events = [('Train/Samples/eval_loss', eval_output.mean().item(), self.global_samples)]
self.monitor.write_events(self.summary_events) self.monitor.write_events(self.summary_events)
# Restore the training iterator # Restore the training iterator
@ -1220,10 +1220,9 @@ class PipelineEngine(DeepSpeedEngine):
self._force_grad_boundary = False self._force_grad_boundary = False
if self.global_rank == 0 and self.monitor.enabled: if self.global_rank == 0 and self.monitor.enabled:
self.summary_events = [(f'Train/Samples/lr', self.get_lr()[0], self.global_samples)] self.summary_events = [('Train/Samples/lr', self.get_lr()[0], self.global_samples)]
if self.fp16_enabled() and hasattr(self.optimizer, 'cur_scale'): if self.fp16_enabled() and hasattr(self.optimizer, 'cur_scale'):
self.summary_events.append((f'Train/Samples/loss_scale', self.optimizer.cur_scale, self.global_samples)) self.summary_events.append(('Train/Samples/loss_scale', self.optimizer.cur_scale, self.global_samples))
self.monitor.write_events(self.summary_events) self.monitor.write_events(self.summary_events)
if self.wall_clock_breakdown(): if self.wall_clock_breakdown():

View File

@ -1266,7 +1266,7 @@ class UlyssesSPFwdLossBwdWithLogits:
def sp_fwd_loss_bwd(self, batch) -> torch.Tensor: def sp_fwd_loss_bwd(self, batch) -> torch.Tensor:
see_memory_usage(f"entered sp_fwd_loss_bwd", force=True) see_memory_usage("entered sp_fwd_loss_bwd", force=True)
# ensure shapes are correct # ensure shapes are correct
if not (batch["input_ids"].shape == batch["position_ids"].shape == batch["labels"].shape): if not (batch["input_ids"].shape == batch["position_ids"].shape == batch["labels"].shape):

View File

@ -102,7 +102,7 @@ class OptimizerStateSwapInfo(object):
def get_or_create_gradient_paths(self, offsets, lengths): def get_or_create_gradient_paths(self, offsets, lengths):
gradient_paths = [] gradient_paths = []
for offset, length in zip(offsets, lengths): for offset, length in zip(offsets, lengths):
if not offset in self.swapped_gradients.keys(): if offset not in self.swapped_gradients.keys():
path = os.path.join(self.swap_folder, f'{self.param_id}_gradient_{offset}_{length}.tensor.swp') path = os.path.join(self.swap_folder, f'{self.param_id}_gradient_{offset}_{length}.tensor.swp')
self.swapped_gradients[offset] = FlattenedTensorSwapInfo(path, length, offset) self.swapped_gradients[offset] = FlattenedTensorSwapInfo(path, length, offset)
@ -233,7 +233,7 @@ class OptimizerSwapper(object):
self.timer_names.update(gradient_swapper.get_timer_names()) self.timer_names.update(gradient_swapper.get_timer_names())
def _swap_out_gradients(self, parameter, gradient_offsets, gradient_tensors, gradient_swapper): def _swap_out_gradients(self, parameter, gradient_offsets, gradient_tensors, gradient_swapper):
if not OptimizerSwapper.parameter_id(parameter) in self.swap_params_info.keys(): if OptimizerSwapper.parameter_id(parameter) not in self.swap_params_info.keys():
return return
swap_info = self.swap_params_info[OptimizerSwapper.parameter_id(parameter)] swap_info = self.swap_params_info[OptimizerSwapper.parameter_id(parameter)]
@ -471,7 +471,7 @@ class OptimizerSwapper(object):
) )
def _get_state_tensors(self, parameter): def _get_state_tensors(self, parameter):
if not parameter in self.optimizer.state: if parameter not in self.optimizer.state:
return [] return []
tensor_list = [] tensor_list = []
@ -490,7 +490,7 @@ class OptimizerSwapper(object):
def _create_param_swap_info(self, parameter, numel): def _create_param_swap_info(self, parameter, numel):
param_id = OptimizerSwapper.parameter_id(parameter) param_id = OptimizerSwapper.parameter_id(parameter)
assert not param_id in self.swap_params_info assert param_id not in self.swap_params_info
self.swap_params_info[param_id] = OptimizerStateSwapInfo(parameter=parameter, self.swap_params_info[param_id] = OptimizerStateSwapInfo(parameter=parameter,
numel=numel, numel=numel,

View File

@ -399,8 +399,8 @@ class AsyncPartitionedParameterSwapper(object):
self.partitioned_swap_pool = SwapBufferPool([self.partitioned_swap_buffer]) self.partitioned_swap_pool = SwapBufferPool([self.partitioned_swap_buffer])
def swap_out_partitioned_params(self, dst_fp16_params, src_fp32_params): def swap_out_partitioned_params(self, dst_fp16_params, src_fp32_params):
assert self.partitioned_swap_buffer is not None, f'partitioned swap buffers for fp16 params not initialized' assert self.partitioned_swap_buffer is not None, 'partitioned swap buffers for fp16 params not initialized'
assert self.partitioned_swap_pool is not None, f'partitioned swap pool for fp16 params not initialized' assert self.partitioned_swap_pool is not None, 'partitioned swap pool for fp16 params not initialized'
assert len(dst_fp16_params) == len(src_fp32_params), \ assert len(dst_fp16_params) == len(src_fp32_params), \
f'mismatch in number of fp16 params {len(dst_fp16_params)} and fp32 params {len(src_fp32_params)}' f'mismatch in number of fp16 params {len(dst_fp16_params)} and fp32 params {len(src_fp32_params)}'

View File

@ -213,7 +213,7 @@ class PipelinedOptimizerSwapper(OptimizerSwapper):
count=required_buffer_count, count=required_buffer_count,
dtype=parameter.dtype) dtype=parameter.dtype)
assert allocated_buffers is not None, \ assert allocated_buffers is not None, \
f"PipelinedOptimizerSwapper ran out of swap buffers, try increasing 'buffer_count'" "PipelinedOptimizerSwapper ran out of swap buffers, try increasing 'buffer_count'"
state_buffers = allocated_buffers[:num_swap_tensors] state_buffers = allocated_buffers[:num_swap_tensors]
param_info.set_swap_buffers(state_buffers, aligned_numel) param_info.set_swap_buffers(state_buffers, aligned_numel)

View File

@ -30,7 +30,7 @@ def swap_out_tensors(swap_handle, tensor_buffers, swap_paths):
def print_object(obj, name, exclude_list=[]): def print_object(obj, name, exclude_list=[]):
logger.info('{}:'.format(name)) logger.info('{}:'.format(name))
for arg in sorted(vars(obj)): for arg in sorted(vars(obj)):
if not arg in exclude_list: if arg not in exclude_list:
dots = '.' * (29 - len(arg)) dots = '.' * (29 - len(arg))
logger.info(' {} {} {}'.format(arg, dots, getattr(obj, arg))) logger.info(' {} {} {}'.format(arg, dots, getattr(obj, arg)))
@ -55,7 +55,7 @@ class SwapBuffer(object):
def allocate_tensor(self, swap_path, numel, aligned_numel): def allocate_tensor(self, swap_path, numel, aligned_numel):
assert self.has_space(aligned_numel) assert self.has_space(aligned_numel)
assert not self.offset in self.swap_tensors assert self.offset not in self.swap_tensors
allocate_offset = self.offset allocate_offset = self.offset
swap_tensor = self.buffer.narrow(0, allocate_offset, aligned_numel) swap_tensor = self.buffer.narrow(0, allocate_offset, aligned_numel)

View File

@ -846,7 +846,7 @@ def get_global_norm_of_tensors(input_tensors, norm_type=2, mpu=None, use_graph=F
Total norm of the tensors (viewed as a single vector). Total norm of the tensors (viewed as a single vector).
""" """
assert isinstance(input_tensors, Iterable), f'expected Iterable type not {type(input_tensors)}' assert isinstance(input_tensors, Iterable), f'expected Iterable type not {type(input_tensors)}'
assert all([torch.is_tensor(t) for t in input_tensors]), f'expected list of only tensors' assert all([torch.is_tensor(t) for t in input_tensors]), 'expected list of only tensors'
norm_type = float(norm_type) norm_type = float(norm_type)
all_norms = [] all_norms = []

View File

@ -85,7 +85,7 @@ class ContiguousMemoryAllocator(object):
assert tensor_id in self.tensor_map.keys(), "No such tensor allocated by the allocator." assert tensor_id in self.tensor_map.keys(), "No such tensor allocated by the allocator."
assert tensor.numel() >= numel, "Assert tensor buffer does is not large enough" assert tensor.numel() >= numel, "Assert tensor buffer does is not large enough"
assert not tensor_id in self.id_to_params.keys(), "This tensor has already been assigned to a param" assert tensor_id not in self.id_to_params.keys(), "This tensor has already been assigned to a param"
self.id_to_params[tensor_id] = [param] self.id_to_params[tensor_id] = [param]

View File

@ -47,7 +47,7 @@ class MiCS_AllGatherCoalescedHandle(AllGatherCoalescedHandle):
instrument_w_nvtx(self.allgather_handle.wait)() instrument_w_nvtx(self.allgather_handle.wait)()
except (ValueError, RuntimeError) as e: except (ValueError, RuntimeError) as e:
log_dist( log_dist(
f"WARNING: Runtime Error while waiting the collective all-gather, possibly due to the _IllegalWork", "WARNING: Runtime Error while waiting the collective all-gather, possibly due to the _IllegalWork",
ranks=[0]) ranks=[0])
log_dist(f"Error message: {e}", ranks=[0]) log_dist(f"Error message: {e}", ranks=[0])
@ -158,7 +158,7 @@ class MiCS_Init(Init):
if sequence_data_parallel_group is not None: if sequence_data_parallel_group is not None:
logger.warning( logger.warning(
f"sequence_data_parallel_group' is deprecated and will be removed. Use 'data_parallel_group' instead.") "sequence_data_parallel_group' is deprecated and will be removed. Use 'data_parallel_group' instead.")
if data_parallel_group is not None: if data_parallel_group is not None:
raise ValueError( raise ValueError(
"Both 'data_parallel_group' and 'sequence_data_parallel_group' were specified. Please provide only one of these arguments." "Both 'data_parallel_group' and 'sequence_data_parallel_group' were specified. Please provide only one of these arguments."
@ -339,7 +339,7 @@ class MiCS_Offload(DeepSpeedZeRoOffload):
""" overload the parent class function for convert the parameters """ overload the parent class function for convert the parameters
""" """
log_dist(f'Convert to zero parameters from MiCS Offload manager', ranks=[0]) log_dist('Convert to zero parameters from MiCS Offload manager', ranks=[0])
non_zero_params = [p for p in module.parameters() if not is_zero_param(p)] non_zero_params = [p for p in module.parameters() if not is_zero_param(p)]
if non_zero_params: if non_zero_params:
zero_params = [p for p in module.parameters() if is_zero_param(p)] zero_params = [p for p in module.parameters() if is_zero_param(p)]

View File

@ -1020,7 +1020,7 @@ class Init(InsertPostInitMethodToModuleSubClasses):
if sequence_data_parallel_group is not None: if sequence_data_parallel_group is not None:
logger.warning( logger.warning(
f"sequence_data_parallel_group' is deprecated and will be removed. Use 'data_parallel_group' instead.") "sequence_data_parallel_group' is deprecated and will be removed. Use 'data_parallel_group' instead.")
if data_parallel_group is not None: if data_parallel_group is not None:
raise ValueError( raise ValueError(
"Both 'data_parallel_group' and 'sequence_data_parallel_group' were specified. Please provide only one of these arguments." "Both 'data_parallel_group' and 'sequence_data_parallel_group' were specified. Please provide only one of these arguments."

View File

@ -459,7 +459,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
self.offloaded_states: Set[OffloadDeviceEnum] = set() self.offloaded_states: Set[OffloadDeviceEnum] = set()
if dist.get_rank(group=self.dp_process_group) == 0: if dist.get_rank(group=self.dp_process_group) == 0:
see_memory_usage(f"After initializing ZeRO optimizer", force=True) see_memory_usage("After initializing ZeRO optimizer", force=True)
def destroy(self): def destroy(self):
self.parameter_offload.destroy() self.parameter_offload.destroy()
@ -551,7 +551,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
dist.barrier() dist.barrier()
if dist.get_rank() == 0: if dist.get_rank() == 0:
logger.info(f"optimizer state initialized") logger.info("optimizer state initialized")
# IPG # IPG
if self.contiguous_gradients: if self.contiguous_gradients:
@ -647,7 +647,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
nvme_swap_folder = os.path.join(offload_optimizer_config.nvme_path, 'zero_stage_3') nvme_swap_folder = os.path.join(offload_optimizer_config.nvme_path, 'zero_stage_3')
os.makedirs(nvme_swap_folder, exist_ok=True) os.makedirs(nvme_swap_folder, exist_ok=True)
if dist.get_rank() == 0: if dist.get_rank() == 0:
logger.info(f'Tensor Swapping: Adding optimizer tensors') logger.info('Tensor Swapping: Adding optimizer tensors')
swapper_type = PipelinedOptimizerSwapper if offload_optimizer_config.pipeline else PartitionedOptimizerSwapper swapper_type = PipelinedOptimizerSwapper if offload_optimizer_config.pipeline else PartitionedOptimizerSwapper
@ -797,7 +797,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
largest_partition_numel = [t.ds_numel for t in sub_group] largest_partition_numel = [t.ds_numel for t in sub_group]
max_partition_numel = total_elements max_partition_numel = total_elements
assert len(largest_partition_numel) > 0, f'Unexpected that largest partition is empty' assert len(largest_partition_numel) > 0, 'Unexpected that largest partition is empty'
self.fp16_groups[0][0].nvme_swapper.reserve_partitioned_swap_space(largest_partition_numel) self.fp16_groups[0][0].nvme_swapper.reserve_partitioned_swap_space(largest_partition_numel)
def _get_parameter_partitions(self) -> List[Tensor]: def _get_parameter_partitions(self) -> List[Tensor]:
@ -1142,10 +1142,10 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
@instrument_w_nvtx @instrument_w_nvtx
def independent_gradient_partition_epilogue(self): def independent_gradient_partition_epilogue(self):
self.report_ipg_memory_usage(f"In ipg_epilogue before reduce_ipg_grads", 0) self.report_ipg_memory_usage("In ipg_epilogue before reduce_ipg_grads", 0)
for comm_dtype in sort_dtypes(self.ipg_buckets.keys()): for comm_dtype in sort_dtypes(self.ipg_buckets.keys()):
self.__reduce_and_partition_ipg_grads(comm_dtype) self.__reduce_and_partition_ipg_grads(comm_dtype)
self.report_ipg_memory_usage(f"In ipg_epilogue after reduce_ipg_grads", 0) self.report_ipg_memory_usage("In ipg_epilogue after reduce_ipg_grads", 0)
if not get_accelerator().resolves_data_dependency(): if not get_accelerator().resolves_data_dependency():
self.reduce_and_partition_stream.synchronize() self.reduce_and_partition_stream.synchronize()
@ -1173,7 +1173,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
self.independent_gradient_partition_epilogue() self.independent_gradient_partition_epilogue()
def create_reduce_and_remove_grad_hooks(self): def create_reduce_and_remove_grad_hooks(self):
print_rank_0(f'[Begin] Create gradient reduction hooks') print_rank_0('[Begin] Create gradient reduction hooks')
self.leaf_parameters = defaultdict(list) self.leaf_parameters = defaultdict(list)
for i, param_group in enumerate(self.fp16_groups): for i, param_group in enumerate(self.fp16_groups):
for param in param_group: for param in param_group:
@ -1256,7 +1256,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
self._leaf_module_hooks.append(leaf_module.register_forward_pre_hook(wrapper_pre_hook(leaf_parameters))) self._leaf_module_hooks.append(leaf_module.register_forward_pre_hook(wrapper_pre_hook(leaf_parameters)))
self._leaf_module_hooks.append(leaf_module.register_forward_hook(wrapper_post_hook())) self._leaf_module_hooks.append(leaf_module.register_forward_hook(wrapper_post_hook()))
print_rank_0(f'[End] Create gradient reduction hooks') print_rank_0('[End] Create gradient reduction hooks')
def get_param_id(self, param): def get_param_id(self, param):
return OptimizerSwapper.parameter_id(param) return OptimizerSwapper.parameter_id(param)
@ -1426,7 +1426,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
self.grad_position[param_id] = [int(i), int(current_offset), int(num_elements)] self.grad_position[param_id] = [int(i), int(current_offset), int(num_elements)]
#print(f"param id {param_id} i:{i}, ds_tensor {num_elements} numel {param.numel()}") #print(f"param id {param_id} i:{i}, ds_tensor {num_elements} numel {param.numel()}")
current_offset += num_elements current_offset += num_elements
see_memory_usage(f"After Set Grad positions", force=False) see_memory_usage("After Set Grad positions", force=False)
def _constant_buffered_norm2(self, input, buffer_size=250000000): def _constant_buffered_norm2(self, input, buffer_size=250000000):
norm = None norm = None
@ -1515,7 +1515,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
self.norm_for_param_grads[self.get_param_id(param)] = self._constant_buffered_norm2(grad_buffer) self.norm_for_param_grads[self.get_param_id(param)] = self._constant_buffered_norm2(grad_buffer)
if self._swappable_optimizer_subgroup(i): if self._swappable_optimizer_subgroup(i):
if not i in offload_fp32_gradients.keys(): if i not in offload_fp32_gradients.keys():
offload_fp32_gradients[i] = [] offload_fp32_gradients[i] = []
offload_fp32_offsets[i] = [] offload_fp32_offsets[i] = []
@ -1560,7 +1560,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
""" """
if not self.zero_quantized_nontrainable_weights: if not self.zero_quantized_nontrainable_weights:
print_rank_0( print_rank_0(
f"Warning: quantize_nontrainable_params() called with zero_quantized_nontrainable_weights disabled, return without doing anything", "Warning: quantize_nontrainable_params() called with zero_quantized_nontrainable_weights disabled, return without doing anything",
force=True) force=True)
return return
quantizer_module = CUDAQuantizer() quantizer_module = CUDAQuantizer()
@ -1881,8 +1881,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
def _pre_step(self): def _pre_step(self):
self.micro_step_id = 0 self.micro_step_id = 0
print_rank_0(f"Inside Step function") print_rank_0("Inside Step function")
see_memory_usage(f"In step before checking overflow", force=False) see_memory_usage("In step before checking overflow", force=False)
print_rank_0("Finished Tracing at Beginning of Step") print_rank_0("Finished Tracing at Beginning of Step")
self._get_param_coordinator().hierarchy = 0 self._get_param_coordinator().hierarchy = 0
@ -2084,7 +2084,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
self.timers.log(timer_names) self.timers.log(timer_names)
see_memory_usage('After zero_optimizer step', force=False) see_memory_usage('After zero_optimizer step', force=False)
print_rank_0(f"------------------Finishing Step-----------------------") print_rank_0("------------------Finishing Step-----------------------")
@instrument_w_nvtx @instrument_w_nvtx
def _reassign_or_swap_out_partitioned_parameters(self, sub_group_id): def _reassign_or_swap_out_partitioned_parameters(self, sub_group_id):
@ -2296,7 +2296,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
if self.swap_optimizer: if self.swap_optimizer:
self.optimizer_swapper.pre_backward() self.optimizer_swapper.pre_backward()
see_memory_usage(f"Before backward", force=False) see_memory_usage("Before backward", force=False)
if self.custom_loss_scaler: if self.custom_loss_scaler:
scaled_loss = self.external_loss_scale * loss scaled_loss = self.external_loss_scale * loss
@ -2486,7 +2486,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
if not param.requires_grad: if not param.requires_grad:
return return
assert hasattr(param, "ds_tensor"), f" The parameter does not contain the partitioned copy of the tensor." assert hasattr(param, "ds_tensor"), " The parameter does not contain the partitioned copy of the tensor."
assert value.numel() == param.ds_tensor.numel( assert value.numel() == param.ds_tensor.numel(
), f" Number of elements do not match: {value.numel()} != {param.ds_tensor.ds_numel}" ), f" Number of elements do not match: {value.numel()} != {param.ds_tensor.ds_numel}"
@ -2961,7 +2961,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
self.empty_partition_cache() self.empty_partition_cache()
assert self.optimizer.__class__ == deepspeed.ops.adam.fused_adam.FusedAdam, f"Offloading is supported only for DeepSpeed FusedAdam." assert self.optimizer.__class__ == deepspeed.ops.adam.fused_adam.FusedAdam, "Offloading is supported only for DeepSpeed FusedAdam."
def needs_offload(target): def needs_offload(target):
# return True # return True

View File

@ -602,10 +602,10 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
see_memory_usage("After initializing optimizer states", force=True) see_memory_usage("After initializing optimizer states", force=True)
if dist.get_rank() == 0: if dist.get_rank() == 0:
logger.info(f"optimizer state initialized") logger.info("optimizer state initialized")
if dist.get_rank(group=self.dp_process_group) == 0: if dist.get_rank(group=self.dp_process_group) == 0:
see_memory_usage(f"After initializing ZeRO optimizer", force=True) see_memory_usage("After initializing ZeRO optimizer", force=True)
self._link_all_hp_params() self._link_all_hp_params()
self._hp_optimizer_states_linked = False self._hp_optimizer_states_linked = False
@ -722,7 +722,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
for i, tensor in enumerate(tensor_list): for i, tensor in enumerate(tensor_list):
j = i % num_partitions j = i % num_partitions
if not j in partition_tensors: if j not in partition_tensors:
partition_tensors[j] = [] partition_tensors[j] = []
partition_tensors[j].append((i, tensor)) partition_tensors[j].append((i, tensor))
@ -828,9 +828,9 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
i, param_group, partition_id) i, param_group, partition_id)
def independent_gradient_partition_epilogue(self): def independent_gradient_partition_epilogue(self):
self.report_ipg_memory_usage(f"In ipg_epilogue before reduce_ipg_grads", 0) self.report_ipg_memory_usage("In ipg_epilogue before reduce_ipg_grads", 0)
self.reduce_ipg_grads() self.reduce_ipg_grads()
self.report_ipg_memory_usage(f"In ipg_epilogue after reduce_ipg_grads", 0) self.report_ipg_memory_usage("In ipg_epilogue after reduce_ipg_grads", 0)
# if dist.get_rank() == 0: # if dist.get_rank() == 0:
# logger.info("Params already reduced %s", self.params_already_reduced) # logger.info("Params already reduced %s", self.params_already_reduced)
@ -846,7 +846,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
if self.cpu_offload is False: if self.cpu_offload is False:
for i, _ in enumerate(self.bit16_groups): for i, _ in enumerate(self.bit16_groups):
if not i in self.averaged_gradients or self.averaged_gradients[i] is None: if i not in self.averaged_gradients or self.averaged_gradients[i] is None:
self.averaged_gradients[i] = self.get_flat_partition( self.averaged_gradients[i] = self.get_flat_partition(
self.params_in_partition[i], self.params_in_partition[i],
self.first_offset[i], self.first_offset[i],
@ -871,7 +871,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
# All gradients required by the step # All gradients required by the step
# are in self.averaged_gradients # are in self.averaged_gradients
self.zero_grad(set_to_none=True) self.zero_grad(set_to_none=True)
see_memory_usage(f"End ipg_epilogue") see_memory_usage("End ipg_epilogue")
# resets all partition to no reduced # resets all partition to no reduced
# sets remaining grads to the total number of grads in each partition # sets remaining grads to the total number of grads in each partition
@ -1958,7 +1958,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
""" """
self.micro_step_id = INITIAL_MICRO_STEP_ID self.micro_step_id = INITIAL_MICRO_STEP_ID
see_memory_usage(f"In step before checking overflow") see_memory_usage("In step before checking overflow")
# First compute norm for all group so we know if there is overflow # First compute norm for all group so we know if there is overflow
if self.check_grad_overflow: if self.check_grad_overflow:
@ -2448,7 +2448,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
self.clip_grad = sd.get(CLIP_GRAD, self.clip_grad) self.clip_grad = sd.get(CLIP_GRAD, self.clip_grad)
ckpt_version = sd.get(DS_VERSION, False) ckpt_version = sd.get(DS_VERSION, False)
assert ckpt_version, f"Empty ds_version in checkpoint, not clear how to proceed" assert ckpt_version, "Empty ds_version in checkpoint, not clear how to proceed"
ckpt_version = pkg_version.parse(ckpt_version) ckpt_version = pkg_version.parse(ckpt_version)
# zero stage 1 mode # zero stage 1 mode

View File

@ -263,7 +263,7 @@ def safe_get_local_grad(param):
Returns: Returns:
Union[torch.Tensor, None]: A tensor on accelerator device Union[torch.Tensor, None]: A tensor on accelerator device
""" """
assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters'
return param._z3_optimizer.get_local_fp32_grad_for_param(param) return param._z3_optimizer.get_local_fp32_grad_for_param(param)
@ -277,7 +277,7 @@ def safe_set_local_grad(param, value):
param (``torch.nn.Parameter``): A model parameter. param (``torch.nn.Parameter``): A model parameter.
value (``torch.Tensor``): New value of local gradient partition. value (``torch.Tensor``): New value of local gradient partition.
""" """
assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters'
param._z3_optimizer.set_local_grad_for_param(value, param) param._z3_optimizer.set_local_grad_for_param(value, param)
@ -290,7 +290,7 @@ def safe_get_local_fp32_param(param):
Returns: Returns:
Union[torch.Tensor, None]: A tensor on accelerator device Union[torch.Tensor, None]: A tensor on accelerator device
""" """
assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters'
return param._z3_optimizer.get_local_fp32_param(param) return param._z3_optimizer.get_local_fp32_param(param)
@ -304,7 +304,7 @@ def safe_get_local_optimizer_state(param, optim_state_key):
Returns: Returns:
Union[torch.Tensor, None]: A tensor on accelerator device Union[torch.Tensor, None]: A tensor on accelerator device
""" """
assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters'
return param._z3_optimizer.get_local_fp32_param(param, optim_state_key) return param._z3_optimizer.get_local_fp32_param(param, optim_state_key)
@ -316,7 +316,7 @@ def safe_set_local_optimizer_state(param, value, optim_state_key):
value (``torch.Tensor``): New value of local optimizer state partition. value (``torch.Tensor``): New value of local optimizer state partition.
optim_state_key (``string``): Key value of optimizer state (e.g., `exp_avg` in Adam optimizer). optim_state_key (``string``): Key value of optimizer state (e.g., `exp_avg` in Adam optimizer).
""" """
assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters'
param._z3_optimizer.set_local_hp_param(value, param, optim_state_key) param._z3_optimizer.set_local_hp_param(value, param, optim_state_key)
@ -327,7 +327,7 @@ def safe_set_local_fp32_param(param, value):
param (``torch.nn.Parameter``): A model parameter. param (``torch.nn.Parameter``): A model parameter.
value (``torch.Tensor``): New value of local parameter partition. value (``torch.Tensor``): New value of local parameter partition.
""" """
assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters'
param._z3_optimizer.set_local_hp_param(value, param) param._z3_optimizer.set_local_hp_param(value, param)

View File

@ -142,7 +142,7 @@ class SynchronizedWallClockTimer:
def log(self, names, normalizer=1.0, reset=True, memory_breakdown=False, ranks=None): def log(self, names, normalizer=1.0, reset=True, memory_breakdown=False, ranks=None):
"""Log a group of timers.""" """Log a group of timers."""
assert normalizer > 0.0 assert normalizer > 0.0
string = f"time (ms)" string = "time (ms)"
for name in names: for name in names:
if name in self.timers: if name in self.timers:
elapsed_time = (self.timers[name].elapsed(reset=reset) / normalizer) elapsed_time = (self.timers[name].elapsed(reset=reset) / normalizer)

View File

@ -155,7 +155,7 @@ def parse_optim_states(files, ds_checkpoint_dir):
state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
state_dicts.append(state_dict) state_dicts.append(state_dict)
if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
raise ValueError(f"{files[0]} is not a zero checkpoint") raise ValueError(f"{files[0]} is not a zero checkpoint")
zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
@ -709,10 +709,10 @@ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
""" """
logger.info(f"Extracting fp32 weights") logger.info("Extracting fp32 weights")
state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
logger.info(f"Overwriting model with fp32 weights") logger.info("Overwriting model with fp32 weights")
model = model.cpu() model = model.cpu()
model.load_state_dict(state_dict, strict=False) model.load_state_dict(state_dict, strict=False)

View File

@ -50,7 +50,7 @@ class FPQuantizerBuilder(CUDAOpBuilder):
except ImportError: except ImportError:
if verbose: if verbose:
self.warning( self.warning(
f"please install triton==2.3.0, 2.3.1 or 3.0.0 if you want to use the FP Quantizer Kernels") "please install triton==2.3.0, 2.3.1 or 3.0.0 if you want to use the FP Quantizer Kernels")
return False return False
# triton 2.3.{0,1} and 3.0.0 are ok. # triton 2.3.{0,1} and 3.0.0 are ok.

View File

@ -42,7 +42,7 @@ class SparseAttnBuilder(OpBuilder):
import torch import torch
except ImportError: except ImportError:
if verbose: if verbose:
self.warning(f"unable to import torch, please install it first") self.warning("unable to import torch, please install it first")
return False return False
# torch-cpu will not have a cuda version # torch-cpu will not have a cuda version
@ -70,7 +70,7 @@ class SparseAttnBuilder(OpBuilder):
# auto-install of triton is broken on some systems, reverting to manual install for now # auto-install of triton is broken on some systems, reverting to manual install for now
# see this issue: https://github.com/deepspeedai/DeepSpeed/issues/1710 # see this issue: https://github.com/deepspeedai/DeepSpeed/issues/1710
if verbose: if verbose:
self.warning(f"please install triton==1.0.0 if you want to use sparse attention") self.warning("please install triton==1.0.0 if you want to use sparse attention")
return False return False
if pkg_version: if pkg_version:

View File

@ -99,7 +99,7 @@ def benchmark():
with cuda_timer(baseline_bw): with cuda_timer(baseline_bw):
ref_out.backward(d_out) ref_out.backward(d_out)
print(f"batch size\tours (FW)\tbaseline (FW)\tours (BW)\tbaseline (BW)") print("batch size\tours (FW)\tbaseline (FW)\tours (BW)\tbaseline (BW)")
for i in range(len(ours_fw)): for i in range(len(ours_fw)):
print(f"{i+1}\t{ours_fw[i]}\t{baseline_fw[i]}\t{ours_bw[i]}\t{baseline_bw[i]}") print(f"{i+1}\t{ours_fw[i]}\t{baseline_fw[i]}\t{ours_bw[i]}\t{baseline_bw[i]}")

View File

@ -15,9 +15,9 @@ import deepspeed
class VerboseLinear(torch.nn.Linear): class VerboseLinear(torch.nn.Linear):
def __init__(self, **kwargs): def __init__(self, **kwargs):
print(f'Begin VerboseLinear.__init__') print('Begin VerboseLinear.__init__')
super().__init__(**kwargs) super().__init__(**kwargs)
print(f'End VerboseLinear.__init__') print('End VerboseLinear.__init__')
class LinearStack(torch.nn.Module): class LinearStack(torch.nn.Module):

View File

@ -24,7 +24,7 @@ from deepspeed.runtime.utils import is_model_parallel_parameter
def skip_on_device(): def skip_on_device():
if get_accelerator().device_name() == 'xpu': if get_accelerator().device_name() == 'xpu':
pytest.skip(f"XPU requires a higher version for test") pytest.skip("XPU requires a higher version for test")
class SequentialLinearModel(torch.nn.Module): class SequentialLinearModel(torch.nn.Module):
@ -449,9 +449,9 @@ class TestSave(DistributedTest):
base_state_dict = base_model.state_dict() base_state_dict = base_model.state_dict()
if dist.get_rank() == 0: if dist.get_rank() == 0:
# we should consider the case when zero3 is used in the future. # we should consider the case when zero3 is used in the future.
assert compare_state_dicts(base_state_dict, tp_state_dict), f"State_dict is not the same!" assert compare_state_dicts(base_state_dict, tp_state_dict), "State_dict is not the same!"
else: else:
assert tp_state_dict is None, f"noly rank0 should have the state_dict" assert tp_state_dict is None, "noly rank0 should have the state_dict"
def test_ckpt_save(self, tmpdir, tp_size: int, zero_stage: int): def test_ckpt_save(self, tmpdir, tp_size: int, zero_stage: int):
skip_on_device() skip_on_device()

View File

@ -288,7 +288,7 @@ class TestExpertWeightGradWithZero(DistributedTest):
""" """
rank = int(deepspeed.comm.get_rank()) rank = int(deepspeed.comm.get_rank())
ep_state_dict = dict() ep_state_dict = dict()
dst_sub_key = f"deepspeed_moe.experts.deepspeed_experts.0" dst_sub_key = "deepspeed_moe.experts.deepspeed_experts.0"
src_sub_key = f"deepspeed_moe.experts.deepspeed_experts.{rank}" src_sub_key = f"deepspeed_moe.experts.deepspeed_experts.{rank}"
for moe_layer in ["moe_1", "moe_2"]: for moe_layer in ["moe_1", "moe_2"]:
for mlp_in_moe in [0, 1]: for mlp_in_moe in [0, 1]:

View File

@ -149,7 +149,7 @@ class TestOneBitAdamExpAvgMask(DistributedTest):
v["exp_avg"], v["exp_avg"],
v["exp_avg"].mul_(mask1.to(device=v["exp_avg"].device)), v["exp_avg"].mul_(mask1.to(device=v["exp_avg"].device)),
atol=1e-07, atol=1e-07,
), f"Momentum mask is not working properly" ), "Momentum mask is not working properly"
class TestOneBitAdamCheckpointing(DistributedTest): class TestOneBitAdamCheckpointing(DistributedTest):
@ -241,11 +241,11 @@ class TestOneBitAdamCheckpointing(DistributedTest):
assert optimizer_1.optimizer.adam_freeze_key is True assert optimizer_1.optimizer.adam_freeze_key is True
mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device) mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device)
assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1,
atol=1e-07), f"Incorrect momentum mask" atol=1e-07), "Incorrect momentum mask"
save_folder = os.path.join(tmpdir, "saved_checkpoint") save_folder = os.path.join(tmpdir, "saved_checkpoint")
model_1.save_checkpoint(save_folder, tag=None) model_1.save_checkpoint(save_folder, tag=None)
assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1,
atol=1e-07), f"Momentum mask should not change after saving checkpoint" atol=1e-07), "Momentum mask should not change after saving checkpoint"
model_2, optimizer_2, _, _ = deepspeed.initialize( model_2, optimizer_2, _, _ = deepspeed.initialize(
config=config_dict, config=config_dict,
@ -255,7 +255,7 @@ class TestOneBitAdamCheckpointing(DistributedTest):
# Test whether momentum mask stays the same after loading checkpoint # Test whether momentum mask stays the same after loading checkpoint
mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device) mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device)
assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2,
atol=1e-07), f"Incorrect momentum mask" atol=1e-07), "Incorrect momentum mask"
model_2.load_checkpoint( model_2.load_checkpoint(
save_folder, save_folder,
tag=None, tag=None,
@ -263,11 +263,11 @@ class TestOneBitAdamCheckpointing(DistributedTest):
load_lr_scheduler_states=True, load_lr_scheduler_states=True,
) )
assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2,
atol=1e-07), f"Momentum mask should not change after loading checkpoint" atol=1e-07), "Momentum mask should not change after loading checkpoint"
# Test whether worker&server error is reset # Test whether worker&server error is reset
for v in optimizer_2.state.values(): for v in optimizer_2.state.values():
assert "worker_error" not in v, f"Incorrect worker error" assert "worker_error" not in v, "Incorrect worker error"
assert "server_error" not in v, f"Incorrect server error" assert "server_error" not in v, "Incorrect server error"
assert optimizer_2.optimizer.adam_freeze_key is True assert optimizer_2.optimizer.adam_freeze_key is True
model_3, optimizer_3, _, _ = deepspeed.initialize( model_3, optimizer_3, _, _ = deepspeed.initialize(
@ -287,7 +287,7 @@ class TestOneBitAdamCheckpointing(DistributedTest):
model_3.step() model_3.step()
assert optimizer_3.optimizer.adam_freeze_key is True assert optimizer_3.optimizer.adam_freeze_key is True
# Test whether momentum mask stays the same after loading checkpoint # Test whether momentum mask stays the same after loading checkpoint
assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), f"Incorrect momentum mask" assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), "Incorrect momentum mask"
model_3.load_checkpoint( model_3.load_checkpoint(
save_folder, save_folder,
tag=None, tag=None,
@ -295,11 +295,11 @@ class TestOneBitAdamCheckpointing(DistributedTest):
load_lr_scheduler_states=True, load_lr_scheduler_states=True,
) )
assert ("exp_avg_mask" assert ("exp_avg_mask"
not in optimizer_3.param_groups[0]), f"Momentum mask should not change after loading checkpoint" not in optimizer_3.param_groups[0]), "Momentum mask should not change after loading checkpoint"
# Test whether worker&server error is reset # Test whether worker&server error is reset
for v in optimizer_3.state.values(): for v in optimizer_3.state.values():
assert "worker_error" not in v, f"Incorrect worker error" assert "worker_error" not in v, "Incorrect worker error"
assert "server_error" not in v, f"Incorrect server error" assert "server_error" not in v, "Incorrect server error"
assert optimizer_3.optimizer.adam_freeze_key is False assert optimizer_3.optimizer.adam_freeze_key is False
def test_overflow(self, tmpdir): def test_overflow(self, tmpdir):
@ -518,7 +518,7 @@ class TestZeroOneAdamExpAvgMask(DistributedTest):
v["exp_avg"], v["exp_avg"],
v["exp_avg"].mul_(mask1.to(device=v["exp_avg"].device)), v["exp_avg"].mul_(mask1.to(device=v["exp_avg"].device)),
atol=1e-07, atol=1e-07,
), f"Momentum mask is not working properly" ), "Momentum mask is not working properly"
class TestZeroOneAdamCheckpointing(DistributedTest): class TestZeroOneAdamCheckpointing(DistributedTest):
@ -614,11 +614,11 @@ class TestZeroOneAdamCheckpointing(DistributedTest):
# Test whether momentum mask still exist after saving checkpoint # Test whether momentum mask still exist after saving checkpoint
mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device) mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device)
assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1,
atol=1e-07), f"Incorrect momentum mask" atol=1e-07), "Incorrect momentum mask"
save_folder = os.path.join(tmpdir, "saved_checkpoint") save_folder = os.path.join(tmpdir, "saved_checkpoint")
model_1.save_checkpoint(save_folder, tag=None) model_1.save_checkpoint(save_folder, tag=None)
assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1,
atol=1e-07), f"Momentum mask should not change after saving checkpoint" atol=1e-07), "Momentum mask should not change after saving checkpoint"
model_2, optimizer_2, _, _ = deepspeed.initialize( model_2, optimizer_2, _, _ = deepspeed.initialize(
config=config_dict, config=config_dict,
@ -628,7 +628,7 @@ class TestZeroOneAdamCheckpointing(DistributedTest):
# Test whether momentum mask stays the same after loading checkpoint # Test whether momentum mask stays the same after loading checkpoint
mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device) mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device)
assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2,
atol=1e-07), f"Incorrect momentum mask" atol=1e-07), "Incorrect momentum mask"
model_2.load_checkpoint( model_2.load_checkpoint(
save_folder, save_folder,
tag=None, tag=None,
@ -636,11 +636,11 @@ class TestZeroOneAdamCheckpointing(DistributedTest):
load_lr_scheduler_states=True, load_lr_scheduler_states=True,
) )
assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2,
atol=1e-07), f"Momentum mask should not change after loading checkpoint" atol=1e-07), "Momentum mask should not change after loading checkpoint"
# Test whether worker&server error is reset # Test whether worker&server error is reset
for v in optimizer_2.state.values(): for v in optimizer_2.state.values():
assert "worker_error" not in v, f"Incorrect worker error" assert "worker_error" not in v, "Incorrect worker error"
assert "server_error" not in v, f"Incorrect server error" assert "server_error" not in v, "Incorrect server error"
model_3, optimizer_3, _, _ = deepspeed.initialize( model_3, optimizer_3, _, _ = deepspeed.initialize(
config=config_dict, config=config_dict,
@ -658,7 +658,7 @@ class TestZeroOneAdamCheckpointing(DistributedTest):
model_3.backward(loss) model_3.backward(loss)
model_3.step() model_3.step()
# Test whether momentum mask stays the same after loading checkpoint # Test whether momentum mask stays the same after loading checkpoint
assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), f"Incorrect momentum mask" assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), "Incorrect momentum mask"
model_3.load_checkpoint( model_3.load_checkpoint(
save_folder, save_folder,
tag=None, tag=None,
@ -666,11 +666,11 @@ class TestZeroOneAdamCheckpointing(DistributedTest):
load_lr_scheduler_states=True, load_lr_scheduler_states=True,
) )
assert ("exp_avg_mask" assert ("exp_avg_mask"
not in optimizer_3.param_groups[0]), f"Momentum mask should not change after loading checkpoint" not in optimizer_3.param_groups[0]), "Momentum mask should not change after loading checkpoint"
# Test whether worker&server error is reset # Test whether worker&server error is reset
for v in optimizer_3.state.values(): for v in optimizer_3.state.values():
assert "worker_error" not in v, f"Incorrect worker error" assert "worker_error" not in v, "Incorrect worker error"
assert "server_error" not in v, f"Incorrect server error" assert "server_error" not in v, "Incorrect server error"
def test_overflow(self, tmpdir): def test_overflow(self, tmpdir):
if not get_accelerator().is_fp16_supported(): if not get_accelerator().is_fp16_supported():
@ -899,7 +899,7 @@ class TestOneBitLampExpAvgMask(DistributedTest):
v["exp_avg"], v["exp_avg"],
v["exp_avg"].mul_(mask1.to(device=v["exp_avg"].device)), v["exp_avg"].mul_(mask1.to(device=v["exp_avg"].device)),
atol=1e-07, atol=1e-07,
), f"Momentum mask is not working properly" ), "Momentum mask is not working properly"
class TestOneBitLambCheckpointing(DistributedTest): class TestOneBitLambCheckpointing(DistributedTest):
@ -997,15 +997,15 @@ class TestOneBitLambCheckpointing(DistributedTest):
assert optimizer_1.optimizer.lamb_freeze_key is True assert optimizer_1.optimizer.lamb_freeze_key is True
mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device) mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device)
assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1,
atol=1e-07), f"Incorrect momentum mask" atol=1e-07), "Incorrect momentum mask"
scaling_coeff_1 = [] scaling_coeff_1 = []
for v in optimizer_1.state.values(): for v in optimizer_1.state.values():
assert "scaling_coeff" in v, f"Incorrect scaling_coeff" assert "scaling_coeff" in v, "Incorrect scaling_coeff"
scaling_coeff_1.append(v["scaling_coeff"]) scaling_coeff_1.append(v["scaling_coeff"])
save_folder = os.path.join(tmpdir, "saved_checkpoint") save_folder = os.path.join(tmpdir, "saved_checkpoint")
model_1.save_checkpoint(save_folder, tag=None) model_1.save_checkpoint(save_folder, tag=None)
assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1,
atol=1e-07), f"Momentum mask should not change after saving checkpoint" atol=1e-07), "Momentum mask should not change after saving checkpoint"
model_2, optimizer_2, _, _ = deepspeed.initialize( model_2, optimizer_2, _, _ = deepspeed.initialize(
config=config_dict, config=config_dict,
@ -1015,7 +1015,7 @@ class TestOneBitLambCheckpointing(DistributedTest):
# Test whether momentum mask stays the same after loading checkpoint # Test whether momentum mask stays the same after loading checkpoint
mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device) mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device)
assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2,
atol=1e-07), f"Incorrect momentum mask" atol=1e-07), "Incorrect momentum mask"
model_2.load_checkpoint( model_2.load_checkpoint(
save_folder, save_folder,
tag=None, tag=None,
@ -1023,16 +1023,16 @@ class TestOneBitLambCheckpointing(DistributedTest):
load_lr_scheduler_states=True, load_lr_scheduler_states=True,
) )
assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2,
atol=1e-07), f"Momentum mask should not change after loading checkpoint" atol=1e-07), "Momentum mask should not change after loading checkpoint"
# Test whether worker&server error is reset # Test whether worker&server error is reset
assert len(optimizer_2.optimizer.worker_errors) == 0, f"Incorrect worker error" assert len(optimizer_2.optimizer.worker_errors) == 0, "Incorrect worker error"
assert len(optimizer_2.optimizer.server_errors) == 0, f"Incorrect server error" assert len(optimizer_2.optimizer.server_errors) == 0, "Incorrect server error"
# Test whether scaling_coeffs is loaded correctly # Test whether scaling_coeffs is loaded correctly
scaling_coeff_2 = [] scaling_coeff_2 = []
for v in optimizer_2.state.values(): for v in optimizer_2.state.values():
assert "scaling_coeff" in v, f"Incorrect scaling_coeff" assert "scaling_coeff" in v, "Incorrect scaling_coeff"
scaling_coeff_2.append(v["scaling_coeff"]) scaling_coeff_2.append(v["scaling_coeff"])
assert list(sorted(scaling_coeff_2)) == list(sorted(scaling_coeff_1)), f"Incorrect scaling_coeffs" assert list(sorted(scaling_coeff_2)) == list(sorted(scaling_coeff_1)), "Incorrect scaling_coeffs"
assert optimizer_2.optimizer.lamb_freeze_key is True assert optimizer_2.optimizer.lamb_freeze_key is True
model_3, optimizer_3, _, _ = deepspeed.initialize( model_3, optimizer_3, _, _ = deepspeed.initialize(
@@ -1052,7 +1052,7 @@ class TestOneBitLambCheckpointing(DistributedTest):
model_3.step() model_3.step()
assert optimizer_3.optimizer.lamb_freeze_key is True assert optimizer_3.optimizer.lamb_freeze_key is True
# Test whether momentum mask stays the same after loading checkpoint # Test whether momentum mask stays the same after loading checkpoint
assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), f"Incorrect momentum mask" assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), "Incorrect momentum mask"
model_3.load_checkpoint( model_3.load_checkpoint(
save_folder, save_folder,
tag=None, tag=None,
@@ -1060,15 +1060,15 @@ class TestOneBitLambCheckpointing(DistributedTest):
load_lr_scheduler_states=True, load_lr_scheduler_states=True,
) )
assert ("exp_avg_mask" assert ("exp_avg_mask"
not in optimizer_3.param_groups[0]), f"Momentum mask should not change after loading checkpoint" not in optimizer_3.param_groups[0]), "Momentum mask should not change after loading checkpoint"
# Test whether worker&server error is reset # Test whether worker&server error is reset
assert len(optimizer_3.optimizer.worker_errors) == 0, f"Incorrect worker error" assert len(optimizer_3.optimizer.worker_errors) == 0, "Incorrect worker error"
assert len(optimizer_3.optimizer.server_errors) == 0, f"Incorrect server error" assert len(optimizer_3.optimizer.server_errors) == 0, "Incorrect server error"
# Test whether scaling_coeffs, lamb_coeff_freeze, last_factor are reset # Test whether scaling_coeffs, lamb_coeff_freeze, last_factor are reset
for v in optimizer_3.state.values(): for v in optimizer_3.state.values():
assert v["lamb_coeff_freeze"] == 0.0, f"Incorrect lamb_coeff_freeze" assert v["lamb_coeff_freeze"] == 0.0, "Incorrect lamb_coeff_freeze"
assert v["last_factor"] == 1.0, f"Incorrect last_factor" assert v["last_factor"] == 1.0, "Incorrect last_factor"
assert "scaling_coeff" not in v, f"Incorrect scaling_coeff" assert "scaling_coeff" not in v, "Incorrect scaling_coeff"
assert optimizer_3.optimizer.lamb_freeze_key is False assert optimizer_3.optimizer.lamb_freeze_key is False
def test_overflow(self, tmpdir): def test_overflow(self, tmpdir):

@@ -236,4 +236,4 @@ class TestLegacyCurriculumScheduler(DistributedTest):
model.step() model.step()
if n + 1 in ground_truths: if n + 1 in ground_truths:
true_seqlen = ground_truths[n + 1] true_seqlen = ground_truths[n + 1]
assert seqlen == true_seqlen, f"Incorrect curriculum schedule" assert seqlen == true_seqlen, "Incorrect curriculum schedule"

@@ -49,7 +49,7 @@ class TestTwoOutputModel(DistributedTest):
targets=[1, 2]) targets=[1, 2])
for n, batch in enumerate(data_loader): for n, batch in enumerate(data_loader):
assert len(batch) % 2 == 0, \ assert len(batch) % 2 == 0, \
f"multi_output_dataloader failed to return even number of data samples (input+target)" "multi_output_dataloader failed to return even number of data samples (input+target)"
midpoint = len(batch) // 2 midpoint = len(batch) // 2
inputs, targets = batch[:midpoint], batch[midpoint:] inputs, targets = batch[:midpoint], batch[midpoint:]
@@ -107,7 +107,7 @@ class TestThreeOutputModel(DistributedTest):
targets=[1, 2, 3]) targets=[1, 2, 3])
for n, batch in enumerate(data_loader): for n, batch in enumerate(data_loader):
assert len(batch) % 2 == 0, \ assert len(batch) % 2 == 0, \
f"multi_output_dataloader failed to return even number of data samples (input+target)" "multi_output_dataloader failed to return even number of data samples (input+target)"
midpoint = len(batch) // 2 midpoint = len(batch) // 2
inputs, targets = batch[:midpoint], batch[midpoint:] inputs, targets = batch[:midpoint], batch[midpoint:]

@@ -62,14 +62,14 @@ def run_model(model, param_groups, config_dict, hidden_dim, dtype, offloaded_sta
pin_memory=pin_memory, pin_memory=pin_memory,
non_blocking=non_blocking) non_blocking=non_blocking)
alloc_after_offload = get_accelerator().memory_allocated() alloc_after_offload = get_accelerator().memory_allocated()
assert alloc_after_offload < alloc_before_offload, f"Allocated memory should decrease after offload" assert alloc_after_offload < alloc_before_offload, "Allocated memory should decrease after offload"
validate_device(model, torch.device(offload_device.value), offloaded_states) validate_device(model, torch.device(offload_device.value), offloaded_states)
# Reload states # Reload states
model.reload_states() model.reload_states()
assert alloc_after_offload < get_accelerator().memory_allocated( assert alloc_after_offload < get_accelerator().memory_allocated(
), f"Allocated memory should increase after offload back" ), "Allocated memory should increase after offload back"
# Verify restored states # Verify restored states
hp_param_restored = [safe_get_local_fp32_param(p) for p in model.parameters()] hp_param_restored = [safe_get_local_fp32_param(p) for p in model.parameters()]

@@ -255,7 +255,7 @@ class TestZ3LeafOptimization(DistributedTest):
loss, duration = bench_loss_and_time(config_dict) loss, duration = bench_loss_and_time(config_dict)
if dist.get_rank() == 0: if dist.get_rank() == 0:
print(f"baseline exec time:", baseline_exec_time) print("baseline exec time:", baseline_exec_time)
print( print(
f"finegrained optimziation exec time: {duration},granularity threshold:{module_granularity_threshold} " f"finegrained optimziation exec time: {duration},granularity threshold:{module_granularity_threshold} "
) )
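
The edit repeated across these hunks is uniform: string literals that contain no {} placeholders lose their f prefix, since there is nothing for the f-string to interpolate, while f-strings that actually format values are left untouched. A minimal sketch of the pattern, using hypothetical names (ok, rank) that do not come from the tests above, and on the assumption that ruff's F541 "f-string without any placeholders" check is what reports them:

    # Sketch only; `ok` and `rank` are illustrative placeholders, not names from the diff.
    ok, rank = True, 0
    assert ok, f"Incorrect momentum mask"                  # old form: no placeholders, so ruff should flag it (F541)
    assert ok, "Incorrect momentum mask"                   # new form: plain string literal, identical message
    assert ok, f"Incorrect momentum mask on rank {rank}"   # genuine f-string with a placeholder, left as-is

Running ruff check over the old form should report the f-string-without-placeholders diagnostic; the fixed form carries the same assertion message without the unused formatting machinery.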