Mirror of https://github.com/deepspeedai/DeepSpeed.git (synced 2025-10-20 15:33:51 +08:00)
Fix invalid f-strings (#7457)
Fix invalid f-strings detected by ruff.

Signed-off-by: cyy <cyyever@outlook.com>
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Olatunji Ruwase <tunji.ruwase@snowflake.com>
Co-authored-by: Michael Wyatt <michael.wyatt@snowflake.com>
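The diff below applies two mechanical fixes flagged by ruff: string literals carrying an `f` prefix but containing no placeholders lose the redundant prefix (ruff reports this as F541), and `not x in y` membership tests are rewritten as `x not in y` (E713). The following sketch only illustrates the patterns; `check_backend` and `SUPPORTED_BACKENDS` are hypothetical names, not DeepSpeed identifiers.

# Minimal sketch of the two fix patterns seen in this commit (names are hypothetical).
SUPPORTED_BACKENDS = ["cuda", "cpu"]

def check_backend(name):
    # Before the fix: "if not name in SUPPORTED_BACKENDS:" -- ruff E713 prefers "not in".
    if name not in SUPPORTED_BACKENDS:
        # f-strings with real placeholders keep their f prefix.
        raise ValueError(f"Backend must be one of {SUPPORTED_BACKENDS}, got {name}")
    # Before the fix: print(f"backend validated") -- no placeholders, so the
    # redundant f prefix is dropped (ruff F541).
    print("backend validated")

check_backend("cuda")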
@@ -27,7 +27,7 @@ class HPU_Accelerator(DeepSpeedAccelerator):
torch.utils.deterministic.fill_uninitialized_memory = False
except ImportError as e:
raise ValueError(
- f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.")
+ "HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.")
self.fp16_supported = None

@@ -64,13 +64,13 @@ def get_accelerator():
assert ipex._C._has_xpu(), "XPU_Accelerator requires an intel_extension_for_pytorch that supports XPU."
except ImportError as e:
raise ValueError(
- f"XPU_Accelerator requires intel_extension_for_pytorch, which is not installed on this system.")
+ "XPU_Accelerator requires intel_extension_for_pytorch, which is not installed on this system.")
elif accelerator_name == "xpu.external":
try:
from intel_extension_for_deepspeed import XPU_Accelerator  # noqa: F401 # type: ignore
except ImportError as e:
raise ValueError(
- f"XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system."
+ "XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system."
)
elif accelerator_name == "cpu":
pass

@@ -78,13 +78,13 @@ def get_accelerator():
try:
import torch_npu  # noqa: F401 # type: ignore
except ImportError as e:
- raise ValueError(f"NPU_Accelerator requires torch_npu, which is not installed on this system.")
+ raise ValueError("NPU_Accelerator requires torch_npu, which is not installed on this system.")
pass
elif accelerator_name == "sdaa":
try:
import torch_sdaa  # noqa: F401 # type: ignore
except ImportError as e:
- raise ValueError(f"SDAA_Accelerator requires torch_sdaa, which is not installed on this system.")
+ raise ValueError("SDAA_Accelerator requires torch_sdaa, which is not installed on this system.")
pass
elif accelerator_name == "mps":
try:

@@ -93,18 +93,18 @@ def get_accelerator():
# should use torch.mps.is_available() if it exists someday but this is used as proxy
torch.mps.current_allocated_memory()
except (RuntimeError, ImportError) as e:
- raise ValueError(f"MPS_Accelerator requires torch.mps, which is not installed on this system.")
+ raise ValueError("MPS_Accelerator requires torch.mps, which is not installed on this system.")
elif accelerator_name == "hpu":
try:
import habana_frameworks.torch.hpu  # noqa: F401
except ImportError as e:
raise ValueError(
- f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.")
+ "HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.")
elif accelerator_name == "mlu":
try:
import torch_mlu  # noqa: F401
except ImportError as e:
- raise ValueError(f"MLU_Accelerator requires torch_mlu, which is not installed on this system.")
+ raise ValueError("MLU_Accelerator requires torch_mlu, which is not installed on this system.")
elif accelerator_name not in SUPPORTED_ACCELERATOR_LIST:
raise ValueError(f'DS_ACCELERATOR must be one of {SUPPORTED_ACCELERATOR_LIST}. '
f'Value "{accelerator_name}" is not supported')
@@ -70,10 +70,10 @@ def validate_args(args):
error_messages = []

if args.folder is not None and len(args.folder_to_device_mapping) > 0:
- error_messages.append(f'--folder and --folder_to_device_mapping cannot be specified together.')
+ error_messages.append('--folder and --folder_to_device_mapping cannot be specified together.')
no_error = False
elif args.folder is None and len(args.folder_to_device_mapping) == 0:
- error_messages.append(f'At least one of --folder or --folder_to_device_mapping must be specified.')
+ error_messages.append('At least one of --folder or --folder_to_device_mapping must be specified.')
no_error = False

# Validate --folder

@@ -102,7 +102,7 @@ def validate_args(args):
print(f'Found {len(error_messages)} validation error(s)')
# Validate --gpu, --use_gds
if args.use_gds and not args.gpu:
- error_messages.append(f'--gpu must be set to transfer with --use_gds')
+ error_messages.append('--gpu must be set to transfer with --use_gds')
no_error = False

if not no_error:

@@ -201,7 +201,7 @@ def get_validated_args():
args = refine_args(args)
if not validate_args(args):
quit()
- print(f'Successful validation of command line arguments')
+ print('Successful validation of command line arguments')
args.total_loops = args.warmup_loops + args.loops
peer_tag = 'gpu' if args.gpu else 'process'
args.mapping_dict = _get_mapping_dict(args)
@@ -54,7 +54,7 @@ class AIOBasic_Engine(object):
task_log(tid,
f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}')

- task_log(tid, f'created deepspeed aio basic engine')
+ task_log(tid, 'created deepspeed aio basic engine')

ctxt = {}
ctxt[FILE] = filename

@@ -80,7 +80,7 @@ class AIOHandle_Engine(object):
io_parallel = args.io_parallel if args.io_parallel else 1
handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
not args.sequential_requests, io_parallel)
- task_log(tid, f'created deepspeed aio handle engine')
+ task_log(tid, 'created deepspeed aio handle engine')

bounce_buffer = None
if args.gpu:

@@ -76,7 +76,7 @@ def io_engine_tasklet(pool_params):
task_barrier(aio_barrier, num_processes)

# Run pre task
- task_log(tid, f'running pre-task')
+ task_log(tid, 'running pre-task')
io_engine = schedule["pre"]((args, tid))
task_barrier(aio_barrier, num_processes)

@@ -91,7 +91,7 @@ def io_engine_tasklet(pool_params):
io_engine.ctxt["main_task_sec"].append(stop_time - start_time)

# Run post task
- task_log(tid, f'running post-task')
+ task_log(tid, 'running post-task')
schedule["post"]((args, tid, io_engine))
task_barrier(aio_barrier, num_processes)

@@ -12,7 +12,7 @@ from io_engine import io_engine_multiprocessing
def main():
- print(f'Testing deepspeed_aio python frontend')
+ print('Testing deepspeed_aio python frontend')

args = get_validated_args()
mp.set_start_method('spawn', force=True)

@@ -15,7 +15,7 @@ from deepspeed.io import FastFileWriter
class Torch_FastIO_Engine(object):

def __init__(self, args, tid, read_op):
- assert read_op is False, f'Read operation is not currently supported'
+ assert read_op is False, 'Read operation is not currently supported'
self.ctxt = self._create_context(args, tid, read_op)
self.zipfile_serialization = not args.torch_legacy_save

@@ -69,7 +69,7 @@ class Torch_FastIO_Engine(object):
fast_io_buffer = create_page_locked_tensor(args.fast_io_size, args.use_accelerator_pin_memory, aio_handle)

- task_log(tid, f'created torch_fastio engine')
+ task_log(tid, 'created torch_fastio engine')

ctxt = {}
ctxt[FILE] = filename

@@ -54,7 +54,7 @@ class TorchIO_Engine(object):
f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}',
force=True)

- task_log(tid, f'created torch_io engine')
+ task_log(tid, 'created torch_io engine')

ctxt = {}
ctxt[FILE] = filename
@@ -145,7 +145,7 @@ class Autotuner:
f"{best_exp['name']} is the optimal setup after tuning. The exp result is at {best_exp['result_dir']}."
)
else:
- logger.info(f"No optimal setup is found. Please check that experiments were run successfully.")
+ logger.info("No optimal setup is found. Please check that experiments were run successfully.")
tuning_duration = datetime.timedelta(seconds=(time.time() - self.start_time))

logger.info(f"Tuning completed in {tuning_duration}")

@@ -410,7 +410,7 @@ class Autotuner:
self.start_time = time.time()
if self.fast_enabled():
- logger.info(f"Fast mode is enabled. Tuning micro batch size only.")
+ logger.info("Fast mode is enabled. Tuning micro batch size only.")

# model info profile run with DEFAULT_MIN_MEM_CONFIG
model_info = self.model_info_profile_run()

@@ -1110,4 +1110,4 @@ class Autotuner:
logger.info(f"Done running with the optimal DeepSpeed configuration using {self.optimal_cmd}")
else:
- logger.info(f"No optimal DeepSpeed configuration found by autotuning.")
+ logger.info("No optimal DeepSpeed configuration found by autotuning.")
@@ -94,14 +94,14 @@ class DeepSpeedCheckpoint(object):
return self.dp_degree != self.zero_checkpoint.get_src_dp_degree()

def show_2d_mapping(self):
- print(f'reshaped 2d map ---- begin')
+ print('reshaped 2d map ---- begin')

for i in range(self.pp_degree):
for j in range(self.tp_degree):
file_list = self.get_2d_parallel_files(pp_index=i, tp_index=j)
print(f'[{i}, {j}] = {file_list}')

- print(f'reshaped 2d map ---- end')
+ print('reshaped 2d map ---- end')

def show_tp_embedding_map(self):
self._dump_mapping(self.tp_to_embedding_map, 'tp_to_embedding_layers')

@@ -137,7 +137,7 @@ class DeepSpeedCheckpoint(object):
return self.layer_keys[self.final_layer_norm_idx]

def get_iteration(self):
- if not ITERATION_KEY in self.global_state:
+ if ITERATION_KEY not in self.global_state:
sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False)
self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0)

@@ -157,7 +157,7 @@ class DeepSpeedCheckpoint(object):
return self.tp_to_embedding_map[tp_index]

def _get_checkpoint_value(self, key):
- if not key in self.global_state:
+ if key not in self.global_state:
sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False)
self.global_state[key] = sd.get(key, None)

@@ -254,7 +254,7 @@ class DeepSpeedCheckpoint(object):
layer_file_partitions = partition_data(layer_files, self.tp_degree)
for tp_index in range(self.tp_degree):
map_key = (tp_index, pp_index)
- if not map_key in file_map.keys():
+ if map_key not in file_map.keys():
file_map[map_key] = []
file_map[map_key].append(layer_file_partitions[tp_index])

@@ -286,7 +286,7 @@ class DeepSpeedCheckpoint(object):
def _merge_state_dicts(self, sd_list):
merged_sd = {}
for key in sd_list[0].keys():
- if not key in SEQUENTIAL_LAYERS:
+ if key not in SEQUENTIAL_LAYERS:
cat_dim = LAYER_CONCAT_DIM.get(key, 0)
merged_sd[key] = torch.cat([sd[key] for sd in sd_list], dim=cat_dim)
else:
@ -269,7 +269,7 @@ def merge_tp_slices(ds_checkpoint, dir, slice_dir, tp_degree, name_and_shape):
|
||||
|
||||
step_merged = _merge_zero_shards(slice_base_path, "step", tp_degree, shape)
|
||||
if step_merged:
|
||||
_save_checkpoint(os.path.join(param_base_path, f"step.pt"), step_merged[0])
|
||||
_save_checkpoint(os.path.join(param_base_path, "step.pt"), step_merged[0])
|
||||
|
||||
for state in ("fp32", "exp_avg", "exp_avg_sq"):
|
||||
slices = _merge_zero_shards(slice_base_path, state, tp_degree, shape)
|
||||
@ -415,7 +415,7 @@ def _save_optimizer_state(args, ds_checkpoint):
|
||||
output_sd = {k: v for k, v in optim_sd.items() if k not in sharded_states}
|
||||
output_sd[PARAM_GROUPS] = optim_sd[BASE_OPTIMIZER_STATE][PARAM_GROUPS]
|
||||
zero_output_folder = os.path.join(args.output_folder, "zero")
|
||||
output_file_path = os.path.join(zero_output_folder, f"optimizer_state.pt")
|
||||
output_file_path = os.path.join(zero_output_folder, "optimizer_state.pt")
|
||||
_save_checkpoint(output_file_path, output_sd)
|
||||
|
||||
|
||||
@ -424,7 +424,7 @@ def _save_optimizer_state_stage3(args, optim_files):
|
||||
output_sd = sd[OPTIMIZER_STATE_DICT]
|
||||
output_sd[PARAM_GROUPS] = output_sd[OPTIMIZER_STATE_DICT][PARAM_GROUPS]
|
||||
zero_output_folder = os.path.join(args.output_folder, "zero")
|
||||
output_file_path = os.path.join(zero_output_folder, f"optimizer_state.pt")
|
||||
output_file_path = os.path.join(zero_output_folder, "optimizer_state.pt")
|
||||
_save_checkpoint(output_file_path, output_sd)
|
||||
|
||||
|
||||
@ -467,7 +467,7 @@ def _check_for_required_state(ds_checkpoint):
|
||||
|
||||
|
||||
def main(args):
|
||||
print(f'Convert DeepSpeed Checkpoint to Universal Checkpoint')
|
||||
print('Convert DeepSpeed Checkpoint to Universal Checkpoint')
|
||||
|
||||
print(f'Converting DeepSpeed checkpoint in {args.input_folder} to Universal checkpoint in {args.output_folder}')
|
||||
|
||||
|
@ -24,7 +24,7 @@ class meg_2d_parallel_map(object):
|
||||
assert type(data) is list
|
||||
|
||||
key = self._make_key(pp_index, tp_index)
|
||||
if not key in self.map.keys():
|
||||
if key not in self.map.keys():
|
||||
self.map[key] = []
|
||||
self.map[key] += data
|
||||
|
||||
@ -84,14 +84,14 @@ def reshape_meg_2d_parallel(old_pp_degree, old_tp_degree, new_pp_degree, new_tp_
|
||||
old_2d_map = meg_2d_parallel_map(old_pp_degree, old_tp_degree)
|
||||
old_2d_map.simple_init()
|
||||
if verbose:
|
||||
old_2d_map.print_data(f'original_2d_map:')
|
||||
old_2d_map.print_data('original_2d_map:')
|
||||
|
||||
if old_tp_degree != new_tp_degree:
|
||||
new_tp_map = _reshape_tp_dimension(old_2d_map, new_tp_degree)
|
||||
else:
|
||||
new_tp_map = old_2d_map
|
||||
if verbose:
|
||||
new_tp_map.print_data(f'after_tp_reshape:')
|
||||
new_tp_map.print_data('after_tp_reshape:')
|
||||
|
||||
if old_pp_degree != new_pp_degree:
|
||||
final_map = _reshape_pp_dimension(new_tp_map, new_pp_degree)
|
||||
@ -99,7 +99,7 @@ def reshape_meg_2d_parallel(old_pp_degree, old_tp_degree, new_pp_degree, new_tp_
|
||||
final_map = new_tp_map
|
||||
|
||||
if verbose:
|
||||
final_map.print_data(f'final_2d_map:')
|
||||
final_map.print_data('final_2d_map:')
|
||||
|
||||
return final_map
|
||||
|
||||
@ -159,7 +159,7 @@ def get_mpu_ranks(tp_size=1, pp_size=1, dp_size=1, virtual_pp_size=None):
|
||||
ranks = [data_parallel_group_ranks[i] for data_parallel_group_ranks in all_dp_group_ranks]
|
||||
all_pp_group_ranks.append(list(ranks))
|
||||
|
||||
print(f"PP", all_pp_group_ranks)
|
||||
print("PP", all_pp_group_ranks)
|
||||
|
||||
# Build the tensor model-parallel groups.
|
||||
all_tp_group_ranks = []
|
||||
@ -167,7 +167,7 @@ def get_mpu_ranks(tp_size=1, pp_size=1, dp_size=1, virtual_pp_size=None):
|
||||
ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
|
||||
all_tp_group_ranks.append(list(ranks))
|
||||
|
||||
print(f"TP", all_tp_group_ranks)
|
||||
print("TP", all_tp_group_ranks)
|
||||
|
||||
return all_tp_group_ranks, all_pp_group_ranks, all_dp_group_ranks
|
||||
|
||||
|
@ -115,7 +115,7 @@ def add_free_activations(graph_id: int, graph: Graph, activation_node_names: Lis
|
||||
def _should_free(node: Node) -> bool:
|
||||
if not hasattr(node, "meta"):
|
||||
return False
|
||||
if not "tensor_meta" in node.meta:
|
||||
if "tensor_meta" not in node.meta:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
@ -34,7 +34,7 @@ def get_random_id() -> int:
|
||||
def _should_offload(node: Node) -> bool:
|
||||
if not hasattr(node, "meta"):
|
||||
return False
|
||||
if not "tensor_meta" in node.meta:
|
||||
if "tensor_meta" not in node.meta:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
@ -34,7 +34,7 @@ def add_reload_parameter(graph_id: int, gm: GraphModule, node: Node, ds_id: int)
|
||||
new_node = gm.graph.create_node('call_function',
|
||||
torch.ops.dc.reload_parameter.default,
|
||||
args, {},
|
||||
name=f"reload_parameter")
|
||||
name="reload_parameter")
|
||||
return new_node
|
||||
|
||||
|
||||
|
@ -249,7 +249,7 @@ def get_sparse_pruning_shared_parameters(param_dict):
|
||||
output[SPARSE_PRUNING_DENSE_RATIO] = get_scalar_param(sub_param_dict, SPARSE_PRUNING_DENSE_RATIO,
|
||||
SPARSE_PRUNING_DENSE_RATIO_DEFAULT)
|
||||
assert output[SPARSE_PRUNING_DENSE_RATIO] > 0 and output[
|
||||
SPARSE_PRUNING_DENSE_RATIO] < 1, f"Invalid dense_ratio value. Must be less than 1"
|
||||
SPARSE_PRUNING_DENSE_RATIO] < 1, "Invalid dense_ratio value. Must be less than 1"
|
||||
output[SPARSE_PRUNING_SCHEDULE_OFFSET_STRIDE] = get_scalar_param(
|
||||
sub_param_dict, SPARSE_PRUNING_SCHEDULE_OFFSET_STRIDE, SPARSE_PRUNING_SCHEDULE_OFFSET_STRIDE_DEFAULT)
|
||||
output[SPARSE_PRUNING_EXCLUDED_MODULES] = get_list_param(sub_param_dict, SPARSE_PRUNING_EXCLUDED_MODULES,
|
||||
@ -258,7 +258,7 @@ def get_sparse_pruning_shared_parameters(param_dict):
|
||||
SPARSE_PRUNING_SCHEDULE_OFFSET_END,
|
||||
output[SPARSE_PRUNING_SCHEDULE_OFFSET])
|
||||
assert output[SPARSE_PRUNING_SCHEDULE_OFFSET] <= output[
|
||||
SPARSE_PRUNING_SCHEDULE_OFFSET_END], f"Invalid schedule_offset and schedule_offset_end values"
|
||||
SPARSE_PRUNING_SCHEDULE_OFFSET_END], "Invalid schedule_offset and schedule_offset_end values"
|
||||
else:
|
||||
output[SPARSE_PRUNING_ENABLED] = SPARSE_PRUNING_ENABLED_DEFAULT
|
||||
output[SPARSE_PRUNING_METHOD] = SPARSE_PRUNING_METHOD_DEFAULT
|
||||
|
@ -91,7 +91,7 @@ def installed_cann_version():
|
||||
import re
|
||||
ascend_path = installed_cann_path()
|
||||
if ascend_path is None:
|
||||
return f"CANN_HOME does not exist, unable to compile NPU op(s)"
|
||||
return "CANN_HOME does not exist, unable to compile NPU op(s)"
|
||||
cann_version = ""
|
||||
for dirpath, _, filenames in os.walk(os.path.realpath(ascend_path)):
|
||||
if cann_version:
|
||||
|
@ -188,7 +188,7 @@ def main():
|
||||
|
||||
if not is_torch_elastic_compatible():
|
||||
if args.enable_elastic_training:
|
||||
logger.info(f"Disabling elastic training support as \
|
||||
logger.info("Disabling elastic training support as \
|
||||
PyTorch version should be greater than 1.11.x")
|
||||
args.enable_elastic_training = False
|
||||
|
||||
|
@ -59,9 +59,9 @@ def env_mapping(env, rank_name_list=None, local_rank_name_list=None):
|
||||
if rank == None:
|
||||
rank = env.get(rank_name)
|
||||
elif rank != env.get(rank_name):
|
||||
raise EnvironmentError(f"rank number doesn't match!")
|
||||
raise EnvironmentError("rank number doesn't match!")
|
||||
if rank == None:
|
||||
raise EnvironmentError(f"rank number is not in current env!")
|
||||
raise EnvironmentError("rank number is not in current env!")
|
||||
env['RANK'] = rank
|
||||
|
||||
local_rank = None
|
||||
@ -70,9 +70,9 @@ def env_mapping(env, rank_name_list=None, local_rank_name_list=None):
|
||||
if local_rank == None:
|
||||
local_rank = env.get(local_rank_name)
|
||||
elif local_rank != env.get(local_rank_name):
|
||||
raise EnvironmentError(f"local_rank number doesn't match!")
|
||||
raise EnvironmentError("local_rank number doesn't match!")
|
||||
if local_rank == None:
|
||||
raise EnvironmentError(f"rank number is not in current env!")
|
||||
raise EnvironmentError("rank number is not in current env!")
|
||||
env['LOCAL_RANK'] = local_rank
|
||||
|
||||
return env
|
||||
|
@ -42,7 +42,7 @@ class QuantizedParameter(nn.Parameter):
|
||||
quantizer: Quantizer = None,
|
||||
):
|
||||
if requires_grad:
|
||||
raise ValueError(f"requires_grad=True is not supported with QuantizedParameter")
|
||||
raise ValueError("requires_grad=True is not supported with QuantizedParameter")
|
||||
if data is None:
|
||||
data = torch.empty(0)
|
||||
self = torch.Tensor._make_subclass(cls, data, requires_grad)
|
||||
|
@ -56,7 +56,7 @@ class DeepSpeedTransformerInference(nn.Module):
|
||||
if DeepSpeedTransformerInference.layer_id == 1:
|
||||
log_dist(f"DeepSpeed-Inference config: {self.config.__dict__}", [0])
|
||||
if deepspeed.HAS_TRITON and self.config.use_triton:
|
||||
log_dist(f"Injecting Triton kernels ...", [0])
|
||||
log_dist("Injecting Triton kernels ...", [0])
|
||||
|
||||
if self.config.bigscience_bloom:
|
||||
self.attention = BloomSelfAttention(self.config, mp_group, quantize_scales, quantize_groups, merge_count)
|
||||
|
@ -150,8 +150,8 @@ def prepare_tp_fused_qkvw(module, src, mp_size, gpu_index):
|
||||
module_name = max(module_name_matches, key=len)
|
||||
fused_type = fused_type_dict[module_name]
|
||||
return _transpose_fused_qkvw(src, mp_size, fused_type, module)
|
||||
warning_once(f"Unrecognized fusedkqv weight type, default to using bloom type,"
|
||||
f"please check in prepare_tp_fused_qkvw() to avoid potential calculation errors")
|
||||
warning_once("Unrecognized fusedkqv weight type, default to using bloom type,"
|
||||
"please check in prepare_tp_fused_qkvw() to avoid potential calculation errors")
|
||||
return _bloom_type_transpose(src, mp_size)
|
||||
|
||||
|
||||
|
@ -497,7 +497,7 @@ def replace_transformer_layer(orig_layer_impl, model, checkpoint_dict, config, m
|
||||
if dist.is_initialized():
|
||||
dist.barrier()
|
||||
transformer_name = get_transformer_name(replaced_module)
|
||||
non_tp_ckpt_name = f'non-tp.pt'
|
||||
non_tp_ckpt_name = 'non-tp.pt'
|
||||
ckpt_files = [non_tp_ckpt_name]
|
||||
os.makedirs(config.save_mp_checkpoint_path, exist_ok=True)
|
||||
|
||||
|
@ -496,7 +496,7 @@ class TopKGate(Module):
|
||||
self.top2_2nd_expert_sampling = top2_2nd_expert_sampling
|
||||
|
||||
def _set_ep_group(self, ep_group):
|
||||
assert self.ep_group is None, f'Attempting to override an existing ep_group'
|
||||
assert self.ep_group is None, 'Attempting to override an existing ep_group'
|
||||
self.ep_group = ep_group
|
||||
|
||||
def forward(self,
|
||||
|
@ -70,10 +70,10 @@ def validate_args(args):
|
||||
error_messages = []
|
||||
|
||||
if args.folder is not None and len(args.folder_to_device_mapping) > 0:
|
||||
error_messages.append(f'--folder and --folder_to_device_mapping cannot be specified together.')
|
||||
error_messages.append('--folder and --folder_to_device_mapping cannot be specified together.')
|
||||
no_error = False
|
||||
elif args.folder is None and len(args.folder_to_device_mapping) == 0:
|
||||
error_messages.append(f'At least one of --folder or --folder_to_device_mapping must be specified.')
|
||||
error_messages.append('At least one of --folder or --folder_to_device_mapping must be specified.')
|
||||
no_error = False
|
||||
|
||||
# Validate --folder
|
||||
@ -102,7 +102,7 @@ def validate_args(args):
|
||||
print(f'Found {len(error_messages)} validation error(s)')
|
||||
# Validate --gpu, --use_gds
|
||||
if args.use_gds and not args.gpu:
|
||||
error_messages.append(f'--gpu must be set to transfer with --use_gds')
|
||||
error_messages.append('--gpu must be set to transfer with --use_gds')
|
||||
no_error = False
|
||||
|
||||
if not no_error:
|
||||
@ -201,7 +201,7 @@ def get_validated_args():
|
||||
args = refine_args(args)
|
||||
if not validate_args(args):
|
||||
quit()
|
||||
print(f'Successful validation of command line arguments')
|
||||
print('Successful validation of command line arguments')
|
||||
args.total_loops = args.warmup_loops + args.loops
|
||||
peer_tag = 'gpu' if args.gpu else 'process'
|
||||
args.mapping_dict = _get_mapping_dict(args)
|
||||
|
@ -54,7 +54,7 @@ class AIOBasic_Engine(object):
|
||||
task_log(tid,
|
||||
f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}')
|
||||
|
||||
task_log(tid, f'created deepspeed aio basic engine')
|
||||
task_log(tid, 'created deepspeed aio basic engine')
|
||||
|
||||
ctxt = {}
|
||||
ctxt[FILE] = filename
|
||||
|
@ -95,7 +95,7 @@ class AIOHandle_Engine(object):
|
||||
else:
|
||||
handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
|
||||
not args.sequential_requests, io_parallel)
|
||||
task_log(tid, f'Created DeepNVMe handle engine')
|
||||
task_log(tid, 'Created DeepNVMe handle engine')
|
||||
|
||||
bounce_buffer = None
|
||||
if args.gpu:
|
||||
|
@ -76,7 +76,7 @@ def io_engine_tasklet(pool_params):
|
||||
task_barrier(aio_barrier, num_processes)
|
||||
|
||||
# Run pre task
|
||||
task_log(tid, f'running pre-task')
|
||||
task_log(tid, 'running pre-task')
|
||||
io_engine = schedule["pre"]((args, tid))
|
||||
task_barrier(aio_barrier, num_processes)
|
||||
|
||||
@ -91,7 +91,7 @@ def io_engine_tasklet(pool_params):
|
||||
io_engine.ctxt["main_task_sec"].append(stop_time - start_time)
|
||||
|
||||
# Run post task
|
||||
task_log(tid, f'running post-task')
|
||||
task_log(tid, 'running post-task')
|
||||
schedule["post"]((args, tid, io_engine))
|
||||
task_barrier(aio_barrier, num_processes)
|
||||
|
||||
|
@ -101,7 +101,7 @@ def get_metric(file, metric):
|
||||
|
||||
|
||||
def validate_args(args):
|
||||
if not args.metric in PERF_METRICS:
|
||||
if args.metric not in PERF_METRICS:
|
||||
print(f'{args.metric} is not a valid performance metrics')
|
||||
return False
|
||||
|
||||
|
@ -12,7 +12,7 @@ from .io_engine import io_engine_multiprocessing
|
||||
|
||||
|
||||
def ds_io_main():
|
||||
print(f'Testing DeepNVMe python frontend')
|
||||
print('Testing DeepNVMe python frontend')
|
||||
|
||||
args = get_validated_args()
|
||||
mp.set_start_method('spawn', force=True)
|
||||
|
@ -15,7 +15,7 @@ from deepspeed.io import FastFileWriter
|
||||
class Torch_FastIO_Engine(object):
|
||||
|
||||
def __init__(self, args, tid, read_op):
|
||||
assert read_op is False, f'Read operation is not currently supported'
|
||||
assert read_op is False, 'Read operation is not currently supported'
|
||||
self.ctxt = self._create_context(args, tid, read_op)
|
||||
self.zipfile_serialization = not args.torch_legacy_save
|
||||
|
||||
@ -69,7 +69,7 @@ class Torch_FastIO_Engine(object):
|
||||
|
||||
fast_io_buffer = create_page_locked_tensor(args.fast_io_size, args.use_accelerator_pin_memory, aio_handle)
|
||||
|
||||
task_log(tid, f'created torch_fastio engine')
|
||||
task_log(tid, 'created torch_fastio engine')
|
||||
|
||||
ctxt = {}
|
||||
ctxt[FILE] = filename
|
||||
|
@ -54,7 +54,7 @@ class TorchIO_Engine(object):
|
||||
f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}',
|
||||
force=True)
|
||||
|
||||
task_log(tid, f'created torch_io engine')
|
||||
task_log(tid, 'created torch_io engine')
|
||||
|
||||
ctxt = {}
|
||||
ctxt[FILE] = filename
|
||||
|
@ -126,7 +126,7 @@ class FP_Quantize(Quantizer):
|
||||
|
||||
if scale is not None:
|
||||
assert input_q.numel() == fp_out.numel(), \
|
||||
f'[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!'
|
||||
'[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!'
|
||||
input_q = torch.cat([input_q.reshape(-1, self.group_size), scale], dim=-1).contiguous()
|
||||
fp_quant_module.dequantize(fp_out, input_q, self.group_size, q_mantisa_bits, q_bits - q_mantisa_bits - 1)
|
||||
return fp_out
|
||||
@ -159,7 +159,7 @@ class FP_Quantize(Quantizer):
|
||||
|
||||
if scale is not None:
|
||||
assert input_q.numel() == fp_out.numel(), \
|
||||
f'[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!'
|
||||
'[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!'
|
||||
input_q = torch.cat([input_q.reshape(-1, self.group_size), scale], dim=-1).contiguous()
|
||||
|
||||
fp_quant_module.selective_dequantize(fp_out, input_q, indexes, self.group_size, q_mantisa_bits,
|
||||
|
@ -142,7 +142,7 @@ class FixedSparsityConfig(SparsityConfig):
|
||||
|
||||
if (num_different_global_patterns > 1 and not different_layout_per_head):
|
||||
raise ValueError(
|
||||
f'Number of different layouts cannot be more than one when you have set a single layout for all heads! Set different_layout_per_head to True.'
|
||||
'Number of different layouts cannot be more than one when you have set a single layout for all heads! Set different_layout_per_head to True.'
|
||||
)
|
||||
if (num_different_global_patterns > (num_local_blocks // num_global_blocks)):
|
||||
raise ValueError(
|
||||
|
@ -103,7 +103,7 @@ class TritonSelfAttention(nn.Module):
|
||||
|
||||
# triton autotune table update for score/context matmul
|
||||
if triton_autotune:
|
||||
print(f"running triton autotune for regular attention kernel")
|
||||
print("running triton autotune for regular attention kernel")
|
||||
__class__._triton_autotune(2, self.config.max_out_tokens, self.head_size, self.config.hidden_size,
|
||||
self.triangular_masking, self.scale)
|
||||
|
||||
|
@ -1122,7 +1122,7 @@ def configure(
|
||||
#print configuration only once
|
||||
see_memory_usage("After configuration", force=False)
|
||||
if dist.get_rank() == 0:
|
||||
logger.info(f"Activation Checkpointing Information")
|
||||
logger.info("Activation Checkpointing Information")
|
||||
logger.info(f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {CPU_CHECKPOINT}")
|
||||
logger.info(f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers")
|
||||
logger.info(f"----Synchronization {SYNCHRONIZE}")
|
||||
|
@ -54,7 +54,7 @@ class BF16_Optimizer(ZeROOptimizer):
|
||||
self.param_names = param_names
|
||||
self.using_real_optimizer = not isinstance(self.optimizer, DummyOptim)
|
||||
|
||||
assert bfloat16_config.enabled, f"BF16Optimizer: requires bfloat16 to be enabled"
|
||||
assert bfloat16_config.enabled, "BF16Optimizer: requires bfloat16 to be enabled"
|
||||
assert grad_acc_dtype in [torch.float32, torch.bfloat16
|
||||
], f"BF16Optimizer: Unsupported gradient accumulation data type: {grad_acc_dtype}"
|
||||
self.grad_acc_dtype = grad_acc_dtype
|
||||
@ -504,13 +504,13 @@ class BF16_Optimizer(ZeROOptimizer):
|
||||
current_rank_sd = state_dict_list[dp_rank]
|
||||
|
||||
ckpt_version = current_rank_sd.get(DS_VERSION, False)
|
||||
assert ckpt_version, f"Empty ds_version in checkpoint, not clear how to proceed"
|
||||
assert ckpt_version, "Empty ds_version in checkpoint, not clear how to proceed"
|
||||
ckpt_version = pkg_version.parse(ckpt_version)
|
||||
|
||||
self.clip_grad = current_rank_sd.get(CLIP_GRAD, self.clip_grad)
|
||||
|
||||
if load_optimizer_states:
|
||||
print(f"_load_legacy_checkpoint current_rank_sd[BASE_OPTIMIZER_STATE]")
|
||||
print("_load_legacy_checkpoint current_rank_sd[BASE_OPTIMIZER_STATE]")
|
||||
self.optimizer.load_state_dict(current_rank_sd[BASE_OPTIMIZER_STATE])
|
||||
|
||||
if load_from_fp32_weights:
|
||||
|
@ -40,7 +40,7 @@ class CheckpointSize(object):
|
||||
|
||||
def init_decoupled_checkpoint(config_params, dp_writer_config, save_event, save_queue, optimize_dp_state):
|
||||
checkpoint_engine = FastCheckpointEngine(config_params, dp_writer_config, optimize_dp_state)
|
||||
print(f'Created FastCheckpointEngine for Decoupled Checkpointing')
|
||||
print('Created FastCheckpointEngine for Decoupled Checkpointing')
|
||||
save_path_list = []
|
||||
while True:
|
||||
(save_info, event_type) = save_queue.get()
|
||||
|
@ -43,7 +43,7 @@ class NebulaCheckpointEngine(CheckpointEngine):
|
||||
self.checkpoint = torch_nebula.Checkpoint(info.tag, -2)
|
||||
|
||||
def save(self, state_dict, path: str):
|
||||
log_dist(f"[Nebula] Create dummy files for loading.")
|
||||
log_dist("[Nebula] Create dummy files for loading.")
|
||||
torch.save("", path)
|
||||
|
||||
tag = _get_tag_from_path(path)
|
||||
@ -84,7 +84,7 @@ class NebulaCheckpointEngine(CheckpointEngine):
|
||||
checkpoint = torch_nebula.get_latest_checkpoint(persist_path=self.nebula_load_path)
|
||||
if checkpoint is None or (checkpoint is not None and checkpoint.tag == ''):
|
||||
logger.info(
|
||||
f"Unable to find latest checkpoint from Nebula tier3, try to get latest checkpoint again from nebula tier1 path!"
|
||||
"Unable to find latest checkpoint from Nebula tier3, try to get latest checkpoint again from nebula tier1 path!"
|
||||
)
|
||||
# nebula tier1 latest
|
||||
checkpoint = torch_nebula.get_latest_checkpoint()
|
||||
@ -103,6 +103,6 @@ class NebulaCheckpointEngine(CheckpointEngine):
|
||||
logger.info(f"[Nebula] all files for {tag} are saved in tier1. It is ready to start persisting")
|
||||
commit_rls = self.checkpoint.commit()
|
||||
if not commit_rls:
|
||||
logger.error(f"[Nebula] failed to commit the checkpoint, please check the log.")
|
||||
logger.error("[Nebula] failed to commit the checkpoint, please check the log.")
|
||||
return False
|
||||
return commit_rls
|
||||
|
@ -169,7 +169,7 @@ class ScientificNotationEncoder(json.JSONEncoder):
|
||||
x = [f'\n{prefix}"{k}": {self.iterencode(v, level=level)}' for k, v in o.items()]
|
||||
return "{" + ", ".join(x) + f"\n{prefix_close}" + "}"
|
||||
elif isinstance(o, collections.abc.Sequence) and not isinstance(o, str):
|
||||
return f"[{ f', '.join(map(self.iterencode, o)) }]"
|
||||
return f"[{ ', '.join(map(self.iterencode, o)) }]"
|
||||
return "\n, ".join(super().iterencode(o, _one_shot))
|
||||
|
||||
|
||||
|
@ -73,7 +73,7 @@ class CurriculumScheduler(object):
|
||||
f"Curriculum learning with fixed_root schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_ROOT_DEGREE}'"
|
||||
if config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP] % 8 != 0:
|
||||
logger.warning(
|
||||
f'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.'
|
||||
'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.'
|
||||
)
|
||||
self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[CURRICULUM_LEARNING_SCHEDULE_CONFIG]
|
||||
elif config[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_LINEAR:
|
||||
@ -91,7 +91,7 @@ class CurriculumScheduler(object):
|
||||
f"Curriculum learning with fixed_linear schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP}'"
|
||||
if config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP] % 8 != 0:
|
||||
logger.warning(
|
||||
f'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.'
|
||||
'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.'
|
||||
)
|
||||
self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[CURRICULUM_LEARNING_SCHEDULE_CONFIG]
|
||||
elif config[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_CUSTOM:
|
||||
|
@ -107,7 +107,7 @@ class Eigenvalue(object):
|
||||
# Disable eigenvalue if the model doesn't support second order gradients computation,
|
||||
# e.g. when enabling DS transformer kernel.
|
||||
if len(grads) == 0 or len(params) == 0:
|
||||
log_dist(f'The model does NOT support eigenvalue computation.', ranks=[0], level=logging.WARNING)
|
||||
log_dist('The model does NOT support eigenvalue computation.', ranks=[0], level=logging.WARNING)
|
||||
return []
|
||||
|
||||
i = 0
|
||||
|
@ -266,7 +266,7 @@ class DeepSpeedEngine(Module):
|
||||
self._do_sanity_check()
|
||||
if self.autotp_size() > 1:
|
||||
self._configure_tensor_parallel(model, self.tensor_parallel_config())
|
||||
see_memory_usage(f"DeepSpeed Engine: After args sanity test", force=self.memory_breakdown())
|
||||
see_memory_usage("DeepSpeed Engine: After args sanity test", force=self.memory_breakdown())
|
||||
if mpu is not None:
|
||||
if self.elasticity_enabled():
|
||||
if not self.is_elastic_model_parallel_supported():
|
||||
@ -280,7 +280,7 @@ class DeepSpeedEngine(Module):
|
||||
self.monitor = MonitorMaster(self._config.monitor_config)
|
||||
|
||||
see_memory_usage(
|
||||
f"DeepSpeed Engine: Before configure distributed model",
|
||||
"DeepSpeed Engine: Before configure distributed model",
|
||||
force=self.memory_breakdown(),
|
||||
)
|
||||
|
||||
@ -298,7 +298,7 @@ class DeepSpeedEngine(Module):
|
||||
|
||||
self._get_model_parameters()
|
||||
|
||||
see_memory_usage(f"DeepSpeed Engine: After configure distributed model")
|
||||
see_memory_usage("DeepSpeed Engine: After configure distributed model")
|
||||
|
||||
# Configure wall clock timers
|
||||
self.timers = SynchronizedWallClockTimer()
|
||||
@ -507,7 +507,7 @@ class DeepSpeedEngine(Module):
|
||||
broadcast_and_check(args, bcast_rank, bcast_group)
|
||||
broadcast_and_check(kwargs, bcast_rank, bcast_group)
|
||||
|
||||
logger.info(f":The Dataloader has passed the TP group consistency check.")
|
||||
logger.info(":The Dataloader has passed the TP group consistency check.")
|
||||
self.first_dataloader_check.remove()
|
||||
|
||||
self.first_dataloader_check = self.module.register_forward_pre_hook(check_dataloader_inputs_same_across_ranks,
|
||||
@ -577,7 +577,7 @@ class DeepSpeedEngine(Module):
|
||||
"""
|
||||
if train_batch_size % (self.train_micro_batch_size_per_gpu() * self.dp_world_size) != 0:
|
||||
#print(f'{train_batch_size=} {self.train_micro_batch_size_per_gpu()=} {self.dp_world_size=}')
|
||||
raise ValueError(f'Train batch size must be divisible by micro-batch data parallelism')
|
||||
raise ValueError('Train batch size must be divisible by micro-batch data parallelism')
|
||||
new_gas = train_batch_size // (self.train_micro_batch_size_per_gpu() * self.dp_world_size)
|
||||
# overwrite config
|
||||
self._config.train_batch_size = train_batch_size
|
||||
@ -736,7 +736,7 @@ class DeepSpeedEngine(Module):
|
||||
|
||||
if random_ltd_config[RANDOM_LTD_LAYER_TOKEN_LR_SCHEDULE][RANDOM_LTD_LAYER_TOKEN_LR_ENABLED]:
|
||||
assert self.client_lr_scheduler is None
|
||||
raise ValueError(f'not yet support')
|
||||
raise ValueError('not yet support')
|
||||
#self.lr_scheduler = lr_schedules.WarmupLayerTokenDecayLR(self.optimizer, self.random_ltd_scheduler)
|
||||
|
||||
def get_data_parallel_rank(self):
|
||||
@ -1534,21 +1534,21 @@ class DeepSpeedEngine(Module):
|
||||
|
||||
optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters)
|
||||
if not self.fp16_enabled():
|
||||
logger.warning(f"Currently the convergence of 1-bit Adam is only verified under FP16")
|
||||
logger.warning("Currently the convergence of 1-bit Adam is only verified under FP16")
|
||||
elif self.optimizer_name() == ZERO_ONE_ADAM_OPTIMIZER:
|
||||
assert not self.zero_optimization(), "0/1 Adam is not compatible with ZeRO"
|
||||
from deepspeed.runtime.fp16.onebit.zoadam import ZeroOneAdam
|
||||
|
||||
optimizer = ZeroOneAdam(model_parameters, self, **optimizer_parameters)
|
||||
if not self.fp16_enabled():
|
||||
logger.warning(f'Currently the convergence of 0/1 Adam is only verified under FP16')
|
||||
logger.warning('Currently the convergence of 0/1 Adam is only verified under FP16')
|
||||
elif self.optimizer_name() == ONEBIT_LAMB_OPTIMIZER:
|
||||
assert not self.zero_optimization(), "1bit-Lamb is not compatible with ZeRO"
|
||||
from deepspeed.runtime.fp16.onebit.lamb import OnebitLamb
|
||||
|
||||
optimizer = OnebitLamb(model_parameters, self, **optimizer_parameters)
|
||||
if not self.fp16_enabled():
|
||||
logger.warning(f"Currently the convergence of 1-bit Lamb is only verified under FP16")
|
||||
logger.warning("Currently the convergence of 1-bit Lamb is only verified under FP16")
|
||||
elif self.optimizer_name() == LION_OPTIMIZER:
|
||||
if self.zero_use_cpu_optimizer():
|
||||
from deepspeed.ops.lion import DeepSpeedCPULion
|
||||
@ -1560,19 +1560,19 @@ class DeepSpeedEngine(Module):
|
||||
try:
|
||||
from mup import MuAdam
|
||||
except ImportError:
|
||||
logger.error(f"Install mup to use MuAdam optimizer")
|
||||
logger.error("Install mup to use MuAdam optimizer")
|
||||
optimizer = MuAdam(model_parameters, **optimizer_parameters)
|
||||
elif self.optimizer_name() == MUADAMW_OPTIMIZER:
|
||||
try:
|
||||
from mup import MuAdamW
|
||||
except ImportError:
|
||||
logger.error(f"Install mup to use MuAdamW optimizer")
|
||||
logger.error("Install mup to use MuAdamW optimizer")
|
||||
optimizer = MuAdamW(model_parameters, **optimizer_parameters)
|
||||
elif self.optimizer_name() == MUSGD_OPTIMIZER:
|
||||
try:
|
||||
from mup import MuSGD
|
||||
except ImportError:
|
||||
logger.error(f"Install mup to use MuSGD optimizer")
|
||||
logger.error("Install mup to use MuSGD optimizer")
|
||||
optimizer = MuSGD(model_parameters, **optimizer_parameters)
|
||||
else:
|
||||
torch_optimizer = getattr(torch.optim, self.optimizer_name())
|
||||
@ -1630,7 +1630,7 @@ class DeepSpeedEngine(Module):
|
||||
if isinstance(optimizer, fused_opts) \
|
||||
or self.optimizer_name() in [ONEBIT_ADAM_OPTIMIZER, ZERO_ONE_ADAM_OPTIMIZER]:
|
||||
if self.dynamic_loss_scale():
|
||||
log_dist(f'Creating fp16 optimizer with dynamic loss scale', ranks=[0])
|
||||
log_dist('Creating fp16 optimizer with dynamic loss scale', ranks=[0])
|
||||
timers = self.timers if self.wall_clock_breakdown() else NoopTimer()
|
||||
optimizer = FP16_Optimizer(
|
||||
optimizer,
|
||||
@ -1658,7 +1658,7 @@ class DeepSpeedEngine(Module):
|
||||
has_moe_layers=self.has_moe_layers,
|
||||
)
|
||||
else:
|
||||
log_dist(f'Creating fp16 unfused optimizer with dynamic loss scale', ranks=[0])
|
||||
log_dist('Creating fp16 unfused optimizer with dynamic loss scale', ranks=[0])
|
||||
optimizer = FP16_UnfusedOptimizer(
|
||||
optimizer,
|
||||
deepspeed=self,
|
||||
@ -2214,7 +2214,7 @@ class DeepSpeedEngine(Module):
|
||||
if self.is_gradient_accumulation_boundary():
|
||||
if self.global_rank == 0:
|
||||
self.summary_events = [(
|
||||
f"Train/Samples/train_loss",
|
||||
"Train/Samples/train_loss",
|
||||
self.losses.item(),
|
||||
self.global_samples,
|
||||
)]
|
||||
@ -2274,7 +2274,7 @@ class DeepSpeedEngine(Module):
|
||||
assert not self.zero_optimization_partition_gradients(), \
|
||||
f"no_sync context manager is incompatible with gradient partitioning logic of ZeRO stage {self.zero_optimization_stage()}"
|
||||
|
||||
assert not self.inside_no_sync_ctxt, f"no_sync context manager reentry is unsupported"
|
||||
assert not self.inside_no_sync_ctxt, "no_sync context manager reentry is unsupported"
|
||||
|
||||
self.inside_no_sync_ctxt = True
|
||||
try:
|
||||
@ -2456,7 +2456,7 @@ class DeepSpeedEngine(Module):
|
||||
|
||||
if (self.eigenvalue_enabled() and (self.gas_boundary_ctr % self.eigenvalue_gas_boundary_resolution() == 0)
|
||||
and self.quantizer.any_precision_switch()):
|
||||
log_dist(f"computing eigenvalue...", ranks=[0])
|
||||
log_dist("computing eigenvalue...", ranks=[0])
|
||||
self.block_eigenvalue = self.eigenvalue.compute_eigenvalue(self.module, self.device,
|
||||
self.optimizer.cur_scale)
|
||||
|
||||
@ -2482,11 +2482,11 @@ class DeepSpeedEngine(Module):
|
||||
if self.monitor.enabled:
|
||||
if self.is_gradient_accumulation_boundary():
|
||||
if self.global_rank == 0:
|
||||
self.summary_events = [(f"Train/Samples/lr", self.get_lr()[0], self.global_samples)]
|
||||
self.summary_events = [("Train/Samples/lr", self.get_lr()[0], self.global_samples)]
|
||||
|
||||
if self.fp16_enabled() and hasattr(self.optimizer, "cur_scale"):
|
||||
self.summary_events.append((
|
||||
f"Train/Samples/loss_scale",
|
||||
"Train/Samples/loss_scale",
|
||||
self.optimizer.cur_scale,
|
||||
self.global_samples,
|
||||
))
|
||||
@ -2578,27 +2578,27 @@ class DeepSpeedEngine(Module):
|
||||
if self.global_rank == 0:
|
||||
self.summary_events = [
|
||||
(
|
||||
f"Train/Samples/elapsed_time_ms_forward",
|
||||
"Train/Samples/elapsed_time_ms_forward",
|
||||
self.timers(FORWARD_GLOBAL_TIMER).elapsed(reset=False),
|
||||
self.global_samples,
|
||||
),
|
||||
(
|
||||
f"Train/Samples/elapsed_time_ms_backward",
|
||||
"Train/Samples/elapsed_time_ms_backward",
|
||||
self.timers(BACKWARD_GLOBAL_TIMER).elapsed(reset=False),
|
||||
self.global_samples,
|
||||
),
|
||||
(
|
||||
f"Train/Samples/elapsed_time_ms_backward_inner",
|
||||
"Train/Samples/elapsed_time_ms_backward_inner",
|
||||
self.timers(BACKWARD_INNER_GLOBAL_TIMER).elapsed(reset=False),
|
||||
self.global_samples,
|
||||
),
|
||||
(
|
||||
f"Train/Samples/elapsed_time_ms_backward_allreduce",
|
||||
"Train/Samples/elapsed_time_ms_backward_allreduce",
|
||||
self.timers(BACKWARD_REDUCE_GLOBAL_TIMER).elapsed(reset=False),
|
||||
self.global_samples,
|
||||
),
|
||||
(
|
||||
f"Train/Samples/elapsed_time_ms_step",
|
||||
"Train/Samples/elapsed_time_ms_step",
|
||||
self.timers(STEP_GLOBAL_TIMER).elapsed(reset=False),
|
||||
self.global_samples,
|
||||
),
|
||||
@ -3239,7 +3239,7 @@ class DeepSpeedEngine(Module):
|
||||
if load_optimizer_states:
|
||||
deepspeed_states.append('optimizer')
|
||||
|
||||
client_state = {key: value for key, value in checkpoint.items() if not key in deepspeed_states}
|
||||
client_state = {key: value for key, value in checkpoint.items() if key not in deepspeed_states}
|
||||
|
||||
if optim_checkpoint is not None:
|
||||
client_state['optimizer'] = optim_checkpoint['optimizer']
|
||||
@ -3739,7 +3739,7 @@ class DeepSpeedEngine(Module):
|
||||
numel += param.ds_numel if hasattr(param, "ds_numel") else param.numel()
|
||||
shape = param.ds_shape if hasattr(param, "ds_shape") else param.shape
|
||||
if param not in self.param_names:
|
||||
raise ValueError(f"failed to find optimizer param in named params")
|
||||
raise ValueError("failed to find optimizer param in named params")
|
||||
name = self.param_names[param]
|
||||
param_shapes[name] = shape
|
||||
|
||||
|
@ -210,7 +210,7 @@ class DynamicLossScaler(LossScalerBase):
|
||||
# we still create a scaler for other dtypes (fp32, bf16) which does not perform any scaling.
|
||||
def CreateLossScaler(dtype, static_loss_scale, dynamic_scaling, dynamic_loss_args):
|
||||
if dtype == torch.half and dynamic_scaling:
|
||||
assert dynamic_loss_args is not None, f"Dynamic loss scaling parameters must be defined."
|
||||
assert dynamic_loss_args is not None, "Dynamic loss scaling parameters must be defined."
|
||||
return DynamicLossScaler(dtype=dtype, **dynamic_loss_args)
|
||||
|
||||
loss_scale_value = static_loss_scale if dtype == torch.half else 1.0
|
||||
|
@ -209,7 +209,7 @@ def get_config_from_args(args):
|
||||
if not hasattr(args, LR_SCHEDULE) or args.lr_schedule is None:
|
||||
return None, '--{} not specified on command line'.format(LR_SCHEDULE)
|
||||
|
||||
if not args.lr_schedule in VALID_LR_SCHEDULES:
|
||||
if args.lr_schedule not in VALID_LR_SCHEDULES:
|
||||
return None, '{} is not supported LR schedule'.format(args.lr_schedule)
|
||||
|
||||
config = {}
|
||||
@ -227,16 +227,16 @@ def get_config_from_args(args):
|
||||
|
||||
|
||||
def get_lr_from_config(config):
|
||||
if not 'type' in config:
|
||||
if 'type' not in config:
|
||||
return None, 'LR schedule type not defined in config'
|
||||
|
||||
if not 'params' in config:
|
||||
if 'params' not in config:
|
||||
return None, 'LR schedule params not defined in config'
|
||||
|
||||
lr_schedule = config['type']
|
||||
lr_params = config['params']
|
||||
|
||||
if not lr_schedule in VALID_LR_SCHEDULES:
|
||||
if lr_schedule not in VALID_LR_SCHEDULES:
|
||||
return None, '{} is not a valid LR schedule'.format(lr_schedule)
|
||||
|
||||
if lr_schedule == LR_RANGE_TEST:
|
||||
|
@ -57,7 +57,7 @@ class DataParallelWriterFactory(object):
|
||||
return self._create_config(1, 0) if dp_rank == 0 else None
|
||||
|
||||
assert self._uni_parallel_info.pure_dp, \
|
||||
f'3D parallelism is not yet supported for data parallel checkpointing.'
|
||||
'3D parallelism is not yet supported for data parallel checkpointing.'
|
||||
|
||||
if self._parallel_unit == CheckpointDataParallel.REPLICA or ep_info.ep_world_size == 1:
|
||||
return self._get_parallel_write_for_ddp(ep_info.dp_world_size, ep_info.dp_rank)
|
||||
|
@ -217,7 +217,7 @@ class PipelineEngine(DeepSpeedEngine):
|
||||
# set activation_checkpoint_func to non_reentrant_checkpoint func.
|
||||
self.module.activation_checkpoint_func = ds_checkpointing.non_reentrant_checkpoint
|
||||
if self.grid.get_global_rank() == 0:
|
||||
logger.info(f'CONFIG: activation_checkpoint_func=non_reentrant_checkpoint')
|
||||
logger.info('CONFIG: activation_checkpoint_func=non_reentrant_checkpoint')
|
||||
if self.module.activation_checkpoint_interval > 0:
|
||||
self.module._precompute_checkpointable_values()
|
||||
|
||||
@ -359,7 +359,7 @@ class PipelineEngine(DeepSpeedEngine):
|
||||
The arithmetic mean of the losses computed this batch.
|
||||
"""
|
||||
if not torch._C.is_grad_enabled():
|
||||
raise RuntimeError(f'train_batch() requires gradients enabled. Use eval_batch() instead.')
|
||||
raise RuntimeError('train_batch() requires gradients enabled. Use eval_batch() instead.')
|
||||
|
||||
# Curriculum learning could change activation shape
|
||||
if self.curriculum_enabled_legacy():
|
||||
@ -408,8 +408,8 @@ class PipelineEngine(DeepSpeedEngine):
|
||||
|
||||
# Monitoring
|
||||
if self.global_rank == 0 and self.monitor.enabled:
|
||||
self.summary_events = [(f'Train/Samples/train_loss', self.agg_train_loss.mean().item(),
|
||||
self.global_samples)]
|
||||
self.summary_events = [('Train/Samples/train_loss', self.agg_train_loss.mean().item(), self.global_samples)
|
||||
]
|
||||
self.monitor.write_events(self.summary_events)
|
||||
|
||||
if self.steps_per_print() is not None and self.wall_clock_breakdown(
|
||||
@ -498,7 +498,7 @@ class PipelineEngine(DeepSpeedEngine):
|
||||
eval_output = self._bcast_pipe_scalar(eval_output)
|
||||
|
||||
if self.global_rank == 0 and self.monitor.enabled:
|
||||
self.summary_events = [(f'Train/Samples/eval_loss', eval_output.mean().item(), self.global_samples)]
|
||||
self.summary_events = [('Train/Samples/eval_loss', eval_output.mean().item(), self.global_samples)]
|
||||
self.monitor.write_events(self.summary_events)
|
||||
|
||||
# Restore the training iterator
|
||||
@ -1220,10 +1220,9 @@ class PipelineEngine(DeepSpeedEngine):
|
||||
self._force_grad_boundary = False
|
||||
|
||||
if self.global_rank == 0 and self.monitor.enabled:
|
||||
self.summary_events = [(f'Train/Samples/lr', self.get_lr()[0], self.global_samples)]
|
||||
self.summary_events = [('Train/Samples/lr', self.get_lr()[0], self.global_samples)]
|
||||
if self.fp16_enabled() and hasattr(self.optimizer, 'cur_scale'):
|
||||
self.summary_events.append(
|
||||
(f'Train/Samples/loss_scale', self.optimizer.cur_scale, self.global_samples))
|
||||
self.summary_events.append(('Train/Samples/loss_scale', self.optimizer.cur_scale, self.global_samples))
|
||||
self.monitor.write_events(self.summary_events)
|
||||
|
||||
if self.wall_clock_breakdown():
|
||||
|
@ -1266,7 +1266,7 @@ class UlyssesSPFwdLossBwdWithLogits:
|
||||
|
||||
def sp_fwd_loss_bwd(self, batch) -> torch.Tensor:
|
||||
|
||||
see_memory_usage(f"entered sp_fwd_loss_bwd", force=True)
|
||||
see_memory_usage("entered sp_fwd_loss_bwd", force=True)
|
||||
|
||||
# ensure shapes are correct
|
||||
if not (batch["input_ids"].shape == batch["position_ids"].shape == batch["labels"].shape):
|
||||
|
@ -102,7 +102,7 @@ class OptimizerStateSwapInfo(object):
|
||||
def get_or_create_gradient_paths(self, offsets, lengths):
|
||||
gradient_paths = []
|
||||
for offset, length in zip(offsets, lengths):
|
||||
if not offset in self.swapped_gradients.keys():
|
||||
if offset not in self.swapped_gradients.keys():
|
||||
path = os.path.join(self.swap_folder, f'{self.param_id}_gradient_{offset}_{length}.tensor.swp')
|
||||
self.swapped_gradients[offset] = FlattenedTensorSwapInfo(path, length, offset)
|
||||
|
||||
@ -233,7 +233,7 @@ class OptimizerSwapper(object):
|
||||
self.timer_names.update(gradient_swapper.get_timer_names())
|
||||
|
||||
def _swap_out_gradients(self, parameter, gradient_offsets, gradient_tensors, gradient_swapper):
|
||||
if not OptimizerSwapper.parameter_id(parameter) in self.swap_params_info.keys():
|
||||
if OptimizerSwapper.parameter_id(parameter) not in self.swap_params_info.keys():
|
||||
return
|
||||
|
||||
swap_info = self.swap_params_info[OptimizerSwapper.parameter_id(parameter)]
|
||||
@ -471,7 +471,7 @@ class OptimizerSwapper(object):
|
||||
)
|
||||
|
||||
def _get_state_tensors(self, parameter):
|
||||
if not parameter in self.optimizer.state:
|
||||
if parameter not in self.optimizer.state:
|
||||
return []
|
||||
|
||||
tensor_list = []
|
||||
@ -490,7 +490,7 @@ class OptimizerSwapper(object):
|
||||
|
||||
def _create_param_swap_info(self, parameter, numel):
|
||||
param_id = OptimizerSwapper.parameter_id(parameter)
|
||||
assert not param_id in self.swap_params_info
|
||||
assert param_id not in self.swap_params_info
|
||||
|
||||
self.swap_params_info[param_id] = OptimizerStateSwapInfo(parameter=parameter,
|
||||
numel=numel,
|
||||
|
@ -399,8 +399,8 @@ class AsyncPartitionedParameterSwapper(object):
|
||||
self.partitioned_swap_pool = SwapBufferPool([self.partitioned_swap_buffer])
|
||||
|
||||
def swap_out_partitioned_params(self, dst_fp16_params, src_fp32_params):
|
||||
assert self.partitioned_swap_buffer is not None, f'partitioned swap buffers for fp16 params not initialized'
|
||||
assert self.partitioned_swap_pool is not None, f'partitioned swap pool for fp16 params not initialized'
|
||||
assert self.partitioned_swap_buffer is not None, 'partitioned swap buffers for fp16 params not initialized'
|
||||
assert self.partitioned_swap_pool is not None, 'partitioned swap pool for fp16 params not initialized'
|
||||
assert len(dst_fp16_params) == len(src_fp32_params), \
|
||||
f'mismatch in number of fp16 params {len(dst_fp16_params)} and fp32 params {len(src_fp32_params)}'
|
||||
|
||||
|
@ -213,7 +213,7 @@ class PipelinedOptimizerSwapper(OptimizerSwapper):
|
||||
count=required_buffer_count,
|
||||
dtype=parameter.dtype)
|
||||
assert allocated_buffers is not None, \
|
||||
f"PipelinedOptimizerSwapper ran out of swap buffers, try increasing 'buffer_count'"
|
||||
"PipelinedOptimizerSwapper ran out of swap buffers, try increasing 'buffer_count'"
|
||||
|
||||
state_buffers = allocated_buffers[:num_swap_tensors]
|
||||
param_info.set_swap_buffers(state_buffers, aligned_numel)
|
||||
|
@ -30,7 +30,7 @@ def swap_out_tensors(swap_handle, tensor_buffers, swap_paths):
def print_object(obj, name, exclude_list=[]):
logger.info('{}:'.format(name))
for arg in sorted(vars(obj)):
if not arg in exclude_list:
if arg not in exclude_list:
dots = '.' * (29 - len(arg))
logger.info(' {} {} {}'.format(arg, dots, getattr(obj, arg)))

@ -55,7 +55,7 @@ class SwapBuffer(object):

def allocate_tensor(self, swap_path, numel, aligned_numel):
assert self.has_space(aligned_numel)
assert not self.offset in self.swap_tensors
assert self.offset not in self.swap_tensors

allocate_offset = self.offset
swap_tensor = self.buffer.narrow(0, allocate_offset, aligned_numel)

@ -846,7 +846,7 @@ def get_global_norm_of_tensors(input_tensors, norm_type=2, mpu=None, use_graph=F
Total norm of the tensors (viewed as a single vector).
"""
assert isinstance(input_tensors, Iterable), f'expected Iterable type not {type(input_tensors)}'
assert all([torch.is_tensor(t) for t in input_tensors]), f'expected list of only tensors'
assert all([torch.is_tensor(t) for t in input_tensors]), 'expected list of only tensors'

norm_type = float(norm_type)
all_norms = []

@ -85,7 +85,7 @@ class ContiguousMemoryAllocator(object):

assert tensor_id in self.tensor_map.keys(), "No such tensor allocated by the allocator."
assert tensor.numel() >= numel, "Assert tensor buffer does is not large enough"
assert not tensor_id in self.id_to_params.keys(), "This tensor has already been assigned to a param"
assert tensor_id not in self.id_to_params.keys(), "This tensor has already been assigned to a param"

self.id_to_params[tensor_id] = [param]

@ -47,7 +47,7 @@ class MiCS_AllGatherCoalescedHandle(AllGatherCoalescedHandle):
instrument_w_nvtx(self.allgather_handle.wait)()
except (ValueError, RuntimeError) as e:
log_dist(
f"WARNING: Runtime Error while waiting the collective all-gather, possibly due to the _IllegalWork",
"WARNING: Runtime Error while waiting the collective all-gather, possibly due to the _IllegalWork",
ranks=[0])
log_dist(f"Error message: {e}", ranks=[0])

@ -158,7 +158,7 @@ class MiCS_Init(Init):

if sequence_data_parallel_group is not None:
logger.warning(
f"sequence_data_parallel_group' is deprecated and will be removed. Use 'data_parallel_group' instead.")
"sequence_data_parallel_group' is deprecated and will be removed. Use 'data_parallel_group' instead.")
if data_parallel_group is not None:
raise ValueError(
"Both 'data_parallel_group' and 'sequence_data_parallel_group' were specified. Please provide only one of these arguments."

@ -339,7 +339,7 @@ class MiCS_Offload(DeepSpeedZeRoOffload):
""" overload the parent class function for convert the parameters

"""
log_dist(f'Convert to zero parameters from MiCS Offload manager', ranks=[0])
log_dist('Convert to zero parameters from MiCS Offload manager', ranks=[0])
non_zero_params = [p for p in module.parameters() if not is_zero_param(p)]
if non_zero_params:
zero_params = [p for p in module.parameters() if is_zero_param(p)]

@ -1020,7 +1020,7 @@ class Init(InsertPostInitMethodToModuleSubClasses):

if sequence_data_parallel_group is not None:
logger.warning(
f"sequence_data_parallel_group' is deprecated and will be removed. Use 'data_parallel_group' instead.")
"sequence_data_parallel_group' is deprecated and will be removed. Use 'data_parallel_group' instead.")
if data_parallel_group is not None:
raise ValueError(
"Both 'data_parallel_group' and 'sequence_data_parallel_group' were specified. Please provide only one of these arguments."
@ -459,7 +459,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
self.offloaded_states: Set[OffloadDeviceEnum] = set()

if dist.get_rank(group=self.dp_process_group) == 0:
see_memory_usage(f"After initializing ZeRO optimizer", force=True)
see_memory_usage("After initializing ZeRO optimizer", force=True)

def destroy(self):
self.parameter_offload.destroy()

@ -551,7 +551,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
dist.barrier()

if dist.get_rank() == 0:
logger.info(f"optimizer state initialized")
logger.info("optimizer state initialized")

# IPG
if self.contiguous_gradients:

@ -647,7 +647,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
nvme_swap_folder = os.path.join(offload_optimizer_config.nvme_path, 'zero_stage_3')
os.makedirs(nvme_swap_folder, exist_ok=True)
if dist.get_rank() == 0:
logger.info(f'Tensor Swapping: Adding optimizer tensors')
logger.info('Tensor Swapping: Adding optimizer tensors')

swapper_type = PipelinedOptimizerSwapper if offload_optimizer_config.pipeline else PartitionedOptimizerSwapper

@ -797,7 +797,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
largest_partition_numel = [t.ds_numel for t in sub_group]
max_partition_numel = total_elements

assert len(largest_partition_numel) > 0, f'Unexpected that largest partition is empty'
assert len(largest_partition_numel) > 0, 'Unexpected that largest partition is empty'
self.fp16_groups[0][0].nvme_swapper.reserve_partitioned_swap_space(largest_partition_numel)

def _get_parameter_partitions(self) -> List[Tensor]:

@ -1142,10 +1142,10 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):

@instrument_w_nvtx
def independent_gradient_partition_epilogue(self):
self.report_ipg_memory_usage(f"In ipg_epilogue before reduce_ipg_grads", 0)
self.report_ipg_memory_usage("In ipg_epilogue before reduce_ipg_grads", 0)
for comm_dtype in sort_dtypes(self.ipg_buckets.keys()):
self.__reduce_and_partition_ipg_grads(comm_dtype)
self.report_ipg_memory_usage(f"In ipg_epilogue after reduce_ipg_grads", 0)
self.report_ipg_memory_usage("In ipg_epilogue after reduce_ipg_grads", 0)

if not get_accelerator().resolves_data_dependency():
self.reduce_and_partition_stream.synchronize()

@ -1173,7 +1173,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
self.independent_gradient_partition_epilogue()

def create_reduce_and_remove_grad_hooks(self):
print_rank_0(f'[Begin] Create gradient reduction hooks')
print_rank_0('[Begin] Create gradient reduction hooks')
self.leaf_parameters = defaultdict(list)
for i, param_group in enumerate(self.fp16_groups):
for param in param_group:

@ -1256,7 +1256,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
self._leaf_module_hooks.append(leaf_module.register_forward_pre_hook(wrapper_pre_hook(leaf_parameters)))
self._leaf_module_hooks.append(leaf_module.register_forward_hook(wrapper_post_hook()))

print_rank_0(f'[End] Create gradient reduction hooks')
print_rank_0('[End] Create gradient reduction hooks')

def get_param_id(self, param):
return OptimizerSwapper.parameter_id(param)

@ -1426,7 +1426,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
self.grad_position[param_id] = [int(i), int(current_offset), int(num_elements)]
#print(f"param id {param_id} i:{i}, ds_tensor {num_elements} numel {param.numel()}")
current_offset += num_elements
see_memory_usage(f"After Set Grad positions", force=False)
see_memory_usage("After Set Grad positions", force=False)

def _constant_buffered_norm2(self, input, buffer_size=250000000):
norm = None

@ -1515,7 +1515,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
self.norm_for_param_grads[self.get_param_id(param)] = self._constant_buffered_norm2(grad_buffer)

if self._swappable_optimizer_subgroup(i):
if not i in offload_fp32_gradients.keys():
if i not in offload_fp32_gradients.keys():
offload_fp32_gradients[i] = []
offload_fp32_offsets[i] = []

@ -1560,7 +1560,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
"""
if not self.zero_quantized_nontrainable_weights:
print_rank_0(
f"Warning: quantize_nontrainable_params() called with zero_quantized_nontrainable_weights disabled, return without doing anything",
"Warning: quantize_nontrainable_params() called with zero_quantized_nontrainable_weights disabled, return without doing anything",
force=True)
return
quantizer_module = CUDAQuantizer()
@ -1881,8 +1881,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
def _pre_step(self):
self.micro_step_id = 0

print_rank_0(f"Inside Step function")
see_memory_usage(f"In step before checking overflow", force=False)
print_rank_0("Inside Step function")
see_memory_usage("In step before checking overflow", force=False)

print_rank_0("Finished Tracing at Beginning of Step")
self._get_param_coordinator().hierarchy = 0

@ -2084,7 +2084,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
self.timers.log(timer_names)

see_memory_usage('After zero_optimizer step', force=False)
print_rank_0(f"------------------Finishing Step-----------------------")
print_rank_0("------------------Finishing Step-----------------------")

@instrument_w_nvtx
def _reassign_or_swap_out_partitioned_parameters(self, sub_group_id):

@ -2296,7 +2296,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
if self.swap_optimizer:
self.optimizer_swapper.pre_backward()

see_memory_usage(f"Before backward", force=False)
see_memory_usage("Before backward", force=False)

if self.custom_loss_scaler:
scaled_loss = self.external_loss_scale * loss

@ -2486,7 +2486,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
if not param.requires_grad:
return

assert hasattr(param, "ds_tensor"), f" The parameter does not contain the partitioned copy of the tensor."
assert hasattr(param, "ds_tensor"), " The parameter does not contain the partitioned copy of the tensor."
assert value.numel() == param.ds_tensor.numel(
), f" Number of elements do not match: {value.numel()} != {param.ds_tensor.ds_numel}"

@ -2961,7 +2961,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):

self.empty_partition_cache()

assert self.optimizer.__class__ == deepspeed.ops.adam.fused_adam.FusedAdam, f"Offloading is supported only for DeepSpeed FusedAdam."
assert self.optimizer.__class__ == deepspeed.ops.adam.fused_adam.FusedAdam, "Offloading is supported only for DeepSpeed FusedAdam."

def needs_offload(target):
# return True

@ -602,10 +602,10 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
see_memory_usage("After initializing optimizer states", force=True)

if dist.get_rank() == 0:
logger.info(f"optimizer state initialized")
logger.info("optimizer state initialized")

if dist.get_rank(group=self.dp_process_group) == 0:
see_memory_usage(f"After initializing ZeRO optimizer", force=True)
see_memory_usage("After initializing ZeRO optimizer", force=True)

self._link_all_hp_params()
self._hp_optimizer_states_linked = False

@ -722,7 +722,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):

for i, tensor in enumerate(tensor_list):
j = i % num_partitions
if not j in partition_tensors:
if j not in partition_tensors:
partition_tensors[j] = []
partition_tensors[j].append((i, tensor))

@ -828,9 +828,9 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
i, param_group, partition_id)

def independent_gradient_partition_epilogue(self):
self.report_ipg_memory_usage(f"In ipg_epilogue before reduce_ipg_grads", 0)
self.report_ipg_memory_usage("In ipg_epilogue before reduce_ipg_grads", 0)
self.reduce_ipg_grads()
self.report_ipg_memory_usage(f"In ipg_epilogue after reduce_ipg_grads", 0)
self.report_ipg_memory_usage("In ipg_epilogue after reduce_ipg_grads", 0)

# if dist.get_rank() == 0:
# logger.info("Params already reduced %s", self.params_already_reduced)

@ -846,7 +846,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
if self.cpu_offload is False:
for i, _ in enumerate(self.bit16_groups):

if not i in self.averaged_gradients or self.averaged_gradients[i] is None:
if i not in self.averaged_gradients or self.averaged_gradients[i] is None:
self.averaged_gradients[i] = self.get_flat_partition(
self.params_in_partition[i],
self.first_offset[i],

@ -871,7 +871,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
# All gradients required by the step
# are in self.averaged_gradients
self.zero_grad(set_to_none=True)
see_memory_usage(f"End ipg_epilogue")
see_memory_usage("End ipg_epilogue")

# resets all partition to no reduced
# sets remaining grads to the total number of grads in each partition

@ -1958,7 +1958,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
"""
self.micro_step_id = INITIAL_MICRO_STEP_ID

see_memory_usage(f"In step before checking overflow")
see_memory_usage("In step before checking overflow")

# First compute norm for all group so we know if there is overflow
if self.check_grad_overflow:

@ -2448,7 +2448,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
self.clip_grad = sd.get(CLIP_GRAD, self.clip_grad)

ckpt_version = sd.get(DS_VERSION, False)
assert ckpt_version, f"Empty ds_version in checkpoint, not clear how to proceed"
assert ckpt_version, "Empty ds_version in checkpoint, not clear how to proceed"
ckpt_version = pkg_version.parse(ckpt_version)

# zero stage 1 mode
@ -263,7 +263,7 @@ def safe_get_local_grad(param):
Returns:
Union[torch.Tensor, None]: A tensor on accelerator device
"""
assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters'
assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters'
return param._z3_optimizer.get_local_fp32_grad_for_param(param)

@ -277,7 +277,7 @@ def safe_set_local_grad(param, value):
param (``torch.nn.Parameter``): A model parameter.
value (``torch.Tensor``): New value of local gradient partition.
"""
assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters'
assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters'
param._z3_optimizer.set_local_grad_for_param(value, param)

@ -290,7 +290,7 @@ def safe_get_local_fp32_param(param):
Returns:
Union[torch.Tensor, None]: A tensor on accelerator device
"""
assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters'
assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters'
return param._z3_optimizer.get_local_fp32_param(param)

@ -304,7 +304,7 @@ def safe_get_local_optimizer_state(param, optim_state_key):
Returns:
Union[torch.Tensor, None]: A tensor on accelerator device
"""
assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters'
assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters'
return param._z3_optimizer.get_local_fp32_param(param, optim_state_key)

@ -316,7 +316,7 @@ def safe_set_local_optimizer_state(param, value, optim_state_key):
value (``torch.Tensor``): New value of local optimizer state partition.
optim_state_key (``string``): Key value of optimizer state (e.g., `exp_avg` in Adam optimizer).
"""
assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters'
assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters'
param._z3_optimizer.set_local_hp_param(value, param, optim_state_key)

@ -327,7 +327,7 @@ def safe_set_local_fp32_param(param, value):
param (``torch.nn.Parameter``): A model parameter.
value (``torch.Tensor``): New value of local parameter partition.
"""
assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters'
assert hasattr(param, 'ds_id'), 'This API is only defined for ZeRO-3 partitioned parameters'
param._z3_optimizer.set_local_hp_param(value, param)

@ -142,7 +142,7 @@ class SynchronizedWallClockTimer:
def log(self, names, normalizer=1.0, reset=True, memory_breakdown=False, ranks=None):
"""Log a group of timers."""
assert normalizer > 0.0
string = f"time (ms)"
string = "time (ms)"
for name in names:
if name in self.timers:
elapsed_time = (self.timers[name].elapsed(reset=reset) / normalizer)
@ -155,7 +155,7 @@ def parse_optim_states(files, ds_checkpoint_dir):
state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
state_dicts.append(state_dict)

if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
raise ValueError(f"{files[0]} is not a zero checkpoint")
zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]

@ -709,10 +709,10 @@ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

"""
logger.info(f"Extracting fp32 weights")
logger.info("Extracting fp32 weights")
state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

logger.info(f"Overwriting model with fp32 weights")
logger.info("Overwriting model with fp32 weights")
model = model.cpu()
model.load_state_dict(state_dict, strict=False)

@ -50,7 +50,7 @@ class FPQuantizerBuilder(CUDAOpBuilder):
except ImportError:
if verbose:
self.warning(
f"please install triton==2.3.0, 2.3.1 or 3.0.0 if you want to use the FP Quantizer Kernels")
"please install triton==2.3.0, 2.3.1 or 3.0.0 if you want to use the FP Quantizer Kernels")
return False

# triton 2.3.{0,1} and 3.0.0 are ok.

@ -42,7 +42,7 @@ class SparseAttnBuilder(OpBuilder):
import torch
except ImportError:
if verbose:
self.warning(f"unable to import torch, please install it first")
self.warning("unable to import torch, please install it first")
return False

# torch-cpu will not have a cuda version

@ -70,7 +70,7 @@ class SparseAttnBuilder(OpBuilder):
# auto-install of triton is broken on some systems, reverting to manual install for now
# see this issue: https://github.com/deepspeedai/DeepSpeed/issues/1710
if verbose:
self.warning(f"please install triton==1.0.0 if you want to use sparse attention")
self.warning("please install triton==1.0.0 if you want to use sparse attention")
return False

if pkg_version:

@ -99,7 +99,7 @@ def benchmark():
with cuda_timer(baseline_bw):
ref_out.backward(d_out)

print(f"batch size\tours (FW)\tbaseline (FW)\tours (BW)\tbaseline (BW)")
print("batch size\tours (FW)\tbaseline (FW)\tours (BW)\tbaseline (BW)")
for i in range(len(ours_fw)):
print(f"{i+1}\t{ours_fw[i]}\t{baseline_fw[i]}\t{ours_bw[i]}\t{baseline_bw[i]}")
@ -15,9 +15,9 @@ import deepspeed
class VerboseLinear(torch.nn.Linear):

def __init__(self, **kwargs):
print(f'Begin VerboseLinear.__init__')
print('Begin VerboseLinear.__init__')
super().__init__(**kwargs)
print(f'End VerboseLinear.__init__')
print('End VerboseLinear.__init__')


class LinearStack(torch.nn.Module):

@ -24,7 +24,7 @@ from deepspeed.runtime.utils import is_model_parallel_parameter

def skip_on_device():
if get_accelerator().device_name() == 'xpu':
pytest.skip(f"XPU requires a higher version for test")
pytest.skip("XPU requires a higher version for test")


class SequentialLinearModel(torch.nn.Module):

@ -449,9 +449,9 @@ class TestSave(DistributedTest):
base_state_dict = base_model.state_dict()
if dist.get_rank() == 0:
# we should consider the case when zero3 is used in the future.
assert compare_state_dicts(base_state_dict, tp_state_dict), f"State_dict is not the same!"
assert compare_state_dicts(base_state_dict, tp_state_dict), "State_dict is not the same!"
else:
assert tp_state_dict is None, f"noly rank0 should have the state_dict"
assert tp_state_dict is None, "noly rank0 should have the state_dict"

def test_ckpt_save(self, tmpdir, tp_size: int, zero_stage: int):
skip_on_device()

@ -288,7 +288,7 @@ class TestExpertWeightGradWithZero(DistributedTest):
"""
rank = int(deepspeed.comm.get_rank())
ep_state_dict = dict()
dst_sub_key = f"deepspeed_moe.experts.deepspeed_experts.0"
dst_sub_key = "deepspeed_moe.experts.deepspeed_experts.0"
src_sub_key = f"deepspeed_moe.experts.deepspeed_experts.{rank}"
for moe_layer in ["moe_1", "moe_2"]:
for mlp_in_moe in [0, 1]:
@ -149,7 +149,7 @@ class TestOneBitAdamExpAvgMask(DistributedTest):
v["exp_avg"],
v["exp_avg"].mul_(mask1.to(device=v["exp_avg"].device)),
atol=1e-07,
), f"Momentum mask is not working properly"
), "Momentum mask is not working properly"


class TestOneBitAdamCheckpointing(DistributedTest):

@ -241,11 +241,11 @@ class TestOneBitAdamCheckpointing(DistributedTest):
assert optimizer_1.optimizer.adam_freeze_key is True
mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device)
assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1,
atol=1e-07), f"Incorrect momentum mask"
atol=1e-07), "Incorrect momentum mask"
save_folder = os.path.join(tmpdir, "saved_checkpoint")
model_1.save_checkpoint(save_folder, tag=None)
assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1,
atol=1e-07), f"Momentum mask should not change after saving checkpoint"
atol=1e-07), "Momentum mask should not change after saving checkpoint"

model_2, optimizer_2, _, _ = deepspeed.initialize(
config=config_dict,

@ -255,7 +255,7 @@ class TestOneBitAdamCheckpointing(DistributedTest):
# Test whether momentum mask stays the same after loading checkpoint
mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device)
assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2,
atol=1e-07), f"Incorrect momentum mask"
atol=1e-07), "Incorrect momentum mask"
model_2.load_checkpoint(
save_folder,
tag=None,

@ -263,11 +263,11 @@ class TestOneBitAdamCheckpointing(DistributedTest):
load_lr_scheduler_states=True,
)
assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2,
atol=1e-07), f"Momentum mask should not change after loading checkpoint"
atol=1e-07), "Momentum mask should not change after loading checkpoint"
# Test whether worker&server error is reset
for v in optimizer_2.state.values():
assert "worker_error" not in v, f"Incorrect worker error"
assert "server_error" not in v, f"Incorrect server error"
assert "worker_error" not in v, "Incorrect worker error"
assert "server_error" not in v, "Incorrect server error"
assert optimizer_2.optimizer.adam_freeze_key is True

model_3, optimizer_3, _, _ = deepspeed.initialize(

@ -287,7 +287,7 @@ class TestOneBitAdamCheckpointing(DistributedTest):
model_3.step()
assert optimizer_3.optimizer.adam_freeze_key is True
# Test whether momentum mask stays the same after loading checkpoint
assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), f"Incorrect momentum mask"
assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), "Incorrect momentum mask"
model_3.load_checkpoint(
save_folder,
tag=None,

@ -295,11 +295,11 @@ class TestOneBitAdamCheckpointing(DistributedTest):
load_lr_scheduler_states=True,
)
assert ("exp_avg_mask"
not in optimizer_3.param_groups[0]), f"Momentum mask should not change after loading checkpoint"
not in optimizer_3.param_groups[0]), "Momentum mask should not change after loading checkpoint"
# Test whether worker&server error is reset
for v in optimizer_3.state.values():
assert "worker_error" not in v, f"Incorrect worker error"
assert "server_error" not in v, f"Incorrect server error"
assert "worker_error" not in v, "Incorrect worker error"
assert "server_error" not in v, "Incorrect server error"
assert optimizer_3.optimizer.adam_freeze_key is False

def test_overflow(self, tmpdir):

@ -518,7 +518,7 @@ class TestZeroOneAdamExpAvgMask(DistributedTest):
v["exp_avg"],
v["exp_avg"].mul_(mask1.to(device=v["exp_avg"].device)),
atol=1e-07,
), f"Momentum mask is not working properly"
), "Momentum mask is not working properly"


class TestZeroOneAdamCheckpointing(DistributedTest):
@ -614,11 +614,11 @@ class TestZeroOneAdamCheckpointing(DistributedTest):
# Test whether momentum mask still exist after saving checkpoint
mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device)
assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1,
atol=1e-07), f"Incorrect momentum mask"
atol=1e-07), "Incorrect momentum mask"
save_folder = os.path.join(tmpdir, "saved_checkpoint")
model_1.save_checkpoint(save_folder, tag=None)
assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1,
atol=1e-07), f"Momentum mask should not change after saving checkpoint"
atol=1e-07), "Momentum mask should not change after saving checkpoint"

model_2, optimizer_2, _, _ = deepspeed.initialize(
config=config_dict,

@ -628,7 +628,7 @@ class TestZeroOneAdamCheckpointing(DistributedTest):
# Test whether momentum mask stays the same after loading checkpoint
mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device)
assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2,
atol=1e-07), f"Incorrect momentum mask"
atol=1e-07), "Incorrect momentum mask"
model_2.load_checkpoint(
save_folder,
tag=None,

@ -636,11 +636,11 @@ class TestZeroOneAdamCheckpointing(DistributedTest):
load_lr_scheduler_states=True,
)
assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2,
atol=1e-07), f"Momentum mask should not change after loading checkpoint"
atol=1e-07), "Momentum mask should not change after loading checkpoint"
# Test whether worker&server error is reset
for v in optimizer_2.state.values():
assert "worker_error" not in v, f"Incorrect worker error"
assert "server_error" not in v, f"Incorrect server error"
assert "worker_error" not in v, "Incorrect worker error"
assert "server_error" not in v, "Incorrect server error"

model_3, optimizer_3, _, _ = deepspeed.initialize(
config=config_dict,

@ -658,7 +658,7 @@ class TestZeroOneAdamCheckpointing(DistributedTest):
model_3.backward(loss)
model_3.step()
# Test whether momentum mask stays the same after loading checkpoint
assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), f"Incorrect momentum mask"
assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), "Incorrect momentum mask"
model_3.load_checkpoint(
save_folder,
tag=None,

@ -666,11 +666,11 @@ class TestZeroOneAdamCheckpointing(DistributedTest):
load_lr_scheduler_states=True,
)
assert ("exp_avg_mask"
not in optimizer_3.param_groups[0]), f"Momentum mask should not change after loading checkpoint"
not in optimizer_3.param_groups[0]), "Momentum mask should not change after loading checkpoint"
# Test whether worker&server error is reset
for v in optimizer_3.state.values():
assert "worker_error" not in v, f"Incorrect worker error"
assert "server_error" not in v, f"Incorrect server error"
assert "worker_error" not in v, "Incorrect worker error"
assert "server_error" not in v, "Incorrect server error"

def test_overflow(self, tmpdir):
if not get_accelerator().is_fp16_supported():

@ -899,7 +899,7 @@ class TestOneBitLampExpAvgMask(DistributedTest):
v["exp_avg"],
v["exp_avg"].mul_(mask1.to(device=v["exp_avg"].device)),
atol=1e-07,
), f"Momentum mask is not working properly"
), "Momentum mask is not working properly"


class TestOneBitLambCheckpointing(DistributedTest):
@ -997,15 +997,15 @@ class TestOneBitLambCheckpointing(DistributedTest):
assert optimizer_1.optimizer.lamb_freeze_key is True
mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device)
assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1,
atol=1e-07), f"Incorrect momentum mask"
atol=1e-07), "Incorrect momentum mask"
scaling_coeff_1 = []
for v in optimizer_1.state.values():
assert "scaling_coeff" in v, f"Incorrect scaling_coeff"
assert "scaling_coeff" in v, "Incorrect scaling_coeff"
scaling_coeff_1.append(v["scaling_coeff"])
save_folder = os.path.join(tmpdir, "saved_checkpoint")
model_1.save_checkpoint(save_folder, tag=None)
assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1,
atol=1e-07), f"Momentum mask should not change after saving checkpoint"
atol=1e-07), "Momentum mask should not change after saving checkpoint"

model_2, optimizer_2, _, _ = deepspeed.initialize(
config=config_dict,

@ -1015,7 +1015,7 @@ class TestOneBitLambCheckpointing(DistributedTest):
# Test whether momentum mask stays the same after loading checkpoint
mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device)
assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2,
atol=1e-07), f"Incorrect momentum mask"
atol=1e-07), "Incorrect momentum mask"
model_2.load_checkpoint(
save_folder,
tag=None,

@ -1023,16 +1023,16 @@ class TestOneBitLambCheckpointing(DistributedTest):
load_lr_scheduler_states=True,
)
assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2,
atol=1e-07), f"Momentum mask should not change after loading checkpoint"
atol=1e-07), "Momentum mask should not change after loading checkpoint"
# Test whether worker&server error is reset
assert len(optimizer_2.optimizer.worker_errors) == 0, f"Incorrect worker error"
assert len(optimizer_2.optimizer.server_errors) == 0, f"Incorrect server error"
assert len(optimizer_2.optimizer.worker_errors) == 0, "Incorrect worker error"
assert len(optimizer_2.optimizer.server_errors) == 0, "Incorrect server error"
# Test whether scaling_coeffs is loaded correctly
scaling_coeff_2 = []
for v in optimizer_2.state.values():
assert "scaling_coeff" in v, f"Incorrect scaling_coeff"
assert "scaling_coeff" in v, "Incorrect scaling_coeff"
scaling_coeff_2.append(v["scaling_coeff"])
assert list(sorted(scaling_coeff_2)) == list(sorted(scaling_coeff_1)), f"Incorrect scaling_coeffs"
assert list(sorted(scaling_coeff_2)) == list(sorted(scaling_coeff_1)), "Incorrect scaling_coeffs"
assert optimizer_2.optimizer.lamb_freeze_key is True

model_3, optimizer_3, _, _ = deepspeed.initialize(

@ -1052,7 +1052,7 @@ class TestOneBitLambCheckpointing(DistributedTest):
model_3.step()
assert optimizer_3.optimizer.lamb_freeze_key is True
# Test whether momentum mask stays the same after loading checkpoint
assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), f"Incorrect momentum mask"
assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), "Incorrect momentum mask"
model_3.load_checkpoint(
save_folder,
tag=None,

@ -1060,15 +1060,15 @@ class TestOneBitLambCheckpointing(DistributedTest):
load_lr_scheduler_states=True,
)
assert ("exp_avg_mask"
not in optimizer_3.param_groups[0]), f"Momentum mask should not change after loading checkpoint"
not in optimizer_3.param_groups[0]), "Momentum mask should not change after loading checkpoint"
# Test whether worker&server error is reset
assert len(optimizer_3.optimizer.worker_errors) == 0, f"Incorrect worker error"
assert len(optimizer_3.optimizer.server_errors) == 0, f"Incorrect server error"
assert len(optimizer_3.optimizer.worker_errors) == 0, "Incorrect worker error"
assert len(optimizer_3.optimizer.server_errors) == 0, "Incorrect server error"
# Test whether scaling_coeffs, lamb_coeff_freeze, last_factor are reset
for v in optimizer_3.state.values():
assert v["lamb_coeff_freeze"] == 0.0, f"Incorrect lamb_coeff_freeze"
assert v["last_factor"] == 1.0, f"Incorrect last_factor"
assert "scaling_coeff" not in v, f"Incorrect scaling_coeff"
assert v["lamb_coeff_freeze"] == 0.0, "Incorrect lamb_coeff_freeze"
assert v["last_factor"] == 1.0, "Incorrect last_factor"
assert "scaling_coeff" not in v, "Incorrect scaling_coeff"
assert optimizer_3.optimizer.lamb_freeze_key is False

def test_overflow(self, tmpdir):
@ -236,4 +236,4 @@ class TestLegacyCurriculumScheduler(DistributedTest):
model.step()
if n + 1 in ground_truths:
true_seqlen = ground_truths[n + 1]
assert seqlen == true_seqlen, f"Incorrect curriculum schedule"
assert seqlen == true_seqlen, "Incorrect curriculum schedule"

@ -49,7 +49,7 @@ class TestTwoOutputModel(DistributedTest):
targets=[1, 2])
for n, batch in enumerate(data_loader):
assert len(batch) % 2 == 0, \
f"multi_output_dataloader failed to return even number of data samples (input+target)"
"multi_output_dataloader failed to return even number of data samples (input+target)"

midpoint = len(batch) // 2
inputs, targets = batch[:midpoint], batch[midpoint:]

@ -107,7 +107,7 @@ class TestThreeOutputModel(DistributedTest):
targets=[1, 2, 3])
for n, batch in enumerate(data_loader):
assert len(batch) % 2 == 0, \
f"multi_output_dataloader failed to return even number of data samples (input+target)"
"multi_output_dataloader failed to return even number of data samples (input+target)"

midpoint = len(batch) // 2
inputs, targets = batch[:midpoint], batch[midpoint:]

@ -62,14 +62,14 @@ def run_model(model, param_groups, config_dict, hidden_dim, dtype, offloaded_sta
pin_memory=pin_memory,
non_blocking=non_blocking)
alloc_after_offload = get_accelerator().memory_allocated()
assert alloc_after_offload < alloc_before_offload, f"Allocated memory should decrease after offload"
assert alloc_after_offload < alloc_before_offload, "Allocated memory should decrease after offload"

validate_device(model, torch.device(offload_device.value), offloaded_states)

# Reload states
model.reload_states()
assert alloc_after_offload < get_accelerator().memory_allocated(
), f"Allocated memory should increase after offload back"
), "Allocated memory should increase after offload back"

# Verify restored states
hp_param_restored = [safe_get_local_fp32_param(p) for p in model.parameters()]

@ -255,7 +255,7 @@ class TestZ3LeafOptimization(DistributedTest):
loss, duration = bench_loss_and_time(config_dict)

if dist.get_rank() == 0:
print(f"baseline exec time:", baseline_exec_time)
print("baseline exec time:", baseline_exec_time)
print(
f"finegrained optimziation exec time: {duration},granularity threshold:{module_granularity_threshold} "
)
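
The hunks above apply two mechanical rewrites: f-strings that contain no placeholders become plain string literals, and negated membership tests written as `not x in y` become `x not in y`. A minimal illustrative sketch of both patterns follows; it is not part of the diff, and the linter rule codes in the comments (F541, E713) are cited from memory rather than from this change.

# Illustrative sketch only: the two rewrite patterns shown in the hunks above.
settings = {"lr": 0.1}

# Pattern 1: an f-string with no placeholders (commonly flagged as F541) is
# replaced by an ordinary string literal; the resulting text is identical.
msg = f"optimizer state initialized"  # before
msg = "optimizer state initialized"   # after

# Pattern 2: `not x in y` (commonly flagged as E713) is rewritten to the
# equivalent, idiomatic `x not in y`.
if not "momentum" in settings:  # before
    settings["momentum"] = 0.0
if "momentum" not in settings:  # after
    settings["momentum"] = 0.0

print(msg, settings)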