Remove local_rank arg from TrainingArguments (#41382)

Author: Marc Sun
Date: 2025-10-09 18:54:12 +02:00
Committed by: GitHub
Parent: 081391b20e
Commit: 0419ff881d
36 changed files with 63 additions and 72 deletions
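In short: the examples, tests, and Trainer internals stop reading `training_args.local_rank` and instead use `training_args.local_process_index` and `training_args.parallel_mode`, which `TrainingArguments` already derives from Accelerate's state. A minimal migration sketch for downstream code, not part of the diff; the `output_dir` value is a placeholder:

# Migration sketch (assumption: run as a plain single-process script):
# replacements for the removed TrainingArguments.local_rank field,
# using the attributes this commit switches the examples to.
from transformers import TrainingArguments
from transformers.trainer_utils import is_main_process
from transformers.training_args import ParallelMode

args = TrainingArguments(output_dir="out")  # "out" is a placeholder

rank = args.local_process_index                                  # was: args.local_rank
is_distributed = args.parallel_mode == ParallelMode.DISTRIBUTED  # was: args.local_rank != -1

if is_main_process(args.local_process_index):                    # was: is_main_process(args.local_rank)
    print(f"rank={rank}, distributed={is_distributed}")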

View File

@@ -99,18 +99,18 @@ def main():
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
         datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
+        level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN,
     )
     logger.warning(
         "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        training_args.local_rank,
+        training_args.local_process_index,
         training_args.device,
         training_args.n_gpu,
-        bool(training_args.local_rank != -1),
+        bool(training_args.parallel_mode.value == "distributed"),
         training_args.fp16,
     )
     # Set the verbosity to info of the Transformers logger (on main process only):
-    if is_main_process(training_args.local_rank):
+    if is_main_process(training_args.local_process_index):
         transformers.utils.logging.set_verbosity_info()
         transformers.utils.logging.enable_default_handler()
         transformers.utils.logging.enable_explicit_format()

View File

@@ -80,18 +80,18 @@ def main():
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
         datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
+        level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN,
     )
     logger.warning(
         "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        training_args.local_rank,
+        training_args.local_process_index,
         training_args.device,
         training_args.n_gpu,
-        bool(training_args.local_rank != -1),
+        bool(training_args.parallel_mode.value == "distributed"),
         training_args.fp16,
     )
     # Set the verbosity to info of the Transformers logger (on main process only):
-    if is_main_process(training_args.local_rank):
+    if is_main_process(training_args.local_process_index):
         transformers.utils.logging.set_verbosity_info()
         transformers.utils.logging.enable_default_handler()
         transformers.utils.logging.enable_explicit_format()

View File

@@ -212,18 +212,18 @@ def main():
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
         datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
+        level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN,
     )
     logger.warning(
         "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        training_args.local_rank,
+        training_args.local_process_index,
         training_args.device,
         training_args.n_gpu,
-        bool(training_args.local_rank != -1),
+        bool(training_args.parallel_mode.value == "distributed"),
         training_args.fp16,
     )
     # Set the verbosity to info of the Transformers logger (on main process only):
-    if is_main_process(training_args.local_rank):
+    if is_main_process(training_args.local_process_index):
         transformers.utils.logging.set_verbosity_info()
         transformers.utils.logging.enable_default_handler()
         transformers.utils.logging.enable_explicit_format()

View File

@@ -171,11 +171,11 @@ def main():
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
         datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
+        level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN,
     )
     logger.warning(
         "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        training_args.local_rank,
+        training_args.local_process_index,
         training_args.device,
         training_args.n_gpu,
         bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED),
@@ -184,7 +184,7 @@ def main():
     transformers.utils.logging.enable_default_handler()
     transformers.utils.logging.enable_explicit_format()
     # Set the verbosity to info of the Transformers logger (on main process only):
-    if is_main_process(training_args.local_rank):
+    if is_main_process(training_args.local_process_index):
         transformers.utils.logging.set_verbosity_info()
     logger.info("Training/evaluation parameters %s", training_args)

View File

@@ -125,18 +125,18 @@ def main():
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
         datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
+        level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN,
     )
     logger.warning(
         "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        training_args.local_rank,
+        training_args.local_process_index,
         training_args.device,
         training_args.n_gpu,
-        bool(training_args.local_rank != -1),
+        bool(training_args.parallel_mode.value == "distributed"),
         training_args.fp16,
     )
     # Set the verbosity to info of the Transformers logger (on main process only):
-    if is_main_process(training_args.local_rank):
+    if is_main_process(training_args.local_process_index):
         transformers.utils.logging.set_verbosity_info()
         transformers.utils.logging.enable_default_handler()
         transformers.utils.logging.enable_explicit_format()

View File

@@ -236,7 +236,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -265,7 +265,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -219,7 +219,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -211,7 +211,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -275,7 +275,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -357,7 +357,7 @@ def main():
     # Setup logging and log on each process the small summary:
     setup_logging(training_args)
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -311,7 +311,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -338,7 +338,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -283,7 +283,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     # Set the verbosity to info of the Transformers logger (on main process only):

View File

@@ -263,7 +263,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -207,7 +207,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -367,7 +367,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -256,7 +256,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -254,7 +254,7 @@ def main():
     # Log on each process the small summary:
    logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -302,7 +302,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -215,7 +215,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -435,15 +435,15 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
+    logger.setLevel(logging.INFO if is_main_process(training_args.local_process_index) else logging.WARN)
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     # Set the verbosity to info of the Transformers logger (on main process only):
-    if is_main_process(training_args.local_rank):
+    if is_main_process(training_args.local_process_index):
         transformers.utils.logging.set_verbosity_info()
     logger.info("Training/evaluation parameters %s", training_args)
@@ -726,7 +726,7 @@ def main():
     # make sure all processes wait until data is saved
     with training_args.main_process_first():
         # only the main process saves them
-        if is_main_process(training_args.local_rank):
+        if is_main_process(training_args.local_process_index):
             # save feature extractor, tokenizer and config
             feature_extractor.save_pretrained(training_args.output_dir)
             tokenizer.save_pretrained(training_args.output_dir)

View File

@@ -412,15 +412,15 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
+    logger.setLevel(logging.INFO if is_main_process(training_args.local_process_index) else logging.WARN)
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     # Set the verbosity to info of the Transformers logger (on main process only):
-    if is_main_process(training_args.local_rank):
+    if is_main_process(training_args.local_process_index):
         transformers.utils.logging.set_verbosity_info()
     logger.info("Training/evaluation parameters %s", training_args)
@@ -721,7 +721,7 @@ def main():
     # make sure all processes wait until data is saved
     with training_args.main_process_first():
         # only the main process saves them
-        if is_main_process(training_args.local_rank):
+        if is_main_process(training_args.local_process_index):
             # save feature extractor, tokenizer and config
             feature_extractor.save_pretrained(training_args.output_dir)
             tokenizer.save_pretrained(training_args.output_dir)

View File

@@ -326,17 +326,17 @@ def main():
     transformers.utils.logging.enable_default_handler()
     transformers.utils.logging.enable_explicit_format()
-    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
+    logger.setLevel(logging.INFO if is_main_process(training_args.local_process_index) else logging.WARN)
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")
     # Set the verbosity to info of the Transformers logger (on main process only):
-    if is_main_process(training_args.local_rank):
+    if is_main_process(training_args.local_process_index):
         transformers.utils.logging.set_verbosity_info()
     logger.info("Training/evaluation parameters %s", training_args)
@@ -557,7 +557,7 @@ def main():
     # make sure all processes wait until data is saved
     with training_args.main_process_first():
         # only the main process saves them
-        if is_main_process(training_args.local_rank):
+        if is_main_process(training_args.local_process_index):
             # save feature extractor, tokenizer and config
             feature_extractor.save_pretrained(training_args.output_dir)
             tokenizer.save_pretrained(training_args.output_dir)

View File

@@ -356,7 +356,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -315,7 +315,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -260,7 +260,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -218,7 +218,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -257,7 +257,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -304,7 +304,7 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -4514,9 +4514,7 @@ class Trainer:
             tensors = nested_xla_mesh_reduce(tensors, name)
         elif is_sagemaker_mp_enabled():
             tensors = smp_gather(tensors)
-        elif (self.args.distributed_state is not None and self.args.distributed_state.distributed_type != "NO") or (
-            self.args.distributed_state is None and self.args.local_rank != -1
-        ):
+        elif self.args.parallel_mode == ParallelMode.DISTRIBUTED:
             tensors = distributed_concat(tensors)
         return tensors

View File

@@ -383,8 +383,6 @@ class TrainingArguments:
             on PyTorch's version default of `torch.backends.cuda.matmul.allow_tf32`. For more details please refer to
             the [TF32](https://huggingface.co/docs/transformers/perf_train_gpu_one#tf32) documentation. This is an
             experimental API and it may change.
-        local_rank (`int`, *optional*, defaults to -1):
-            Rank of the process during distributed training.
         ddp_backend (`str`, *optional*):
             The backend to use for distributed training. Must be one of `"nccl"`, `"mpi"`, `"ccl"`, `"gloo"`, `"hccl"`.
         tpu_num_cores (`int`, *optional*):
@@ -998,7 +996,6 @@ class TrainingArguments:
             )
         },
     )
-    local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})
     ddp_backend: Optional[str] = field(
         default=None,
         metadata={
@@ -1900,8 +1897,7 @@ class TrainingArguments:
             self._n_gpu = 0
         elif is_sagemaker_mp_enabled():
             accelerator_state_kwargs["enabled"] = False
-            local_rank = smp.local_rank()
-            device = torch.device("cuda", local_rank)
+            device = torch.device("cuda", smp.local_rank())
             torch.cuda.set_device(device)
         elif is_sagemaker_dp_enabled():
             accelerator_state_kwargs["_use_sagemaker_dp"] = True
@@ -1925,7 +1921,6 @@ class TrainingArguments:
             del os.environ["ACCELERATE_USE_DEEPSPEED"]
         if not is_sagemaker_mp_enabled():
             device = self.distributed_state.device
-            self.local_rank = self.distributed_state.local_process_index
             if dist.is_available() and dist.is_initialized() and self.parallel_mode != ParallelMode.DISTRIBUTED:
                 logger.warning(
                     "torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. "
@@ -2015,9 +2010,7 @@ class TrainingArguments:
             return ParallelMode.SAGEMAKER_MODEL_PARALLEL
         elif is_sagemaker_dp_enabled():
             return ParallelMode.SAGEMAKER_DATA_PARALLEL
-        elif (
-            self.distributed_state is not None and self.distributed_state.distributed_type != DistributedType.NO
-        ) or (self.distributed_state is None and self.local_rank != -1):
+        elif self.distributed_state is not None and self.distributed_state.distributed_type != DistributedType.NO:
             return ParallelMode.DISTRIBUTED
         elif self.n_gpu > 1:
             return ParallelMode.NOT_DISTRIBUTED
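The removals above all lean on the same source of truth: the Accelerate `PartialState` that `TrainingArguments` keeps as `distributed_state`. A rough sketch of that dependency, not part of the diff; it assumes `accelerate` is installed and the prints are illustrative only:

# Sketch: the Accelerate state that now backs parallel_mode and
# local_process_index once the local_rank field is removed.
from accelerate import PartialState
from accelerate.utils import DistributedType

state = PartialState()  # the kind of object TrainingArguments stores as distributed_state
print(state.local_process_index)                     # stands in for the old local_rank value
print(state.distributed_type != DistributedType.NO)  # stands in for the old `local_rank != -1` test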

View File

@@ -235,8 +235,8 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
-        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        + f"distributed training: {bool(training_args.parallel_mode.value == 'distributed')}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Training/evaluation parameters {training_args}")

View File

@@ -207,8 +207,8 @@ def main():
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
-        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        + f"distributed training: {bool(training_args.parallel_mode.value == 'distributed')}, 16-bits training: {training_args.fp16}"
     )
     # Set the verbosity to info of the Transformers logger (on main process only):
     if training_args.should_log:

View File

@@ -140,7 +140,7 @@ if __name__ == "__main__":
     training_args = parser.parse_args_into_dataclasses()[0]
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
         f"distributed training: {training_args.parallel_mode != ParallelMode.NOT_DISTRIBUTED}"
     )
@@ -152,7 +152,7 @@ if __name__ == "__main__":
         def compute_metrics(p: EvalPrediction) -> dict:
             sequential = list(range(len(dataset)))
             success = p.predictions.tolist() == sequential and p.label_ids.tolist() == sequential
-            if not success and training_args.local_rank == 0:
+            if not success and training_args.local_process_index == 0:
                 logger.warning(
                     "Predictions and/or labels do not match expected results:\n - predictions: "
                     f"{p.predictions.tolist()}\n - labels: {p.label_ids.tolist()}\n - expected: {sequential}"

View File

@@ -68,7 +68,7 @@ def main():
     training_args = parser.parse_args_into_dataclasses()[0]
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, "
+        f"Process rank: {training_args.local_process_index}, device: {training_args.device}, "
         f"tpu_num_cores: {training_args.tpu_num_cores}",
     )