From 0419ff881d7bb503f4fc0f0a7a5aac3d012c9b91 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Thu, 9 Oct 2025 18:54:12 +0200 Subject: [PATCH] Remove `local_rank` arg from `TrainingArguments` (#41382) --- .../legacy/multiple_choice/run_multiple_choice.py | 8 ++++---- .../legacy/question-answering/run_squad_trainer.py | 8 ++++---- examples/legacy/run_language_modeling.py | 8 ++++---- examples/legacy/seq2seq/finetune_trainer.py | 6 +++--- examples/legacy/token-classification/run_ner.py | 8 ++++---- .../audio-classification/run_audio_classification.py | 2 +- examples/pytorch/contrastive-image-text/run_clip.py | 2 +- .../image-classification/run_image_classification.py | 2 +- examples/pytorch/image-pretraining/run_mae.py | 2 +- examples/pytorch/image-pretraining/run_mim.py | 2 +- .../run_instance_segmentation.py | 2 +- examples/pytorch/language-modeling/run_clm.py | 2 +- examples/pytorch/language-modeling/run_fim.py | 2 +- examples/pytorch/language-modeling/run_mlm.py | 2 +- examples/pytorch/language-modeling/run_plm.py | 2 +- examples/pytorch/multiple-choice/run_swag.py | 2 +- .../pytorch/object-detection/run_object_detection.py | 2 +- examples/pytorch/question-answering/run_qa.py | 2 +- .../pytorch/question-answering/run_qa_beam_search.py | 2 +- examples/pytorch/question-answering/run_seq2seq_qa.py | 2 +- .../run_semantic_segmentation.py | 2 +- .../speech-recognition/run_speech_recognition_ctc.py | 8 ++++---- .../run_speech_recognition_ctc_adapter.py | 8 ++++---- .../run_speech_recognition_seq2seq.py | 8 ++++---- examples/pytorch/summarization/run_summarization.py | 2 +- .../pytorch/text-classification/run_classification.py | 2 +- examples/pytorch/text-classification/run_glue.py | 2 +- examples/pytorch/text-classification/run_xnli.py | 2 +- examples/pytorch/token-classification/run_ner.py | 2 +- examples/pytorch/translation/run_translation.py | 2 +- src/transformers/trainer.py | 4 +--- src/transformers/training_args.py | 11 ++--------- .../run_{{cookiecutter.example_shortcut}}.py | 4 ++-- .../scripts/pytorch/run_glue_model_parallelism.py | 4 ++-- tests/trainer/test_trainer_distributed.py | 4 ++-- tests/trainer/test_trainer_tpu.py | 2 +- 36 files changed, 63 insertions(+), 72 deletions(-) diff --git a/examples/legacy/multiple_choice/run_multiple_choice.py b/examples/legacy/multiple_choice/run_multiple_choice.py index 92947e2092c..0b33e47c03c 100644 --- a/examples/legacy/multiple_choice/run_multiple_choice.py +++ b/examples/legacy/multiple_choice/run_multiple_choice.py @@ -99,18 +99,18 @@ def main(): logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, + level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - training_args.local_rank, + training_args.local_process_index, training_args.device, training_args.n_gpu, - bool(training_args.local_rank != -1), + bool(training_args.parallel_mode.value == "distributed"), training_args.fp16, ) # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): + if is_main_process(training_args.local_process_index): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() diff --git 
a/examples/legacy/question-answering/run_squad_trainer.py b/examples/legacy/question-answering/run_squad_trainer.py index 5288e3019b9..9ec8d4d852c 100644 --- a/examples/legacy/question-answering/run_squad_trainer.py +++ b/examples/legacy/question-answering/run_squad_trainer.py @@ -80,18 +80,18 @@ def main(): logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, + level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - training_args.local_rank, + training_args.local_process_index, training_args.device, training_args.n_gpu, - bool(training_args.local_rank != -1), + bool(training_args.parallel_mode.value == "distributed"), training_args.fp16, ) # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): + if is_main_process(training_args.local_process_index): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() diff --git a/examples/legacy/run_language_modeling.py b/examples/legacy/run_language_modeling.py index 64c92fa205e..db1db37d983 100755 --- a/examples/legacy/run_language_modeling.py +++ b/examples/legacy/run_language_modeling.py @@ -212,18 +212,18 @@ def main(): logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, + level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - training_args.local_rank, + training_args.local_process_index, training_args.device, training_args.n_gpu, - bool(training_args.local_rank != -1), + bool(training_args.parallel_mode.value == "distributed"), training_args.fp16, ) # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): + if is_main_process(training_args.local_process_index): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() diff --git a/examples/legacy/seq2seq/finetune_trainer.py b/examples/legacy/seq2seq/finetune_trainer.py index 54ca2c898c8..a39380bbab1 100755 --- a/examples/legacy/seq2seq/finetune_trainer.py +++ b/examples/legacy/seq2seq/finetune_trainer.py @@ -171,11 +171,11 @@ def main(): logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, + level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - training_args.local_rank, + training_args.local_process_index, training_args.device, training_args.n_gpu, bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED), @@ -184,7 +184,7 @@ def main(): transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Set the verbosity to info of the Transformers logger (on main process only): - if 
is_main_process(training_args.local_rank): + if is_main_process(training_args.local_process_index): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) diff --git a/examples/legacy/token-classification/run_ner.py b/examples/legacy/token-classification/run_ner.py index 1e6e5e40231..c73b4f99d9e 100644 --- a/examples/legacy/token-classification/run_ner.py +++ b/examples/legacy/token-classification/run_ner.py @@ -125,18 +125,18 @@ def main(): logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, + level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - training_args.local_rank, + training_args.local_process_index, training_args.device, training_args.n_gpu, - bool(training_args.local_rank != -1), + bool(training_args.parallel_mode.value == "distributed"), training_args.fp16, ) # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): + if is_main_process(training_args.local_process_index): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index 3abd6e158d3..635a947bbde 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -236,7 +236,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index eac0977a212..05fa924d62b 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -265,7 +265,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 04107ffdfb3..78e4ba22b58 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -219,7 +219,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: 
{training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index ea350a855fe..f3313b4b42f 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -211,7 +211,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index c0b86d009ba..f9c874b7578 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -275,7 +275,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation.py b/examples/pytorch/instance-segmentation/run_instance_segmentation.py index 8e6342a3160..e2bc4a289eb 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py @@ -357,7 +357,7 @@ def main(): # Setup logging and log on each process the small summary: setup_logging(training_args) logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 986f873870d..53fa7f13460 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -311,7 +311,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git 
a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index 14457275811..828958e7756 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -338,7 +338,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 80ad49bef98..027905ac420 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -283,7 +283,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index f81fd37d5e4..c9d038d4e51 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -263,7 +263,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index d1f785d51ca..30786a7af7f 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -207,7 +207,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index e2018bf9722..7624c122f2d 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -367,7 +367,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: 
{training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index ab5bf3267b7..92d2cfd1949 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -256,7 +256,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index 882f4fa9b27..68924ba9f06 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -254,7 +254,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index 10827108b93..1a2793d58d4 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -302,7 +302,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index 6fd83df7aa8..6c773a5115f 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -215,7 +215,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py 
b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index f94105cd1d1..0edd243433f 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -435,15 +435,15 @@ def main(): datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) + logger.setLevel(logging.INFO if is_main_process(training_args.local_process_index) else logging.WARN) # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): + if is_main_process(training_args.local_process_index): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) @@ -726,7 +726,7 @@ def main(): # make sure all processes wait until data is saved with training_args.main_process_first(): # only the main process saves them - if is_main_process(training_args.local_rank): + if is_main_process(training_args.local_process_index): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.output_dir) tokenizer.save_pretrained(training_args.output_dir) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py index adfe91a31d5..0e985dfe70b 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py @@ -412,15 +412,15 @@ def main(): datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) + logger.setLevel(logging.INFO if is_main_process(training_args.local_process_index) else logging.WARN) # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): + if is_main_process(training_args.local_process_index): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) @@ -721,7 +721,7 @@ def main(): # make sure all processes wait until data is saved with training_args.main_process_first(): # only the main process saves them - if is_main_process(training_args.local_rank): + if is_main_process(training_args.local_process_index): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.output_dir) tokenizer.save_pretrained(training_args.output_dir) diff --git 
a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index 7e124d4326f..70ca37461ba 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -326,17 +326,17 @@ def main(): transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) + logger.setLevel(logging.INFO if is_main_process(training_args.local_process_index) else logging.WARN) # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): + if is_main_process(training_args.local_process_index): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) @@ -557,7 +557,7 @@ def main(): # make sure all processes wait until data is saved with training_args.main_process_first(): # only the main process saves them - if is_main_process(training_args.local_rank): + if is_main_process(training_args.local_process_index): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.output_dir) tokenizer.save_pretrained(training_args.output_dir) diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index c27f5dc8221..ba6d34cd12c 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -356,7 +356,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index 78a9d009491..cf1c03af605 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -315,7 +315,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py 
index 0272eef968f..804788b30f9 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -260,7 +260,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index b64ac29552f..ac3be437ecd 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -218,7 +218,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index f21ea058d07..60548096e3a 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -257,7 +257,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index b068cd3ce2d..e1d3c4ca387 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -304,7 +304,7 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 83ec5da884f..be45697762e 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -4514,9 +4514,7 @@ class Trainer: tensors = nested_xla_mesh_reduce(tensors, name) elif is_sagemaker_mp_enabled(): tensors = smp_gather(tensors) - elif (self.args.distributed_state is not None and self.args.distributed_state.distributed_type != "NO") or ( - self.args.distributed_state is None and self.args.local_rank != -1 - ): + elif self.args.parallel_mode == ParallelMode.DISTRIBUTED: tensors = distributed_concat(tensors) return 
tensors diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 12b657e43ba..cd7489bcbdb 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -383,8 +383,6 @@ class TrainingArguments: on PyTorch's version default of `torch.backends.cuda.matmul.allow_tf32`. For more details please refer to the [TF32](https://huggingface.co/docs/transformers/perf_train_gpu_one#tf32) documentation. This is an experimental API and it may change. - local_rank (`int`, *optional*, defaults to -1): - Rank of the process during distributed training. ddp_backend (`str`, *optional*): The backend to use for distributed training. Must be one of `"nccl"`, `"mpi"`, `"ccl"`, `"gloo"`, `"hccl"`. tpu_num_cores (`int`, *optional*): @@ -998,7 +996,6 @@ class TrainingArguments: ) }, ) - local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"}) ddp_backend: Optional[str] = field( default=None, metadata={ @@ -1900,8 +1897,7 @@ class TrainingArguments: self._n_gpu = 0 elif is_sagemaker_mp_enabled(): accelerator_state_kwargs["enabled"] = False - local_rank = smp.local_rank() - device = torch.device("cuda", local_rank) + device = torch.device("cuda", smp.local_rank()) torch.cuda.set_device(device) elif is_sagemaker_dp_enabled(): accelerator_state_kwargs["_use_sagemaker_dp"] = True @@ -1925,7 +1921,6 @@ class TrainingArguments: del os.environ["ACCELERATE_USE_DEEPSPEED"] if not is_sagemaker_mp_enabled(): device = self.distributed_state.device - self.local_rank = self.distributed_state.local_process_index if dist.is_available() and dist.is_initialized() and self.parallel_mode != ParallelMode.DISTRIBUTED: logger.warning( "torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. 
" @@ -2015,9 +2010,7 @@ class TrainingArguments: return ParallelMode.SAGEMAKER_MODEL_PARALLEL elif is_sagemaker_dp_enabled(): return ParallelMode.SAGEMAKER_DATA_PARALLEL - elif ( - self.distributed_state is not None and self.distributed_state.distributed_type != DistributedType.NO - ) or (self.distributed_state is None and self.local_rank != -1): + elif self.distributed_state is not None and self.distributed_state.distributed_type != DistributedType.NO: return ParallelMode.DISTRIBUTED elif self.n_gpu > 1: return ParallelMode.NOT_DISTRIBUTED diff --git a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py index 89ee64a5d88..696d144c1dd 100755 --- a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py +++ b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py @@ -235,8 +235,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.parallel_mode.value == 'distributed')}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") diff --git a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py index bb7a263a0eb..cec877506b6 100644 --- a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py +++ b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py @@ -207,8 +207,8 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.parallel_mode.value == 'distributed')}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if training_args.should_log: diff --git a/tests/trainer/test_trainer_distributed.py b/tests/trainer/test_trainer_distributed.py index 54568caac37..37669013e67 100644 --- a/tests/trainer/test_trainer_distributed.py +++ b/tests/trainer/test_trainer_distributed.py @@ -140,7 +140,7 @@ if __name__ == "__main__": training_args = parser.parse_args_into_dataclasses()[0] logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " f"distributed training: {training_args.parallel_mode != ParallelMode.NOT_DISTRIBUTED}" ) @@ -152,7 +152,7 @@ if __name__ == "__main__": def compute_metrics(p: EvalPrediction) -> dict: sequential = list(range(len(dataset))) success = p.predictions.tolist() == sequential and p.label_ids.tolist() == sequential - if not success and 
training_args.local_rank == 0: + if not success and training_args.local_process_index == 0: logger.warning( "Predictions and/or labels do not match expected results:\n - predictions: " f"{p.predictions.tolist()}\n - labels: {p.label_ids.tolist()}\n - expected: {sequential}" diff --git a/tests/trainer/test_trainer_tpu.py b/tests/trainer/test_trainer_tpu.py index e7bb3d26406..75ee3701544 100644 --- a/tests/trainer/test_trainer_tpu.py +++ b/tests/trainer/test_trainer_tpu.py @@ -68,7 +68,7 @@ def main(): training_args = parser.parse_args_into_dataclasses()[0] logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, " + f"Process rank: {training_args.local_process_index}, device: {training_args.device}, " f"tpu_num_cores: {training_args.tpu_num_cores}", )
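Migration sketch (illustrative, not part of the patch): downstream scripts that still read `training_args.local_rank` can switch to the Accelerate-backed properties this change relies on. The snippet below assumes only attributes that already exist on `TrainingArguments` (`local_process_index`, `parallel_mode`, `device`, `n_gpu`) and the existing `is_main_process` helper; the `output_dir` value is a placeholder.

import logging

from transformers import TrainingArguments
from transformers.trainer_utils import is_main_process
from transformers.training_args import ParallelMode

logger = logging.getLogger(__name__)

training_args = TrainingArguments(output_dir="tmp_out")

# Before this patch: the rank came from the removed `local_rank` field and the
# distributed flag from its -1 sentinel, e.g.
#   rank = training_args.local_rank
#   is_distributed = training_args.local_rank != -1

# After: read the local rank from the Accelerate-backed property and derive
# the distributed flag from `parallel_mode`.
rank = training_args.local_process_index
is_distributed = training_args.parallel_mode == ParallelMode.DISTRIBUTED

logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
    rank,
    training_args.device,
    training_args.n_gpu,
    is_distributed,
)

# Main-process-only setup (e.g. raising Transformers logging verbosity) keeps
# working by passing the new property to the same helper.
if is_main_process(training_args.local_process_index):
    logger.setLevel(logging.INFO)

Run on a single process the snippet reports rank 0 and a False distributed flag; under `torchrun` each process reports its own local rank and the flag derived from `parallel_mode`, matching what the updated example scripts log.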