deprecate overwrite_output_dir (#41323)

* dep

* style

* rm

* wut

* style
This commit is contained in:
Marc Sun
2025-10-09 18:36:19 +02:00
committed by GitHub
parent 3839d51013
commit 776eea8612
81 changed files with 46 additions and 939 deletions

View File

@ -153,7 +153,7 @@ You are not required to read the following guidelines before opening an issue. H
cd examples/seq2seq cd examples/seq2seq
torchrun --nproc_per_node=2 ./finetune_trainer.py \ torchrun --nproc_per_node=2 ./finetune_trainer.py \
--model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \ --model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
--output_dir output_dir --overwrite_output_dir \ --output_dir output_dir \
--do_train --n_train 500 --num_train_epochs 1 \ --do_train --n_train 500 --num_train_epochs 1 \
--per_device_train_batch_size 1 --freeze_embeds \ --per_device_train_batch_size 1 --freeze_embeds \
--src_lang en_XX --tgt_lang ro_RO --task translation \ --src_lang en_XX --tgt_lang ro_RO --task translation \

View File

@ -93,7 +93,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -117,7 +116,6 @@ torchrun \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -140,7 +138,6 @@ python xla_spawn.py --num_cores 8 \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -197,7 +194,6 @@ python examples/pytorch/summarization/run_summarization.py \
--summary_column summary_column_name \ --summary_column summary_column_name \
--source_prefix "summarize: " \ --source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--overwrite_output_dir \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--predict_with_generate --predict_with_generate
@ -225,7 +221,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -239,8 +234,6 @@ examples/pytorch/summarization/run_summarization.py -h
خيار آخر مفيد لتمكينه هو استئناف التدريب من نقطة تفتيش سابقة. سيضمن ذلك أنك تستطيع الاستمرار من حيث توقفت دون البدء من جديد إذا تم مقاطعة تدريبك. هناك طريقتان لاستئناف التدريب من نقطة تفتيش. خيار آخر مفيد لتمكينه هو استئناف التدريب من نقطة تفتيش سابقة. سيضمن ذلك أنك تستطيع الاستمرار من حيث توقفت دون البدء من جديد إذا تم مقاطعة تدريبك:
تستخدم الطريقة الأولى المعلمة `output_dir previous_output_dir` لاستئناف التدريب من أحدث نقطة تفتيش مخزنة في `output_dir`. في هذه الحالة، يجب عليك إزالة `overwrite_output_dir`:
```bash ```bash
python examples/pytorch/summarization/run_summarization.py \ python examples/pytorch/summarization/run_summarization.py \
--model_name_or_path google-t5/t5-small \ --model_name_or_path google-t5/t5-small \
@ -252,24 +245,6 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--output_dir previous_output_dir \
--predict_with_generate
```
تستخدم الطريقة الثانية معلمة `resume_from_checkpoint path_to_specific_checkpoint` لاستئناف التدريب من مجلد نقطة تفتيش محددة.
```bash
python examples/pytorch/summarization/run_summarization.py
--model_name_or_path google-t5/t5-small \
--do_train \
--do_eval \
--dataset_name cnn_dailymail \
--dataset_config "3.0.0" \
--source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \
--overwrite_output_dir \
--resume_from_checkpoint path_to_specific_checkpoint \ --resume_from_checkpoint path_to_specific_checkpoint \
--predict_with_generate --predict_with_generate
``` ```
@ -301,6 +276,5 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```

View File

@ -611,7 +611,6 @@ accelerate launch \
--learning_rate 5e-5 \ --learning_rate 5e-5 \
--num_train_epochs 3 \ --num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/ \ --output_dir /tmp/$TASK_NAME/
--overwrite_output_dir
``` ```
يمكنك أيضًا تحديد المعلمات من ملف `config_file.yaml` مباشرة في سطر الأوامر: يمكنك أيضًا تحديد المعلمات من ملف `config_file.yaml` مباشرة في سطر الأوامر:
@ -634,7 +633,6 @@ accelerate launch --num_processes=2 \
--learning_rate 5e-5 \ --learning_rate 5e-5 \
--num_train_epochs 3 \ --num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/ \ --output_dir /tmp/$TASK_NAME/
--overwrite_output_dir
``` ```
اطلع على برنامج تعليمي [Launching your Accelerate scripts](https://huggingface.co/docs/accelerate/basic_tutorials/launch) لمعرفة المزيد حول `accelerate launch` والتكوينات المخصصة. اطلع على برنامج تعليمي [Launching your Accelerate scripts](https://huggingface.co/docs/accelerate/basic_tutorials/launch) لمعرفة المزيد حول `accelerate launch` والتكوينات المخصصة.

View File

@ -98,7 +98,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -122,7 +121,6 @@ torchrun \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -144,7 +142,6 @@ python xla_spawn.py --num_cores 8 \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -201,7 +198,6 @@ python examples/pytorch/summarization/run_summarization.py \
--summary_column summary_column_name \ --summary_column summary_column_name \
--source_prefix "summarize: " \ --source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--overwrite_output_dir \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--predict_with_generate --predict_with_generate
@ -229,7 +225,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -243,8 +238,6 @@ examples/pytorch/summarization/run_summarization.py -h
Eine weitere hilfreiche Option, die Sie aktivieren können, ist die Wiederaufnahme des Trainings von einem früheren Kontrollpunkt aus. Auf diese Weise können Sie im Falle einer Unterbrechung Ihres Trainings dort weitermachen, wo Sie aufgehört haben, ohne von vorne beginnen zu müssen. Es gibt zwei Methoden, um das Training von einem Kontrollpunkt aus wieder aufzunehmen. Eine weitere hilfreiche Option, die Sie aktivieren können, ist die Wiederaufnahme des Trainings von einem früheren Kontrollpunkt aus. Auf diese Weise können Sie im Falle einer Unterbrechung Ihres Trainings dort weitermachen, wo Sie aufgehört haben, ohne von vorne beginnen zu müssen:
Die erste Methode verwendet das Argument `output_dir previous_output_dir`, um das Training ab dem letzten in `output_dir` gespeicherten Kontrollpunkt wieder aufzunehmen. In diesem Fall sollten Sie `overwrite_output_dir` entfernen:
```bash ```bash
python examples/pytorch/summarization/run_summarization.py \ python examples/pytorch/summarization/run_summarization.py \
--model_name_or_path google-t5/t5-small \ --model_name_or_path google-t5/t5-small \
@ -256,24 +249,6 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--output_dir previous_output_dir \
--predict_with_generate
```
Die zweite Methode verwendet das Argument `resume_from_checkpoint path_to_specific_checkpoint`, um das Training ab einem bestimmten Checkpoint-Ordner wieder aufzunehmen.
```bash
python examples/pytorch/summarization/run_summarization.py
--model_name_or_path google-t5/t5-small \
--do_train \
--do_eval \
--dataset_name cnn_dailymail \
--dataset_config "3.0.0" \
--source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \
--overwrite_output_dir \
--resume_from_checkpoint path_to_specific_checkpoint \ --resume_from_checkpoint path_to_specific_checkpoint \
--predict_with_generate --predict_with_generate
``` ```
@ -305,6 +280,5 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```

View File

@ -593,7 +593,7 @@ To deploy DeepSpeed on multiple GPUs, add `--num_gpus`. You don't need to add `-
deepspeed --num_gpus=2 examples/pytorch/translation/run_translation.py \ deepspeed --num_gpus=2 examples/pytorch/translation/run_translation.py \
--deepspeed tests/deepspeed/ds_config_zero3.json \ --deepspeed tests/deepspeed/ds_config_zero3.json \
--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ --model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \
--output_dir output_dir --overwrite_output_dir --fp16 \ --output_dir output_dir --fp16 \
--do_train --max_train_samples 500 --num_train_epochs 1 \ --do_train --max_train_samples 500 --num_train_epochs 1 \
--dataset_name wmt16 --dataset_config "ro-en" \ --dataset_name wmt16 --dataset_config "ro-en" \
--source_lang en --target_lang ro --source_lang en --target_lang ro
@ -616,7 +616,7 @@ To deploy DeepSpeed on a single GPU, add `--num_gpus`. You don't need to add `--
deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \
--deepspeed tests/deepspeed/ds_config_zero2.json \ --deepspeed tests/deepspeed/ds_config_zero2.json \
--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ --model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \
--output_dir output_dir --overwrite_output_dir --fp16 \ --output_dir output_dir --fp16 \
--do_train --max_train_samples 500 --num_train_epochs 1 \ --do_train --max_train_samples 500 --num_train_epochs 1 \
--dataset_name wmt16 --dataset_config "ro-en" \ --dataset_name wmt16 --dataset_config "ro-en" \
--source_lang en --target_lang ro --source_lang en --target_lang ro

View File

@ -61,9 +61,8 @@ The example below fine-tunes [T5-small](https://huggingface.co/google-t5/t5-smal
The example script downloads and preprocesses a dataset, and then fine-tunes it with [`Trainer`] with a supported model architecture. The example script downloads and preprocesses a dataset, and then fine-tunes it with [`Trainer`] with a supported model architecture.
Resuming training from a checkpoint is very useful if training is interrupted because you don't have to start over again. There are two ways to resume training from a checkpoint. Resuming training from a checkpoint is very useful if training is interrupted because you don't have to start over again:
* `--output_dir previous_output_dir` resumes training from the latest checkpoint stored in `output_dir`. Remove the `--overwrite_output_dir` parameter if you're using this method.
* `--resume_from_checkpoint path_to_specific_checkpoint` resumes training from a specific checkpoint folder. * `--resume_from_checkpoint path_to_specific_checkpoint` resumes training from a specific checkpoint folder.
Share your model on the [Hub](https://huggingface.co/) with the `--push_to_hub` parameter. It creates a repository and uploads the model to the folder name specified in `--output_dir`. You could also use the `--push_to_hub_model_id` parameter to specify the repository name. Share your model on the [Hub](https://huggingface.co/) with the `--push_to_hub` parameter. It creates a repository and uploads the model to the folder name specified in `--output_dir`. You could also use the `--push_to_hub_model_id` parameter to specify the repository name.
@ -85,9 +84,6 @@ python examples/pytorch/summarization/run_summarization.py \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--push_to_hub \ --push_to_hub \
--push_to_hub_model_id finetuned-t5-cnn_dailymail \ --push_to_hub_model_id finetuned-t5-cnn_dailymail \
# remove if using `output_dir previous_output_dir`
# --overwrite_output_dir \
--output_dir previous_output_dir \
# --resume_from_checkpoint path_to_specific_checkpoint \ # --resume_from_checkpoint path_to_specific_checkpoint \
--predict_with_generate \ --predict_with_generate \
``` ```
@ -168,7 +164,6 @@ python examples/pytorch/summarization/run_summarization.py \
--summary_column summary_column_name \ --summary_column summary_column_name \
--source_prefix "summarize: " \ --source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--overwrite_output_dir \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--predict_with_generate \ --predict_with_generate \

View File

@ -361,8 +361,7 @@ accelerate launch \
--per_device_train_batch_size 16 \ --per_device_train_batch_size 16 \
--learning_rate 5e-5 \ --learning_rate 5e-5 \
--num_train_epochs 3 \ --num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/ \ --output_dir /tmp/$TASK_NAME/
--overwrite_output_dir
``` ```
> [!TIP] > [!TIP]

View File

@ -98,7 +98,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -122,7 +121,6 @@ torchrun \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -144,7 +142,6 @@ python xla_spawn.py --num_cores 8 \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -201,7 +198,6 @@ python examples/pytorch/summarization/run_summarization.py \
--summary_column summary_column_name \ --summary_column summary_column_name \
--source_prefix "summarize: " \ --source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--overwrite_output_dir \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--predict_with_generate --predict_with_generate
@ -229,7 +225,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -243,8 +238,6 @@ examples/pytorch/summarization/run_summarization.py -h
Otra opción útil para habilitar es reanudar el entrenamiento desde un punto de control anterior. Esto asegurará que puedas continuar donde lo dejaste sin comenzar de nuevo si tu entrenamiento se interrumpe. Hay dos métodos para reanudar el entrenamiento desde un punto de control. Otra opción útil para habilitar es reanudar el entrenamiento desde un punto de control anterior. Esto asegurará que puedas continuar donde lo dejaste sin comenzar de nuevo si tu entrenamiento se interrumpe:
El primer método utiliza el argumento `output_dir previous_output_dir` para reanudar el entrenamiento desde el último punto de control almacenado en `output_dir`. En este caso, debes eliminar `overwrite_output_dir`:
```bash ```bash
python examples/pytorch/summarization/run_summarization.py \ python examples/pytorch/summarization/run_summarization.py \
--model_name_or_path google-t5/t5-small \ --model_name_or_path google-t5/t5-small \
@ -256,24 +249,6 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--output_dir previous_output_dir \
--predict_with_generate
```
El segundo método utiliza el argumento `resume_from_checkpoint path_to_specific_checkpoint` para reanudar el entrenamiento desde una carpeta de punto de control específica.
```bash
python examples/pytorch/summarization/run_summarization.py
--model_name_or_path google-t5/t5-small \
--do_train \
--do_eval \
--dataset_name cnn_dailymail \
--dataset_config "3.0.0" \
--source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \
--overwrite_output_dir \
--resume_from_checkpoint path_to_specific_checkpoint \ --resume_from_checkpoint path_to_specific_checkpoint \
--predict_with_generate --predict_with_generate
``` ```
@ -305,6 +280,5 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```

View File

@ -381,7 +381,6 @@ accelerate launch \
--learning_rate 5e-5 \ --learning_rate 5e-5 \
--num_train_epochs 3 \ --num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/ \ --output_dir /tmp/$TASK_NAME/
--overwrite_output_dir
``` ```
También puedes especificar los parámetros del archivo config_file.yaml directamente en la línea de comandos: También puedes especificar los parámetros del archivo config_file.yaml directamente en la línea de comandos:
@ -404,7 +403,6 @@ accelerate launch --num_processes=2 \
--learning_rate 5e-5 \ --learning_rate 5e-5 \
--num_train_epochs 3 \ --num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/ \ --output_dir /tmp/$TASK_NAME/
--overwrite_output_dir
``` ```
Consulta el tutorial [Lanzamiento de tus scripts con Accelerate](https://huggingface.co/docs/accelerate/basic_tutorials/launch) para obtener más información sobre `accelerate launch` y las configuraciones personalizadas. Consulta el tutorial [Lanzamiento de tus scripts con Accelerate](https://huggingface.co/docs/accelerate/basic_tutorials/launch) para obtener más información sobre `accelerate launch` y las configuraciones personalizadas.

View File

@ -100,7 +100,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -124,7 +123,6 @@ torchrun \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -147,7 +145,6 @@ python xla_spawn.py --num_cores 8 \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -204,7 +201,6 @@ python examples/pytorch/summarization/run_summarization.py \
--summary_column summary_column_name \ --summary_column summary_column_name \
--source_prefix "summarize: " \ --source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--overwrite_output_dir \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--predict_with_generate --predict_with_generate
@ -231,7 +227,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -243,9 +238,7 @@ examples/pytorch/summarization/run_summarization.py -h
## Reprendre l'entraînement à partir d'un point de contrôle ## Reprendre l'entraînement à partir d'un point de contrôle
Une autre option utile est de reprendre l'entraînement à partir d'un point de contrôle précédent. Cela vous permettra de reprendre là où vous vous étiez arrêté sans recommencer si votre entraînement est interrompu. Il existe deux méthodes pour reprendre l'entraînement à partir d'un point de contrôle. Une autre option utile est de reprendre l'entraînement à partir d'un point de contrôle précédent. Cela vous permettra de reprendre là où vous vous étiez arrêté sans recommencer si votre entraînement est interrompu:
La première méthode utilise l'argument `output_dir previous_output_dir` pour reprendre l'entraînement à partir du dernier point de contrôle stocké dans `output_dir`. Dans ce cas, vous devez supprimer l'argument `overwrite_output_dir`.
```bash ```bash
python examples/pytorch/summarization/run_summarization.py python examples/pytorch/summarization/run_summarization.py
@ -258,24 +251,6 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--output_dir previous_output_dir \
--predict_with_generate
```
La seconde méthode utilise l'argument `resume_from_checkpoint path_to_specific_checkpoint` pour reprendre l'entraînement à partir d'un dossier de point de contrôle spécifique.
```bash
python examples/pytorch/summarization/run_summarization.py
--model_name_or_path google-t5/t5-small \
--do_train \
--do_eval \
--dataset_name cnn_dailymail \
--dataset_config "3.0.0" \
--source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \
--overwrite_output_dir \
--resume_from_checkpoint path_to_specific_checkpoint \ --resume_from_checkpoint path_to_specific_checkpoint \
--predict_with_generate --predict_with_generate
``` ```
@ -308,6 +283,5 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```

View File

@ -98,7 +98,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -122,7 +121,6 @@ torchrun \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -144,7 +142,6 @@ python xla_spawn.py --num_cores 8 \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -201,7 +198,6 @@ python examples/pytorch/summarization/run_summarization.py \
--summary_column summary_column_name \ --summary_column summary_column_name \
--source_prefix "summarize: " \ --source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--overwrite_output_dir \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--predict_with_generate --predict_with_generate
@ -229,7 +225,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -243,8 +238,6 @@ examples/pytorch/summarization/run_summarization.py -h
Un'altra utile opzione è riavviare un addestramento da un checkpoint precedente. Questo garantirà che tu possa riprendere da dove hai interrotto senza ricominciare se l'addestramento viene interrotto. Ci sono due metodi per riavviare l'addestramento da un checkpoint: Un'altra utile opzione è riavviare un addestramento da un checkpoint precedente. Questo garantirà che tu possa riprendere da dove hai interrotto senza ricominciare se l'addestramento viene interrotto:
Il primo metodo usa l'argomento `output_dir previous_output_dir` per riavviare l'addestramento dall'ultima versione del checkpoint contenuto in `output_dir`. In questo caso, dovresti rimuovere `overwrite_output_dir`:
```bash ```bash
python examples/pytorch/summarization/run_summarization.py \ python examples/pytorch/summarization/run_summarization.py \
--model_name_or_path google-t5/t5-small \ --model_name_or_path google-t5/t5-small \
@ -256,24 +249,6 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--output_dir previous_output_dir \
--predict_with_generate
```
Il secondo metodo usa l'argomento `resume_from_checkpoint path_to_specific_checkpoint` per riavviare un addestramento da una specifica cartella di checkpoint.
```bash
python examples/pytorch/summarization/run_summarization.py
--model_name_or_path google-t5/t5-small \
--do_train \
--do_eval \
--dataset_name cnn_dailymail \
--dataset_config "3.0.0" \
--source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \
--overwrite_output_dir \
--resume_from_checkpoint path_to_specific_checkpoint \ --resume_from_checkpoint path_to_specific_checkpoint \
--predict_with_generate --predict_with_generate
``` ```
@ -305,6 +280,5 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```

View File

@ -188,7 +188,7 @@ deepspeed --num_gpus=2 your_program.py <normal cl args> --deepspeed ds_config.js
deepspeed examples/pytorch/translation/run_translation.py \ deepspeed examples/pytorch/translation/run_translation.py \
--deepspeed tests/deepspeed/ds_config_zero3.json \ --deepspeed tests/deepspeed/ds_config_zero3.json \
--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ --model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \
--output_dir output_dir --overwrite_output_dir --fp16 \ --output_dir output_dir --fp16 \
--do_train --max_train_samples 500 --num_train_epochs 1 \ --do_train --max_train_samples 500 --num_train_epochs 1 \
--dataset_name wmt16 --dataset_config "ro-en" \ --dataset_name wmt16 --dataset_config "ro-en" \
--source_lang en --target_lang ro --source_lang en --target_lang ro
@ -211,7 +211,7 @@ DeepSpeed 関連の引数が 2 つありますが、簡単にするためであ
deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \
--deepspeed tests/deepspeed/ds_config_zero2.json \ --deepspeed tests/deepspeed/ds_config_zero2.json \
--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ --model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \
--output_dir output_dir --overwrite_output_dir --fp16 \ --output_dir output_dir --fp16 \
--do_train --max_train_samples 500 --num_train_epochs 1 \ --do_train --max_train_samples 500 --num_train_epochs 1 \
--dataset_name wmt16 --dataset_config "ro-en" \ --dataset_name wmt16 --dataset_config "ro-en" \
--source_lang en --target_lang ro --source_lang en --target_lang ro
@ -1789,7 +1789,7 @@ deepspeed examples/pytorch/translation/run_translation.py \
--model_name_or_path google-t5/t5-small --output_dir output_dir \ --model_name_or_path google-t5/t5-small --output_dir output_dir \
--do_eval --max_eval_samples 50 --warmup_steps 50 \ --do_eval --max_eval_samples 50 --warmup_steps 50 \
--max_source_length 128 --val_max_target_length 128 \ --max_source_length 128 --val_max_target_length 128 \
--overwrite_output_dir --per_device_eval_batch_size 4 \ --per_device_eval_batch_size 4 \
--predict_with_generate --dataset_config "ro-en" --fp16 \ --predict_with_generate --dataset_config "ro-en" --fp16 \
--source_lang en --target_lang ro --dataset_name wmt16 \ --source_lang en --target_lang ro --dataset_name wmt16 \
--source_prefix "translate English to Romanian: " --source_prefix "translate English to Romanian: "

View File

@ -534,7 +534,6 @@ python examples/pytorch/text-classification/run_glue.py \
--learning_rate 2e-5 \ --learning_rate 2e-5 \
--num_train_epochs 3 \ --num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/ \ --output_dir /tmp/$TASK_NAME/
--overwrite_output_dir
``` ```
**注意すべきいくつかの注意事項** **注意すべきいくつかの注意事項**
@ -669,7 +668,6 @@ accelerate launch \
--learning_rate 5e-5 \ --learning_rate 5e-5 \
--num_train_epochs 3 \ --num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/ \ --output_dir /tmp/$TASK_NAME/
--overwrite_output_dir
``` ```
4. `accelerate launch`するための cmd 引数を直接使用することもできます。上の例は次のようにマッピングされます。 4. `accelerate launch`するための cmd 引数を直接使用することもできます。上の例は次のようにマッピングされます。
@ -694,7 +692,6 @@ accelerate launch --num_processes=2 \
--learning_rate 5e-5 \ --learning_rate 5e-5 \
--num_train_epochs 3 \ --num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/ \ --output_dir /tmp/$TASK_NAME/
--overwrite_output_dir
``` ```
詳細については、🤗 Accelerate CLI ガイドを参照してください: [🤗 Accelerate スクリプトの起動](https://huggingface.co/docs/accelerate/basic_tutorials/launch)。 詳細については、🤗 Accelerate CLI ガイドを参照してください: [🤗 Accelerate スクリプトの起動](https://huggingface.co/docs/accelerate/basic_tutorials/launch)。

View File

@ -104,7 +104,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -131,7 +130,6 @@ torchrun \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -153,7 +151,6 @@ python xla_spawn.py --num_cores 8 \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -212,7 +209,6 @@ python examples/pytorch/summarization/run_summarization.py \
--summary_column summary_column_name \ --summary_column summary_column_name \
--source_prefix "summarize: " \ --source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--overwrite_output_dir \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--predict_with_generate --predict_with_generate
@ -240,7 +236,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -254,8 +249,6 @@ examples/pytorch/summarization/run_summarization.py -h
以前のチェックポイントからトレーニングを再開するための役立つオプションもあります。これにより、トレーニングが中断された場合でも、最初からやり直すことなく、中断したところから再開できます。チェックポイントからトレーニングを再開するための2つの方法があります。 以前のチェックポイントからトレーニングを再開するための役立つオプションもあります。これにより、トレーニングが中断された場合でも、最初からやり直すことなく、中断したところから再開できます。チェックポイントからトレーニングを再開するための2つの方法があります。
最初の方法は、`output_dir previous_output_dir` 引数を使用して、`output_dir` に保存された最新のチェックポイントからトレーニングを再開する方法です。この場合、`overwrite_output_dir` を削除する必要があります:
```bash ```bash
python examples/pytorch/summarization/run_summarization.py python examples/pytorch/summarization/run_summarization.py \
--model_name_or_path google-t5/t5-small \ --model_name_or_path google-t5/t5-small \
@ -267,25 +260,6 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--output_dir previous_output_dir \
--predict_with_generate
```
2番目の方法では、`resume_from_checkpoint path_to_specific_checkpoint` 引数を使用して、特定のチェックポイントフォルダからトレーニングを再開します。
```bash
python examples/pytorch/summarization/run_summarization.py
--model_name_or_path google-t5/t5-small \
--do_train \
--do_eval \
--dataset_name cnn_dailymail \
--dataset_config "3.0.0" \
--source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \
--overwrite_output_dir \
--resume_from_checkpoint path_to_specific_checkpoint \ --resume_from_checkpoint path_to_specific_checkpoint \
--predict_with_generate --predict_with_generate
``` ```
@ -319,7 +293,6 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```

View File

@ -590,7 +590,7 @@ bf16은 설정 파일에서 설정하거나 다음 인수를 전달하면 명령
deepspeed --num_gpus=2 examples/pytorch/translation/run_translation.py \ deepspeed --num_gpus=2 examples/pytorch/translation/run_translation.py \
--deepspeed tests/deepspeed/ds_config_zero3.json \ --deepspeed tests/deepspeed/ds_config_zero3.json \
--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ --model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \
--output_dir output_dir --overwrite_output_dir --fp16 \ --output_dir output_dir --fp16 \
--do_train --max_train_samples 500 --num_train_epochs 1 \ --do_train --max_train_samples 500 --num_train_epochs 1 \
--dataset_name wmt16 --dataset_config "ro-en" \ --dataset_name wmt16 --dataset_config "ro-en" \
--source_lang en --target_lang ro --source_lang en --target_lang ro
@ -605,7 +605,7 @@ deepspeed --num_gpus=2 examples/pytorch/translation/run_translation.py \
deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \
--deepspeed tests/deepspeed/ds_config_zero2.json \ --deepspeed tests/deepspeed/ds_config_zero2.json \
--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ --model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \
--output_dir output_dir --overwrite_output_dir --fp16 \ --output_dir output_dir --fp16 \
--do_train --max_train_samples 500 --num_train_epochs 1 \ --do_train --max_train_samples 500 --num_train_epochs 1 \
--dataset_name wmt16 --dataset_config "ro-en" \ --dataset_name wmt16 --dataset_config "ro-en" \
--source_lang en --target_lang ro --source_lang en --target_lang ro

View File

@ -54,8 +54,7 @@ python examples/pytorch/text-classification/run_glue.py \
--per_device_train_batch_size 32 \ --per_device_train_batch_size 32 \
--learning_rate 2e-5 \ --learning_rate 2e-5 \
--num_train_epochs 3 \ --num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/ \ --output_dir /tmp/$TASK_NAME/
--overwrite_output_dir
``` ```
`gloo`와 `nccl`과 같은 [분산 학습 백엔드](https://pytorch.org/docs/stable/distributed.html#backends)는 `mps` 장치에서 지원되지 않으므로, MPS 백엔드에서는 단일 GPU로만 학습이 가능합니다. `gloo`와 `nccl`과 같은 [분산 학습 백엔드](https://pytorch.org/docs/stable/distributed.html#backends)는 `mps` 장치에서 지원되지 않으므로, MPS 백엔드에서는 단일 GPU로만 학습이 가능합니다.

View File

@ -106,7 +106,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -131,7 +130,6 @@ torchrun \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -156,7 +154,6 @@ python xla_spawn.py --num_cores 8 \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -216,7 +213,6 @@ python examples/pytorch/summarization/run_summarization.py \
--summary_column summary_column_name \ --summary_column summary_column_name \
--source_prefix "summarize: " \ --source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--overwrite_output_dir \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--predict_with_generate --predict_with_generate
@ -245,7 +241,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -262,8 +257,6 @@ examples/pytorch/summarization/run_summarization.py -h
이렇게 하면 훈련이 중단되더라도 처음부터 다시 시작하지 않고 중단한 부분부터 다시 시작할 수 있습니다. 이렇게 하면 훈련이 중단되더라도 처음부터 다시 시작하지 않고 중단한 부분부터 다시 시작할 수 있습니다.
체크포인트에서 훈련을 재개하는 방법에는 두 가지가 있습니다. 체크포인트에서 훈련을 재개하는 방법에는 두 가지가 있습니다.
첫 번째는 `output_dir previous_output_dir` 인수를 사용하여 `output_dir`에 저장된 최신 체크포인트부터 훈련을 재개하는 방법입니다.
이 경우 `overwrite_output_dir`을 제거해야 합니다:
```bash ```bash
python examples/pytorch/summarization/run_summarization.py python examples/pytorch/summarization/run_summarization.py \
--model_name_or_path google-t5/t5-small \ --model_name_or_path google-t5/t5-small \
@ -275,24 +268,6 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--output_dir previous_output_dir \
--predict_with_generate
```
두 번째는 `resume_from_checkpoint path_to_specific_checkpoint` 인수를 사용하여 특정 체크포인트 폴더에서 훈련을 재개하는 방법입니다.
```bash
python examples/pytorch/summarization/run_summarization.py
--model_name_or_path google-t5/t5-small \
--do_train \
--do_eval \
--dataset_name cnn_dailymail \
--dataset_config "3.0.0" \
--source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \
--overwrite_output_dir \
--resume_from_checkpoint path_to_specific_checkpoint \ --resume_from_checkpoint path_to_specific_checkpoint \
--predict_with_generate --predict_with_generate
``` ```
@ -325,6 +300,5 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```

View File

@ -505,7 +505,6 @@ accelerate launch \
--learning_rate 5e-5 \ --learning_rate 5e-5 \
--num_train_epochs 3 \ --num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/ \ --output_dir /tmp/$TASK_NAME/
--overwrite_output_dir
``` ```
`config_file.yaml` 파일의 매개변수를 직접 지정할 수도 있습니다: `config_file.yaml` 파일의 매개변수를 직접 지정할 수도 있습니다:
@ -528,7 +527,6 @@ accelerate launch --num_processes=2 \
--learning_rate 5e-5 \ --learning_rate 5e-5 \
--num_train_epochs 3 \ --num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/ \ --output_dir /tmp/$TASK_NAME/
--overwrite_output_dir
``` ```
`accelerate_launch`와 사용자 정의 구성에 대해 더 알아보려면 [Accelerate 스크립트 실행](https://huggingface.co/docs/accelerate/basic_tutorials/launch) 튜토리얼을 확인하세요. `accelerate_launch`와 사용자 정의 구성에 대해 더 알아보려면 [Accelerate 스크립트 실행](https://huggingface.co/docs/accelerate/basic_tutorials/launch) 튜토리얼을 확인하세요.

View File

@ -99,7 +99,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -123,7 +122,6 @@ torchrun \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -145,7 +143,6 @@ python xla_spawn.py --num_cores 8 \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -203,7 +200,6 @@ python examples/pytorch/summarization/run_summarization.py \
--summary_column summary_column_name \ --summary_column summary_column_name \
--source_prefix "summarize: " \ --source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--overwrite_output_dir \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--predict_with_generate --predict_with_generate
@ -231,7 +227,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -245,8 +240,6 @@ examples/pytorch/summarization/run_summarization.py -h
Outra opção útil para habilitar é retomar o treinamento de um checkpoint anterior. Isso garantirá que você possa continuar de onde parou sem recomeçar se o seu treinamento for interrompido. Existem dois métodos para retomar o treinamento a partir de um checkpoint. Outra opção útil para habilitar é retomar o treinamento de um checkpoint anterior. Isso garantirá que você possa continuar de onde parou sem recomeçar se o seu treinamento for interrompido. Existem dois métodos para retomar o treinamento a partir de um checkpoint.
O primeiro método usa o argumento `output_dir previous_output_dir` para retomar o treinamento do último checkpoint armazenado em `output_dir`. Neste caso, você deve remover `overwrite_output_dir`:
```bash ```bash
python examples/pytorch/summarization/run_summarization.py python examples/pytorch/summarization/run_summarization.py \
--model_name_or_path google-t5/t5-small \ --model_name_or_path google-t5/t5-small \
@ -258,24 +251,6 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--output_dir previous_output_dir \
--predict_with_generate
```
O segundo método usa o argumento `resume_from_checkpoint path_to_specific_checkpoint` para retomar o treinamento de uma pasta de checkpoint específica.
```bash
python examples/pytorch/summarization/run_summarization.py
--model_name_or_path google-t5/t5-small \
--do_train \
--do_eval \
--dataset_name cnn_dailymail \
--dataset_config "3.0.0" \
--source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \
--overwrite_output_dir \
--resume_from_checkpoint path_to_specific_checkpoint \ --resume_from_checkpoint path_to_specific_checkpoint \
--predict_with_generate --predict_with_generate
``` ```
@ -307,6 +282,5 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```

View File

@ -179,7 +179,7 @@ deepspeed --num_gpus=2 your_program.py <normal cl args> --deepspeed ds_config.js
deepspeed examples/pytorch/translation/run_translation.py \ deepspeed examples/pytorch/translation/run_translation.py \
--deepspeed tests/deepspeed/ds_config_zero3.json \ --deepspeed tests/deepspeed/ds_config_zero3.json \
--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ --model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \
--output_dir output_dir --overwrite_output_dir --fp16 \ --output_dir output_dir --fp16 \
--do_train --max_train_samples 500 --num_train_epochs 1 \ --do_train --max_train_samples 500 --num_train_epochs 1 \
--dataset_name wmt16 --dataset_config "ro-en" \ --dataset_name wmt16 --dataset_config "ro-en" \
--source_lang en --target_lang ro --source_lang en --target_lang ro
@ -202,7 +202,7 @@ deepspeed examples/pytorch/translation/run_translation.py \
deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \
--deepspeed tests/deepspeed/ds_config_zero2.json \ --deepspeed tests/deepspeed/ds_config_zero2.json \
--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ --model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \
--output_dir output_dir --overwrite_output_dir --fp16 \ --output_dir output_dir --fp16 \
--do_train --max_train_samples 500 --num_train_epochs 1 \ --do_train --max_train_samples 500 --num_train_epochs 1 \
--dataset_name wmt16 --dataset_config "ro-en" \ --dataset_name wmt16 --dataset_config "ro-en" \
--source_lang en --target_lang ro --source_lang en --target_lang ro
@ -1659,7 +1659,7 @@ deepspeed examples/pytorch/translation/run_translation.py \
--model_name_or_path google-t5/t5-small --output_dir output_dir \ --model_name_or_path google-t5/t5-small --output_dir output_dir \
--do_eval --max_eval_samples 50 --warmup_steps 50 \ --do_eval --max_eval_samples 50 --warmup_steps 50 \
--max_source_length 128 --val_max_target_length 128 \ --max_source_length 128 --val_max_target_length 128 \
--overwrite_output_dir --per_device_eval_batch_size 4 \ --per_device_eval_batch_size 4 \
--predict_with_generate --dataset_config "ro-en" --fp16 \ --predict_with_generate --dataset_config "ro-en" --fp16 \
--source_lang en --target_lang ro --dataset_name wmt16 \ --source_lang en --target_lang ro --dataset_name wmt16 \
--source_prefix "translate English to Romanian: " --source_prefix "translate English to Romanian: "

View File

@ -471,7 +471,6 @@ python examples/pytorch/text-classification/run_glue.py \
--learning_rate 2e-5 \ --learning_rate 2e-5 \
--num_train_epochs 3 \ --num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/ \ --output_dir /tmp/$TASK_NAME/
--overwrite_output_dir
``` ```
**需要注意的一些注意事项** **需要注意的一些注意事项**
@ -606,7 +605,6 @@ accelerate launch \
--learning_rate 5e-5 \ --learning_rate 5e-5 \
--num_train_epochs 3 \ --num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/ \ --output_dir /tmp/$TASK_NAME/
--overwrite_output_dir
``` ```
4. 你也可以直接使用`accelerate launch`的cmd参数。上面的示例将映射到 4. 你也可以直接使用`accelerate launch`的cmd参数。上面的示例将映射到
@ -631,7 +629,6 @@ accelerate launch --num_processes=2 \
--learning_rate 5e-5 \ --learning_rate 5e-5 \
--num_train_epochs 3 \ --num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/ \ --output_dir /tmp/$TASK_NAME/
--overwrite_output_dir
``` ```
有关更多信息,请参阅 🤗 Accelerate CLI 指南:[启动您的 🤗 Accelerate 脚本](https://huggingface.co/docs/accelerate/basic_tutorials/launch)。 有关更多信息,请参阅 🤗 Accelerate CLI 指南:[启动您的 🤗 Accelerate 脚本](https://huggingface.co/docs/accelerate/basic_tutorials/launch)。

View File

@ -50,7 +50,6 @@ python examples/pytorch/text-classification/run_glue.py \
--learning_rate 2e-5 \ --learning_rate 2e-5 \
--num_train_epochs 3 \ --num_train_epochs 3 \
--output_dir /tmp/$TASK_NAME/ \ --output_dir /tmp/$TASK_NAME/
--overwrite_output_dir
``` ```
用于[分布式设置](https://pytorch.org/docs/stable/distributed.html#backends)的后端(如`gloo`和`nccl`)不支持`mps`设备,这也意味着使用 MPS 后端时只能在单个 GPU 上进行训练。 用于[分布式设置](https://pytorch.org/docs/stable/distributed.html#backends)的后端(如`gloo`和`nccl`)不支持`mps`设备,这也意味着使用 MPS 后端时只能在单个 GPU 上进行训练。

View File

@ -99,7 +99,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -124,7 +123,6 @@ torchrun \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -145,7 +143,6 @@ python xla_spawn.py --num_cores 8 \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -202,7 +199,6 @@ python examples/pytorch/summarization/run_summarization.py \
--summary_column summary_column_name \ --summary_column summary_column_name \
--source_prefix "summarize: " \ --source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--overwrite_output_dir \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--predict_with_generate --predict_with_generate
@ -231,7 +227,6 @@ python examples/pytorch/summarization/run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -245,8 +240,6 @@ examples/pytorch/summarization/run_summarization.py -h
另一个有用的选项是从之前的checkpoint恢复训练。这将确保在训练中断时您可以从之前停止的地方继续进行而无需重新开始。有两种方法可以从checkpoint恢复训练。 另一个有用的选项是从之前的checkpoint恢复训练。这将确保在训练中断时您可以从之前停止的地方继续进行而无需重新开始。有两种方法可以从checkpoint恢复训练。
第一种方法使用`output_dir previous_output_dir`参数从存储在`output_dir`中的最新的checkpoint恢复训练。在这种情况下您应该删除`overwrite_output_dir`
```bash ```bash
python examples/pytorch/summarization/run_summarization.py python examples/pytorch/summarization/run_summarization.py \
--model_name_or_path google-t5/t5-small \ --model_name_or_path google-t5/t5-small \
@ -258,25 +251,6 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--output_dir previous_output_dir \
--predict_with_generate
```
第二种方法使用`resume_from_checkpoint path_to_specific_checkpoint`参数从特定的checkpoint文件夹恢复训练。
```bash
python examples/pytorch/summarization/run_summarization.py
--model_name_or_path google-t5/t5-small \
--do_train \
--do_eval \
--dataset_name cnn_dailymail \
--dataset_config "3.0.0" \
--source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \
--overwrite_output_dir \
--resume_from_checkpoint path_to_specific_checkpoint \ --resume_from_checkpoint path_to_specific_checkpoint \
--predict_with_generate --predict_with_generate
``` ```
@ -309,6 +283,5 @@ python examples/pytorch/summarization/run_summarization.py
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```

View File

@ -95,17 +95,6 @@ def main():
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses() model_args, data_args, training_args = parser.parse_args_into_dataclasses()
if (
os.path.exists(training_args.output_dir)
and os.listdir(training_args.output_dir)
and training_args.do_train
and not training_args.overwrite_output_dir
):
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. Use"
" --overwrite_output_dir to overcome."
)
# Setup logging # Setup logging
logging.basicConfig( logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",

View File

@ -641,9 +641,6 @@ def main():
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
) )
parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
parser.add_argument(
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
)
parser.add_argument( parser.add_argument(
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
) )
@ -677,18 +674,6 @@ def main():
"stride or increase the maximum length to ensure the features are correctly built." "stride or increase the maximum length to ensure the features are correctly built."
) )
if (
os.path.exists(args.output_dir)
and os.listdir(args.output_dir)
and args.do_train
and not args.overwrite_output_dir
):
raise ValueError(
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
args.output_dir
)
)
# Setup distant debugging if needed # Setup distant debugging if needed
if args.server_ip and args.server_port: if args.server_ip and args.server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script

View File

@ -76,17 +76,6 @@ def main():
else: else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses() model_args, data_args, training_args = parser.parse_args_into_dataclasses()
if (
os.path.exists(training_args.output_dir)
and os.listdir(training_args.output_dir)
and training_args.do_train
and not training_args.overwrite_output_dir
):
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. Use"
" --overwrite_output_dir to overcome."
)
# Setup logging # Setup logging
logging.basicConfig( logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",

View File

@ -207,16 +207,6 @@ def main():
"Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
"or remove the --do_eval argument." "or remove the --do_eval argument."
) )
if (
os.path.exists(training_args.output_dir)
and os.listdir(training_args.output_dir)
and training_args.do_train
and not training_args.overwrite_output_dir
):
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. Use"
" --overwrite_output_dir to overcome."
)
# Setup logging # Setup logging
logging.basicConfig( logging.basicConfig(

View File

@ -557,9 +557,6 @@ def main():
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
) )
parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
parser.add_argument(
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
)
parser.add_argument( parser.add_argument(
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
) )
@ -584,18 +581,6 @@ def main():
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
args = parser.parse_args() args = parser.parse_args()
if (
os.path.exists(args.output_dir)
and os.listdir(args.output_dir)
and args.do_train
and not args.overwrite_output_dir
):
raise ValueError(
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
args.output_dir
)
)
# Setup distant debugging if needed # Setup distant debugging if needed
if args.server_ip and args.server_port: if args.server_ip and args.server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script

View File

@ -39,7 +39,6 @@ from utils import (
Seq2SeqDataset, Seq2SeqDataset,
assert_all_frozen, assert_all_frozen,
build_compute_metrics_fn, build_compute_metrics_fn,
check_output_dir,
freeze_embeds, freeze_embeds,
freeze_params, freeze_params,
lmap, lmap,
@ -168,8 +167,6 @@ def main():
else: else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses() model_args, data_args, training_args = parser.parse_args_into_dataclasses()
check_output_dir(training_args)
# Setup logging # Setup logging
logging.basicConfig( logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",

View File

@ -20,7 +20,7 @@ export MAX_LEN=128
python finetune_trainer.py \ python finetune_trainer.py \
--tokenizer_name $m --model_name_or_path $m \ --tokenizer_name $m --model_name_or_path $m \
--data_dir $ENRO_DIR \ --data_dir $ENRO_DIR \
--output_dir marian_en_ro_6_3 --overwrite_output_dir \ --output_dir marian_en_ro_6_3 \
--learning_rate=3e-4 \ --learning_rate=3e-4 \
--warmup_steps 500 --sortish_sampler \ --warmup_steps 500 --sortish_sampler \
--fp16 \ --fp16 \

View File

@ -22,7 +22,7 @@ python xla_spawn.py --num_cores $TPU_NUM_CORES \
finetune_trainer.py \ finetune_trainer.py \
--tokenizer_name $m --model_name_or_path $m \ --tokenizer_name $m --model_name_or_path $m \
--data_dir $ENRO_DIR \ --data_dir $ENRO_DIR \
--output_dir marian_en_ro_6_3 --overwrite_output_dir \ --output_dir marian_en_ro_6_3 \
--learning_rate=3e-4 \ --learning_rate=3e-4 \
--warmup_steps 500 \ --warmup_steps 500 \
--per_device_train_batch_size=$BS --per_device_eval_batch_size=$BS \ --per_device_train_batch_size=$BS --per_device_eval_batch_size=$BS \

View File

@ -21,7 +21,7 @@ export MAX_TGT_LEN=142
python finetune_trainer.py \ python finetune_trainer.py \
--model_name_or_path $m --tokenizer_name $tok \ --model_name_or_path $m --tokenizer_name $tok \
--data_dir cnn_dm \ --data_dir cnn_dm \
--output_dir distilbart-cnn-12-6 --overwrite_output_dir \ --output_dir distilbart-cnn-12-6 \
--learning_rate=3e-5 \ --learning_rate=3e-5 \
--warmup_steps 500 --sortish_sampler \ --warmup_steps 500 --sortish_sampler \
--fp16 \ --fp16 \

View File

@ -15,7 +15,7 @@
python finetune_trainer.py \ python finetune_trainer.py \
--model_name_or_path=facebook/mbart-large-cc25 \ --model_name_or_path=facebook/mbart-large-cc25 \
--data_dir $ENRO_DIR \ --data_dir $ENRO_DIR \
--output_dir mbart_cc25_enro --overwrite_output_dir \ --output_dir mbart_cc25_enro \
--learning_rate=3e-5 \ --learning_rate=3e-5 \
--warmup_steps 500 \ --warmup_steps 500 \
--fp16 \ --fp16 \

View File

@ -639,27 +639,3 @@ def chunks(lst, n):
"""Yield successive n-sized chunks from lst.""" """Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n): for i in range(0, len(lst), n):
yield lst[i : i + n] yield lst[i : i + n]
def check_output_dir(args, expected_items=0):
"""
Checks whether to bail out if output_dir already exists and has more than expected_items in it
`args`: needs to have the following attributes of `args`:
- output_dir
- do_train
- overwrite_output_dir
`expected_items`: normally 0 (default) - i.e. empty dir, but in some cases a few files are expected (e.g. recovery from OOM)
"""
if (
os.path.exists(args.output_dir)
and len(os.listdir(args.output_dir)) > expected_items
and args.do_train
and not args.overwrite_output_dir
):
raise ValueError(
f"Output directory ({args.output_dir}) already exists and "
f"has {len(os.listdir(args.output_dir))} items in it (expected {expected_items} items). "
"Use --overwrite_output_dir to overcome."
)

View File

@ -111,17 +111,6 @@ def main():
else: else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses() model_args, data_args, training_args = parser.parse_args_into_dataclasses()
if (
os.path.exists(training_args.output_dir)
and os.listdir(training_args.output_dir)
and training_args.do_train
and not training_args.overwrite_output_dir
):
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. Use"
" --overwrite_output_dir to overcome."
)
module = import_module("tasks") module = import_module("tasks")
try: try:
token_classification_task_clazz = getattr(module, model_args.task_type) token_classification_task_clazz = getattr(module, model_args.task_type)

View File

@ -72,8 +72,7 @@ token-classification/run_ner.py -h
You can resume training from a previous checkpoint like this: You can resume training from a previous checkpoint like this:
1. Pass `--output_dir previous_output_dir` without `--overwrite_output_dir` to resume training from the latest checkpoint in `output_dir` (what you would use if the training was interrupted, for instance). 1. Pass `--resume_from_checkpoint path_to_a_specific_checkpoint` to resume training from that checkpoint folder.
2. Pass `--resume_from_checkpoint path_to_a_specific_checkpoint` to resume training from that checkpoint folder.
Should you want to turn an example into a notebook where you'd no longer have access to the command Should you want to turn an example into a notebook where you'd no longer have access to the command
line, 🤗 Trainer supports resuming from a checkpoint via `trainer.train(resume_from_checkpoint)`. line, 🤗 Trainer supports resuming from a checkpoint via `trainer.train(resume_from_checkpoint)`.

View File

@ -34,7 +34,6 @@ python run_audio_classification.py \
--dataset_name superb \ --dataset_name superb \
--dataset_config_name ks \ --dataset_config_name ks \
--output_dir wav2vec2-base-ft-keyword-spotting \ --output_dir wav2vec2-base-ft-keyword-spotting \
--overwrite_output_dir \
--remove_unused_columns False \ --remove_unused_columns False \
--do_train \ --do_train \
--do_eval \ --do_eval \
@ -76,7 +75,6 @@ python run_audio_classification.py \
--audio_column_name audio \ --audio_column_name audio \
--label_column_name language \ --label_column_name language \
--output_dir wav2vec2-base-lang-id \ --output_dir wav2vec2-base-lang-id \
--overwrite_output_dir \
--remove_unused_columns False \ --remove_unused_columns False \
--do_train \ --do_train \
--do_eval \ --do_eval \

View File

@ -47,7 +47,6 @@ from transformers import (
TrainingArguments, TrainingArguments,
set_seed, set_seed,
) )
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -245,21 +244,6 @@ def main():
# Set seed before initializing model. # Set seed before initializing model.
set_seed(training_args.seed) set_seed(training_args.seed)
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to train from scratch."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Initialize our dataset and prepare it for the audio classification task. # Initialize our dataset and prepare it for the audio classification task.
raw_datasets = DatasetDict() raw_datasets = DatasetDict()
raw_datasets["train"] = load_dataset( raw_datasets["train"] = load_dataset(
@ -408,8 +392,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() trainer.save_model()
trainer.log_metrics("train", train_result.metrics) trainer.log_metrics("train", train_result.metrics)

View File

@ -97,6 +97,5 @@ python run_clip.py \
--per_device_train_batch_size="64" \ --per_device_train_batch_size="64" \
--per_device_eval_batch_size="64" \ --per_device_eval_batch_size="64" \
--learning_rate="5e-5" --warmup_steps="0" --weight_decay 0.1 \ --learning_rate="5e-5" --warmup_steps="0" --weight_decay 0.1 \
--overwrite_output_dir \
--push_to_hub --push_to_hub
``` ```

View File

@ -55,7 +55,6 @@ from transformers import (
TrainingArguments, TrainingArguments,
set_seed, set_seed,
) )
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -271,21 +270,6 @@ def main():
) )
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# 3. Detecting last checkpoint and eventually continue from last checkpoint
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# 4. Load dataset # 4. Load dataset
# Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
@ -497,8 +481,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() trainer.save_model()
tokenizer.save_pretrained(training_args.output_dir) tokenizer.save_pretrained(training_args.output_dir)

View File

@ -58,7 +58,6 @@ from transformers import (
TrainingArguments, TrainingArguments,
set_seed, set_seed,
) )
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -225,21 +224,6 @@ def main():
) )
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model. # Set seed before initializing model.
set_seed(training_args.seed) set_seed(training_args.seed)
@ -418,8 +402,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() trainer.save_model()
trainer.log_metrics("train", train_result.metrics) trainer.log_metrics("train", train_result.metrics)

View File

@ -44,7 +44,6 @@ Alternatively, one can decide to further pre-train an already pre-trained (or fi
!python run_mim.py \ !python run_mim.py \
--model_type vit \ --model_type vit \
--output_dir ./outputs/ \ --output_dir ./outputs/ \
--overwrite_output_dir \
--remove_unused_columns False \ --remove_unused_columns False \
--label_names bool_masked_pos \ --label_names bool_masked_pos \
--do_train \ --do_train \
@ -95,7 +94,6 @@ Next, we can run the script by providing the path to this custom configuration (
--config_name_or_path path_to_config \ --config_name_or_path path_to_config \
--model_type swin \ --model_type swin \
--output_dir ./outputs/ \ --output_dir ./outputs/ \
--overwrite_output_dir \
--remove_unused_columns False \ --remove_unused_columns False \
--label_names bool_masked_pos \ --label_names bool_masked_pos \
--do_train \ --do_train \

View File

@ -41,7 +41,6 @@ from transformers import (
ViTMAEConfig, ViTMAEConfig,
ViTMAEForPreTraining, ViTMAEForPreTraining,
) )
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -217,21 +216,6 @@ def main():
) )
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Initialize our dataset. # Initialize our dataset.
ds = load_dataset( ds = load_dataset(
data_args.dataset_name, data_args.dataset_name,
@ -377,8 +361,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() trainer.save_model()
trainer.log_metrics("train", train_result.metrics) trainer.log_metrics("train", train_result.metrics)

View File

@ -44,7 +44,6 @@ from transformers import (
Trainer, Trainer,
TrainingArguments, TrainingArguments,
) )
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -281,21 +280,6 @@ def main():
) )
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Initialize our dataset. # Initialize our dataset.
ds = load_dataset( ds = load_dataset(
data_args.dataset_name, data_args.dataset_name,
@ -456,8 +440,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() trainer.save_model()
trainer.log_metrics("train", train_result.metrics) trainer.log_metrics("train", train_result.metrics)

View File

@ -49,7 +49,6 @@ from transformers import (
) )
from transformers.image_processing_utils import BatchFeature from transformers.image_processing_utils import BatchFeature
from transformers.trainer import EvalPrediction from transformers.trainer import EvalPrediction
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -334,18 +333,6 @@ def find_last_checkpoint(training_args: TrainingArguments) -> Optional[str]:
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif os.path.isdir(training_args.output_dir) and not training_args.overwrite_output_dir:
checkpoint = get_last_checkpoint(training_args.output_dir)
if checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
return checkpoint return checkpoint

View File

@ -63,7 +63,6 @@ from transformers import (
set_seed, set_seed,
) )
from transformers.testing_utils import CaptureLogger from transformers.testing_utils import CaptureLogger
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -317,21 +316,6 @@ def main():
) )
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model. # Set seed before initializing model.
set_seed(training_args.seed) set_seed(training_args.seed)
@ -665,8 +649,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload

View File

@ -66,7 +66,6 @@ from transformers import (
) )
from transformers.integrations import is_deepspeed_zero3_enabled from transformers.integrations import is_deepspeed_zero3_enabled
from transformers.testing_utils import CaptureLogger from transformers.testing_utils import CaptureLogger
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -344,21 +343,6 @@ def main():
) )
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model. # Set seed before initializing model.
set_seed(training_args.seed) set_seed(training_args.seed)
@ -806,8 +790,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload

View File

@ -62,7 +62,6 @@ from transformers import (
is_torch_xla_available, is_torch_xla_available,
set_seed, set_seed,
) )
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -290,21 +289,6 @@ def main():
# Set the verbosity to info of the Transformers logger (on main process only): # Set the verbosity to info of the Transformers logger (on main process only):
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model. # Set seed before initializing model.
set_seed(training_args.seed) set_seed(training_args.seed)
@ -631,8 +615,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload
metrics = train_result.metrics metrics = train_result.metrics

View File

@ -55,7 +55,6 @@ from transformers import (
XLNetLMHeadModel, XLNetLMHeadModel,
set_seed, set_seed,
) )
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -269,21 +268,6 @@ def main():
) )
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model. # Set seed before initializing model.
set_seed(training_args.seed) set_seed(training_args.seed)
@ -528,8 +512,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload
metrics = train_result.metrics metrics = train_result.metrics

View File

@ -52,7 +52,6 @@ from transformers import (
default_data_collator, default_data_collator,
set_seed, set_seed,
) )
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
@ -213,21 +212,6 @@ def main():
) )
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model. # Set seed before initializing model.
set_seed(training_args.seed) set_seed(training_args.seed)
@ -401,8 +385,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload
metrics = train_result.metrics metrics = train_result.metrics

View File

@ -51,7 +51,6 @@ from transformers import (
from transformers.image_processing_utils import BatchFeature from transformers.image_processing_utils import BatchFeature
from transformers.image_transforms import center_to_corners_format from transformers.image_transforms import center_to_corners_format
from transformers.trainer import EvalPrediction from transformers.trainer import EvalPrediction
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -373,23 +372,6 @@ def main():
) )
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif os.path.isdir(training_args.output_dir) and not training_args.overwrite_output_dir:
checkpoint = get_last_checkpoint(training_args.output_dir)
if checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# ------------------------------------------------------------------------------------------------ # ------------------------------------------------------------------------------------------------
# Load dataset, prepare splits # Load dataset, prepare splits
# ------------------------------------------------------------------------------------------------ # ------------------------------------------------------------------------------------------------
@ -510,7 +492,7 @@ def main():
# Training # Training
if training_args.do_train: if training_args.do_train:
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
trainer.save_model() trainer.save_model()
trainer.log_metrics("train", train_result.metrics) trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics) trainer.save_metrics("train", train_result.metrics)

View File

@ -55,7 +55,6 @@ class TorchXLAExamplesTests(TestCasePlus):
./examples/pytorch/text-classification/run_glue.py ./examples/pytorch/text-classification/run_glue.py
--model_name_or_path distilbert/distilbert-base-uncased --model_name_or_path distilbert/distilbert-base-uncased
--output_dir {tmp_dir} --output_dir {tmp_dir}
--overwrite_output_dir
--train_file ./tests/fixtures/tests_samples/MRPC/train.csv --train_file ./tests/fixtures/tests_samples/MRPC/train.csv
--validation_file ./tests/fixtures/tests_samples/MRPC/dev.csv --validation_file ./tests/fixtures/tests_samples/MRPC/dev.csv
--do_train --do_train

View File

@ -43,7 +43,6 @@ from transformers import (
default_data_collator, default_data_collator,
set_seed, set_seed,
) )
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -262,21 +261,6 @@ def main():
) )
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model. # Set seed before initializing model.
set_seed(training_args.seed) set_seed(training_args.seed)
@ -646,8 +630,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload

View File

@ -41,7 +41,6 @@ from transformers import (
default_data_collator, default_data_collator,
set_seed, set_seed,
) )
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -260,21 +259,6 @@ def main():
) )
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model. # Set seed before initializing model.
set_seed(training_args.seed) set_seed(training_args.seed)
@ -672,8 +656,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload

View File

@ -39,7 +39,7 @@ from transformers import (
Seq2SeqTrainingArguments, Seq2SeqTrainingArguments,
set_seed, set_seed,
) )
from transformers.trainer_utils import EvalLoopOutput, EvalPrediction, get_last_checkpoint from transformers.trainer_utils import EvalLoopOutput, EvalPrediction
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -307,21 +307,6 @@ def main():
) )
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model. # Set seed before initializing model.
set_seed(training_args.seed) set_seed(training_args.seed)
@ -669,8 +654,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload

View File

@ -52,7 +52,6 @@ from transformers import (
TrainingArguments, TrainingArguments,
default_data_collator, default_data_collator,
) )
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -221,21 +220,6 @@ def main():
) )
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Load dataset # Load dataset
# In distributed training, the load_dataset function guarantees that only one local process can concurrently # In distributed training, the load_dataset function guarantees that only one local process can concurrently
# download the dataset. # download the dataset.
@ -419,8 +403,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() trainer.save_model()
trainer.log_metrics("train", train_result.metrics) trainer.log_metrics("train", train_result.metrics)

View File

@ -70,7 +70,6 @@ python run_speech_recognition_ctc.py \
--model_name_or_path="facebook/wav2vec2-large-xlsr-53" \ --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
--dataset_config_name="tr" \ --dataset_config_name="tr" \
--output_dir="./wav2vec2-common_voice-tr-demo" \ --output_dir="./wav2vec2-common_voice-tr-demo" \
--overwrite_output_dir \
--num_train_epochs="15" \ --num_train_epochs="15" \
--per_device_train_batch_size="16" \ --per_device_train_batch_size="16" \
--gradient_accumulation_steps="2" \ --gradient_accumulation_steps="2" \
@ -106,7 +105,6 @@ torchrun \
--model_name_or_path="facebook/wav2vec2-large-xlsr-53" \ --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
--dataset_config_name="tr" \ --dataset_config_name="tr" \
--output_dir="./wav2vec2-common_voice-tr-demo-dist" \ --output_dir="./wav2vec2-common_voice-tr-demo-dist" \
--overwrite_output_dir \
--num_train_epochs="15" \ --num_train_epochs="15" \
--per_device_train_batch_size="4" \ --per_device_train_batch_size="4" \
--learning_rate="3e-4" \ --learning_rate="3e-4" \
@ -156,7 +154,6 @@ However, the `--shuffle_buffer_size` argument controls how many examples we can
--train_split_name="train+validation" \ --train_split_name="train+validation" \
--eval_split_name="test" \ --eval_split_name="test" \
--output_dir="wav2vec2-xls-r-common_voice-tr-ft" \ --output_dir="wav2vec2-xls-r-common_voice-tr-ft" \
--overwrite_output_dir \
--max_steps="5000" \ --max_steps="5000" \
--per_device_train_batch_size="8" \ --per_device_train_batch_size="8" \
--gradient_accumulation_steps="2" \ --gradient_accumulation_steps="2" \
@ -390,7 +387,6 @@ python run_speech_recognition_seq2seq.py \
--freeze_feature_encoder="False" \ --freeze_feature_encoder="False" \
--gradient_checkpointing \ --gradient_checkpointing \
--fp16 \ --fp16 \
--overwrite_output_dir \
--do_train \ --do_train \
--do_eval \ --do_eval \
--predict_with_generate \ --predict_with_generate \
@ -431,7 +427,6 @@ torchrun \
--freeze_feature_encoder="False" \ --freeze_feature_encoder="False" \
--gradient_checkpointing \ --gradient_checkpointing \
--fp16 \ --fp16 \
--overwrite_output_dir \
--do_train \ --do_train \
--do_eval \ --do_eval \
--predict_with_generate \ --predict_with_generate \
@ -539,7 +534,6 @@ python run_speech_recognition_seq2seq.py \
--output_dir="./" \ --output_dir="./" \
--preprocessing_num_workers="16" \ --preprocessing_num_workers="16" \
--length_column_name="input_length" \ --length_column_name="input_length" \
--overwrite_output_dir \
--num_train_epochs="5" \ --num_train_epochs="5" \
--per_device_train_batch_size="8" \ --per_device_train_batch_size="8" \
--per_device_eval_batch_size="8" \ --per_device_eval_batch_size="8" \
@ -581,7 +575,6 @@ torchrun \
--output_dir="./" \ --output_dir="./" \
--preprocessing_num_workers="16" \ --preprocessing_num_workers="16" \
--length_column_name="input_length" \ --length_column_name="input_length" \
--overwrite_output_dir \
--num_train_epochs="5" \ --num_train_epochs="5" \
--per_device_train_batch_size="8" \ --per_device_train_batch_size="8" \
--per_device_eval_batch_size="8" \ --per_device_eval_batch_size="8" \

View File

@ -55,7 +55,7 @@ from transformers import (
Wav2Vec2Processor, Wav2Vec2Processor,
set_seed, set_seed,
) )
from transformers.trainer_utils import get_last_checkpoint, is_main_process from transformers.trainer_utils import is_main_process
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -429,21 +429,6 @@ def main():
else: else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses() model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Setup logging # Setup logging
logging.basicConfig( logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@ -558,7 +543,7 @@ def main():
vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json") vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
with training_args.main_process_first(): with training_args.main_process_first():
if training_args.overwrite_output_dir and os.path.isfile(vocab_file): if os.path.isfile(vocab_file):
try: try:
os.remove(vocab_file) os.remove(vocab_file)
except OSError: except OSError:
@ -781,9 +766,7 @@ def main():
# Training # Training
if training_args.do_train: if training_args.do_train:
# use last checkpoint if exist # use last checkpoint if exist
if last_checkpoint is not None: if os.path.isdir(model_args.model_name_or_path):
checkpoint = last_checkpoint
elif os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path checkpoint = model_args.model_name_or_path
else: else:
checkpoint = None checkpoint = None

View File

@ -58,7 +58,7 @@ from transformers import (
set_seed, set_seed,
) )
from transformers.models.wav2vec2.modeling_wav2vec2 import WAV2VEC2_ADAPTER_SAFE_FILE from transformers.models.wav2vec2.modeling_wav2vec2 import WAV2VEC2_ADAPTER_SAFE_FILE
from transformers.trainer_utils import get_last_checkpoint, is_main_process from transformers.trainer_utils import is_main_process
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -406,21 +406,6 @@ def main():
else: else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses() model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Setup logging # Setup logging
logging.basicConfig( logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@ -557,7 +542,7 @@ def main():
vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json") vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
with training_args.main_process_first(): with training_args.main_process_first():
if training_args.overwrite_output_dir and os.path.isfile(vocab_file): if os.path.isfile(vocab_file):
try: try:
os.remove(vocab_file) os.remove(vocab_file)
except OSError: except OSError:
@ -773,9 +758,7 @@ def main():
# Training # Training
if training_args.do_train: if training_args.do_train:
# use last checkpoint if exist # use last checkpoint if exist
if last_checkpoint is not None: if os.path.isdir(model_args.model_name_or_path):
checkpoint = last_checkpoint
elif os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path checkpoint = model_args.model_name_or_path
else: else:
checkpoint = None checkpoint = None

View File

@ -54,7 +54,7 @@ from transformers import (
Seq2SeqTrainingArguments, Seq2SeqTrainingArguments,
set_seed, set_seed,
) )
from transformers.trainer_utils import get_last_checkpoint, is_main_process from transformers.trainer_utils import is_main_process
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -340,21 +340,6 @@ def main():
transformers.utils.logging.set_verbosity_info() transformers.utils.logging.set_verbosity_info()
logger.info("Training/evaluation parameters %s", training_args) logger.info("Training/evaluation parameters %s", training_args)
# 3. Detecting last checkpoint and eventually continue from last checkpoint
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model. # Set seed before initializing model.
set_seed(training_args.seed) set_seed(training_args.seed)
@ -603,8 +588,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the feature extractor too for easy upload trainer.save_model() # Saves the feature extractor too for easy upload

View File

@ -50,7 +50,6 @@ python run_summarization.py \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -72,7 +71,6 @@ python run_summarization.py \
--validation_file path_to_csv_or_jsonlines_file \ --validation_file path_to_csv_or_jsonlines_file \
--source_prefix "summarize: " \ --source_prefix "summarize: " \
--output_dir /tmp/tst-summarization \ --output_dir /tmp/tst-summarization \
--overwrite_output_dir \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--predict_with_generate --predict_with_generate

View File

@ -61,7 +61,6 @@ from transformers import (
Seq2SeqTrainingArguments, Seq2SeqTrainingArguments,
set_seed, set_seed,
) )
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, is_offline_mode from transformers.utils import check_min_version, is_offline_mode
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -374,21 +373,6 @@ def main():
"`--source_prefix 'summarize: ' `" "`--source_prefix 'summarize: ' `"
) )
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model. # Set seed before initializing model.
set_seed(training_args.seed) set_seed(training_args.seed)
@ -698,8 +682,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload

View File

@ -104,7 +104,6 @@ class ExamplesTests(TestCasePlus):
run_glue.py run_glue.py
--model_name_or_path distilbert/distilbert-base-uncased --model_name_or_path distilbert/distilbert-base-uncased
--output_dir {tmp_dir} --output_dir {tmp_dir}
--overwrite_output_dir
--train_file ./tests/fixtures/tests_samples/MRPC/train.csv --train_file ./tests/fixtures/tests_samples/MRPC/train.csv
--validation_file ./tests/fixtures/tests_samples/MRPC/dev.csv --validation_file ./tests/fixtures/tests_samples/MRPC/dev.csv
--do_train --do_train
@ -140,7 +139,6 @@ class ExamplesTests(TestCasePlus):
--per_device_eval_batch_size 5 --per_device_eval_batch_size 5
--num_train_epochs 2 --num_train_epochs 2
--output_dir {tmp_dir} --output_dir {tmp_dir}
--overwrite_output_dir
""".split() """.split()
if backend_device_count(torch_device) > 1: if backend_device_count(torch_device) > 1:
@ -188,7 +186,6 @@ class ExamplesTests(TestCasePlus):
--train_file ./tests/fixtures/sample_text.txt --train_file ./tests/fixtures/sample_text.txt
--validation_file ./tests/fixtures/sample_text.txt --validation_file ./tests/fixtures/sample_text.txt
--output_dir {tmp_dir} --output_dir {tmp_dir}
--overwrite_output_dir
--do_train --do_train
--do_eval --do_eval
--prediction_loss_only --prediction_loss_only
@ -214,7 +211,6 @@ class ExamplesTests(TestCasePlus):
--train_file tests/fixtures/tests_samples/conll/sample.json --train_file tests/fixtures/tests_samples/conll/sample.json
--validation_file tests/fixtures/tests_samples/conll/sample.json --validation_file tests/fixtures/tests_samples/conll/sample.json
--output_dir {tmp_dir} --output_dir {tmp_dir}
--overwrite_output_dir
--do_train --do_train
--do_eval --do_eval
--warmup_steps=2 --warmup_steps=2
@ -243,7 +239,6 @@ class ExamplesTests(TestCasePlus):
--train_file tests/fixtures/tests_samples/SQUAD/sample.json --train_file tests/fixtures/tests_samples/SQUAD/sample.json
--validation_file tests/fixtures/tests_samples/SQUAD/sample.json --validation_file tests/fixtures/tests_samples/SQUAD/sample.json
--output_dir {tmp_dir} --output_dir {tmp_dir}
--overwrite_output_dir
--max_steps=10 --max_steps=10
--warmup_steps=2 --warmup_steps=2
--do_train --do_train
@ -271,7 +266,6 @@ class ExamplesTests(TestCasePlus):
--train_file tests/fixtures/tests_samples/SQUAD/sample.json --train_file tests/fixtures/tests_samples/SQUAD/sample.json
--validation_file tests/fixtures/tests_samples/SQUAD/sample.json --validation_file tests/fixtures/tests_samples/SQUAD/sample.json
--output_dir {tmp_dir} --output_dir {tmp_dir}
--overwrite_output_dir
--max_steps=10 --max_steps=10
--warmup_steps=2 --warmup_steps=2
--do_train --do_train
@ -296,7 +290,6 @@ class ExamplesTests(TestCasePlus):
--train_file tests/fixtures/tests_samples/swag/sample.json --train_file tests/fixtures/tests_samples/swag/sample.json
--validation_file tests/fixtures/tests_samples/swag/sample.json --validation_file tests/fixtures/tests_samples/swag/sample.json
--output_dir {tmp_dir} --output_dir {tmp_dir}
--overwrite_output_dir
--max_steps=20 --max_steps=20
--warmup_steps=2 --warmup_steps=2
--do_train --do_train
@ -334,7 +327,6 @@ class ExamplesTests(TestCasePlus):
--train_file tests/fixtures/tests_samples/xsum/sample.json --train_file tests/fixtures/tests_samples/xsum/sample.json
--validation_file tests/fixtures/tests_samples/xsum/sample.json --validation_file tests/fixtures/tests_samples/xsum/sample.json
--output_dir {tmp_dir} --output_dir {tmp_dir}
--overwrite_output_dir
--max_steps=50 --max_steps=50
--warmup_steps=8 --warmup_steps=8
--do_train --do_train
@ -364,7 +356,6 @@ class ExamplesTests(TestCasePlus):
--train_file tests/fixtures/tests_samples/wmt16/sample.json --train_file tests/fixtures/tests_samples/wmt16/sample.json
--validation_file tests/fixtures/tests_samples/wmt16/sample.json --validation_file tests/fixtures/tests_samples/wmt16/sample.json
--output_dir {tmp_dir} --output_dir {tmp_dir}
--overwrite_output_dir
--max_steps=50 --max_steps=50
--warmup_steps=8 --warmup_steps=8
--do_train --do_train
@ -396,7 +387,6 @@ class ExamplesTests(TestCasePlus):
--per_device_train_batch_size 2 --per_device_train_batch_size 2
--per_device_eval_batch_size 1 --per_device_eval_batch_size 1
--remove_unused_columns False --remove_unused_columns False
--overwrite_output_dir True
--dataloader_num_workers 16 --dataloader_num_workers 16
--metric_for_best_model accuracy --metric_for_best_model accuracy
--max_steps 10 --max_steps 10
@ -429,7 +419,6 @@ class ExamplesTests(TestCasePlus):
--per_device_train_batch_size 2 --per_device_train_batch_size 2
--per_device_eval_batch_size 1 --per_device_eval_batch_size 1
--remove_unused_columns False --remove_unused_columns False
--overwrite_output_dir True
--preprocessing_num_workers 16 --preprocessing_num_workers 16
--max_steps 10 --max_steps 10
--seed 42 --seed 42
@ -459,7 +448,6 @@ class ExamplesTests(TestCasePlus):
--per_device_train_batch_size 2 --per_device_train_batch_size 2
--per_device_eval_batch_size 1 --per_device_eval_batch_size 1
--remove_unused_columns False --remove_unused_columns False
--overwrite_output_dir True
--preprocessing_num_workers 16 --preprocessing_num_workers 16
--max_steps 10 --max_steps 10
--target_language tur --target_language tur
@ -491,7 +479,6 @@ class ExamplesTests(TestCasePlus):
--per_device_train_batch_size 2 --per_device_train_batch_size 2
--per_device_eval_batch_size 4 --per_device_eval_batch_size 4
--remove_unused_columns False --remove_unused_columns False
--overwrite_output_dir True
--preprocessing_num_workers 16 --preprocessing_num_workers 16
--max_steps 10 --max_steps 10
--seed 42 --seed 42
@ -523,7 +510,6 @@ class ExamplesTests(TestCasePlus):
--per_device_train_batch_size 2 --per_device_train_batch_size 2
--per_device_eval_batch_size 1 --per_device_eval_batch_size 1
--remove_unused_columns False --remove_unused_columns False
--overwrite_output_dir True
--num_train_epochs 10 --num_train_epochs 10
--max_steps 50 --max_steps 50
--seed 42 --seed 42
@ -572,7 +558,6 @@ class ExamplesTests(TestCasePlus):
--per_device_train_batch_size 2 --per_device_train_batch_size 2
--per_device_eval_batch_size 1 --per_device_eval_batch_size 1
--remove_unused_columns False --remove_unused_columns False
--overwrite_output_dir True
--dataloader_num_workers 16 --dataloader_num_workers 16
--metric_for_best_model accuracy --metric_for_best_model accuracy
--max_steps 10 --max_steps 10
@ -597,7 +582,6 @@ class ExamplesTests(TestCasePlus):
--do_train --do_train
--do_eval --do_eval
--remove_unused_columns False --remove_unused_columns False
--overwrite_output_dir True
--max_steps 10 --max_steps 10
--learning_rate=2e-4 --learning_rate=2e-4
--per_device_train_batch_size=2 --per_device_train_batch_size=2
@ -624,7 +608,6 @@ class ExamplesTests(TestCasePlus):
--do_train --do_train
--do_eval --do_eval
--remove_unused_columns False --remove_unused_columns False
--overwrite_output_dir True
--eval_do_concat_batches False --eval_do_concat_batches False
--max_steps 10 --max_steps 10
--learning_rate=1e-6 --learning_rate=1e-6

View File

@ -55,7 +55,6 @@ from transformers import (
default_data_collator, default_data_collator,
set_seed, set_seed,
) )
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -321,21 +320,6 @@ def main():
) )
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model. # Set seed before initializing model.
set_seed(training_args.seed) set_seed(training_args.seed)
@ -693,8 +677,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
metrics = train_result.metrics metrics = train_result.metrics
max_train_samples = ( max_train_samples = (

View File

@ -57,7 +57,6 @@ from transformers import (
default_data_collator, default_data_collator,
set_seed, set_seed,
) )
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -266,21 +265,6 @@ def main():
) )
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model. # Set seed before initializing model.
set_seed(training_args.seed) set_seed(training_args.seed)
@ -566,8 +550,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
metrics = train_result.metrics metrics = train_result.metrics
max_train_samples = ( max_train_samples = (

View File

@ -56,7 +56,6 @@ from transformers import (
default_data_collator, default_data_collator,
set_seed, set_seed,
) )
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -224,21 +223,6 @@ def main():
) )
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model. # Set seed before initializing model.
set_seed(training_args.seed) set_seed(training_args.seed)
@ -412,8 +396,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
metrics = train_result.metrics metrics = train_result.metrics
max_train_samples = ( max_train_samples = (

View File

@ -54,7 +54,6 @@ from transformers import (
TrainingArguments, TrainingArguments,
set_seed, set_seed,
) )
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -263,21 +262,6 @@ def main():
) )
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model. # Set seed before initializing model.
set_seed(training_args.seed) set_seed(training_args.seed)
@ -587,8 +571,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
metrics = train_result.metrics metrics = train_result.metrics
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload

View File

@ -53,7 +53,6 @@ python run_translation.py \
--output_dir /tmp/tst-translation \ --output_dir /tmp/tst-translation \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -74,7 +73,6 @@ python run_translation.py \
--output_dir /tmp/tst-translation \ --output_dir /tmp/tst-translation \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -96,7 +94,6 @@ python run_translation.py \
--output_dir /tmp/tst-translation \ --output_dir /tmp/tst-translation \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -118,7 +115,6 @@ python run_translation.py \
--output_dir /tmp/tst-translation \ --output_dir /tmp/tst-translation \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```
@ -144,7 +140,6 @@ python run_translation.py \
--output_dir /tmp/tst-translation \ --output_dir /tmp/tst-translation \
--per_device_train_batch_size=4 \ --per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \ --per_device_eval_batch_size=4 \
--overwrite_output_dir \
--predict_with_generate --predict_with_generate
``` ```

View File

@ -60,7 +60,6 @@ from transformers import (
default_data_collator, default_data_collator,
set_seed, set_seed,
) )
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version from transformers.utils import check_min_version
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
@ -322,21 +321,6 @@ def main():
"`--source_prefix 'translate English to German: ' `" "`--source_prefix 'translate English to German: ' `"
) )
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model. # Set seed before initializing model.
set_seed(training_args.seed) set_seed(training_args.seed)
@ -617,8 +601,6 @@ def main():
checkpoint = None checkpoint = None
if training_args.resume_from_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload

View File

@ -1644,7 +1644,7 @@ class NeptuneCallback(TrainerCallback):
def on_init_end(self, args, state, control, **kwargs): def on_init_end(self, args, state, control, **kwargs):
self._volatile_checkpoints_dir = None self._volatile_checkpoints_dir = None
if self._log_checkpoints and (args.overwrite_output_dir or args.save_total_limit is not None): if self._log_checkpoints and args.save_total_limit is not None:
self._volatile_checkpoints_dir = tempfile.TemporaryDirectory().name self._volatile_checkpoints_dir = tempfile.TemporaryDirectory().name
if self._log_checkpoints == "best" and not args.load_best_model_at_end: if self._log_checkpoints == "best" and not args.load_best_model_at_end:

View File

@ -21,11 +21,11 @@
from typing import Optional from typing import Optional
from ...configuration_utils import PretrainedConfig, layer_type_validation from ...configuration_utils import PreTrainedConfig, layer_type_validation
from ...modeling_rope_utils import rope_config_validation from ...modeling_rope_utils import rope_config_validation
class CwmConfig(PretrainedConfig): class CwmConfig(PreTrainedConfig):
""" """
Configuration for Code World Model (CWM). Configuration for Code World Model (CWM).
This is an inherited Llama3-compatible configuration with layer-interleaved This is an inherited Llama3-compatible configuration with layer-interleaved
@ -136,13 +136,6 @@ class CwmConfig(PretrainedConfig):
layer_types: Optional[list[str]] = None, # ["full_attention"|"sliding_attention"] per layer layer_types: Optional[list[str]] = None, # ["full_attention"|"sliding_attention"] per layer
**kwargs, **kwargs,
): ):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
if rope_scaling is None: if rope_scaling is None:
rope_scaling = { rope_scaling = {
"factor": 16.0, "factor": 16.0,
@ -189,6 +182,14 @@ class CwmConfig(PretrainedConfig):
self.rope_scaling["rope_type"] = self.rope_scaling["type"] self.rope_scaling["rope_type"] = self.rope_scaling["type"]
rope_config_validation(self) rope_config_validation(self)
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
self.sliding_window = int(sliding_window) if sliding_window else None self.sliding_window = int(sliding_window) if sliding_window else None
self.layer_types = list(layer_types) self.layer_types = list(layer_types)

View File

@ -19,7 +19,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from typing import Callable, Optional, Union from collections.abc import Callable
from typing import Optional, Union
import torch import torch
from torch import nn from torch import nn
@ -36,7 +37,6 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import check_model_inputs from ...utils.generic import check_model_inputs
from .configuration_cwm import CwmConfig from .configuration_cwm import CwmConfig
@ -131,7 +131,6 @@ class CwmAttention(nn.Module):
self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False) self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None
@deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
def forward( def forward(
self, self,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
@ -225,7 +224,6 @@ class CwmDecoderLayer(GradientCheckpointingLayer):
self.post_attention_layernorm = CwmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = CwmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.attention_type = config.layer_types[layer_idx] self.attention_type = config.layer_types[layer_idx]
@deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
def forward( def forward(
self, self,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
@ -339,7 +337,7 @@ class CwmModel(CwmPreTrainedModel):
# Initialize weights and apply final processing # Initialize weights and apply final processing
self.post_init() self.post_init()
@check_model_inputs @check_model_inputs()
@auto_docstring @auto_docstring
def forward( def forward(
self, self,

View File

@ -208,9 +208,6 @@ class TrainingArguments:
Parameters: Parameters:
output_dir (`str`, *optional*, defaults to `"trainer_output"`): output_dir (`str`, *optional*, defaults to `"trainer_output"`):
The output directory where the model predictions and checkpoints will be written. The output directory where the model predictions and checkpoints will be written.
overwrite_output_dir (`bool`, *optional*, defaults to `False`):
If `True`, overwrite the content of the output directory. Use this to continue training if `output_dir`
points to a checkpoint directory.
do_train (`bool`, *optional*, defaults to `False`): do_train (`bool`, *optional*, defaults to `False`):
Whether to run training or not. This argument is not directly used by [`Trainer`], it's intended to be used Whether to run training or not. This argument is not directly used by [`Trainer`], it's intended to be used
by your training/evaluation scripts instead. See the [example by your training/evaluation scripts instead. See the [example
@ -787,15 +784,6 @@ class TrainingArguments:
"help": "The output directory where the model predictions and checkpoints will be written. Defaults to 'trainer_output' if not provided." "help": "The output directory where the model predictions and checkpoints will be written. Defaults to 'trainer_output' if not provided."
}, },
) )
overwrite_output_dir: bool = field(
default=False,
metadata={
"help": (
"Overwrite the content of the output directory. "
"Use this to continue training if output_dir points to a checkpoint directory."
)
},
)
do_train: bool = field(default=False, metadata={"help": "Whether to run training."}) do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."}) do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})

View File

@ -219,21 +219,6 @@ def main():
else: else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses() model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Setup logging # Setup logging
logging.basicConfig( logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@ -452,16 +437,12 @@ def main():
# Training # Training
if training_args.do_train: if training_args.do_train:
{%- if cookiecutter.can_train_from_scratch == "False" %} {%- if cookiecutter.can_train_from_scratch == "False" %}
if last_checkpoint is not None: if os.path.isdir(model_args.model_name_or_path):
checkpoint = last_checkpoint
elif os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path checkpoint = model_args.model_name_or_path
else: else:
checkpoint = None checkpoint = None
{%- elif cookiecutter.can_train_from_scratch == "True" %} {%- elif cookiecutter.can_train_from_scratch == "True" %}
if last_checkpoint is not None: if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path):
checkpoint = last_checkpoint
elif model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path checkpoint = model_args.model_name_or_path
else: else:
checkpoint = None checkpoint = None

View File

@ -1303,7 +1303,6 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
--train_file {data_dir}/train.json --train_file {data_dir}/train.json
--validation_file {data_dir}/val.json --validation_file {data_dir}/val.json
--output_dir {output_dir} --output_dir {output_dir}
--overwrite_output_dir
--max_source_length {max_len} --max_source_length {max_len}
--max_target_length {max_len} --max_target_length {max_len}
--val_max_target_length {max_len} --val_max_target_length {max_len}
@ -1373,7 +1372,6 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
--train_file {data_dir}/sample_text.txt --train_file {data_dir}/sample_text.txt
--validation_file {data_dir}/sample_text.txt --validation_file {data_dir}/sample_text.txt
--output_dir {output_dir} --output_dir {output_dir}
--overwrite_output_dir
--do_train --do_train
--do_eval --do_eval
--max_train_samples 16 --max_train_samples 16
@ -1410,7 +1408,6 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
--train_file {data_dir}/sample_text.txt --train_file {data_dir}/sample_text.txt
--validation_file {data_dir}/sample_text.txt --validation_file {data_dir}/sample_text.txt
--output_dir {output_dir} --output_dir {output_dir}
--overwrite_output_dir
--do_train --do_train
--max_train_samples 4 --max_train_samples 4
--per_device_train_batch_size 2 --per_device_train_batch_size 2

View File

@ -161,7 +161,6 @@ def make_task_cmds():
--num_train_epochs 1 --num_train_epochs 1
--fp16 --fp16
--report_to none --report_to none
--overwrite_output_dir
""".split() """.split()
# try to cover as many models as possible once (it's enough to run on one task per model) # try to cover as many models as possible once (it's enough to run on one task per model)

View File

@ -267,7 +267,6 @@ class TestTrainerExt(TestCasePlus):
--validation_file {data_dir}/val.json --validation_file {data_dir}/val.json
--test_file {data_dir}/test.json --test_file {data_dir}/test.json
--output_dir {output_dir} --output_dir {output_dir}
--overwrite_output_dir
--max_train_samples 8 --max_train_samples 8
--max_source_length {max_len} --max_source_length {max_len}
--max_target_length {max_len} --max_target_length {max_len}

View File

@ -447,7 +447,6 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
--model_name_or_path google-bert/bert-base-cased --model_name_or_path google-bert/bert-base-cased
--task_name mrpc --task_name mrpc
--output_dir {output_dir} --output_dir {output_dir}
--overwrite_output_dir
--do_train --do_train
--max_seq_length 128 --max_seq_length 128
--per_device_train_batch_size 16 --per_device_train_batch_size 16

View File

@ -21,7 +21,6 @@ class SageMakerTestEnvironment:
"do_eval": True, "do_eval": True,
"do_predict": True, "do_predict": True,
"output_dir": "/opt/ml/model", "output_dir": "/opt/ml/model",
"overwrite_output_dir": True,
"max_steps": 500, "max_steps": 500,
"save_steps": 5500, "save_steps": 5500,
} }

View File

@ -38,7 +38,6 @@ from transformers import (
set_seed, set_seed,
) )
from transformers.trainer import Trainer from transformers.trainer import Trainer
from transformers.trainer_utils import get_last_checkpoint
from transformers.training_args import TrainingArguments from transformers.training_args import TrainingArguments
from transformers.utils import check_min_version from transformers.utils import check_min_version
@ -198,21 +197,6 @@ def main():
else: else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses() model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Setup logging # Setup logging
logging.basicConfig( logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@ -459,9 +443,7 @@ def main():
# Training # Training
if training_args.do_train: if training_args.do_train:
checkpoint = None checkpoint = None
if last_checkpoint is not None: if os.path.isdir(model_args.model_name_or_path):
checkpoint = last_checkpoint
elif os.path.isdir(model_args.model_name_or_path):
# Check the config from that potential checkpoint has the right number of labels before using it as a # Check the config from that potential checkpoint has the right number of labels before using it as a
# checkpoint. # checkpoint.
if AutoConfig.from_pretrained(model_args.model_name_or_path).num_labels == num_labels: if AutoConfig.from_pretrained(model_args.model_name_or_path).num_labels == num_labels:

View File

@ -4455,7 +4455,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
"1", "1",
"--output_dir", "--output_dir",
tmpdir, tmpdir,
"--overwrite_output_dir",
"--do_train", "--do_train",
"--max_train_samples", "--max_train_samples",
"64", "64",