Mirror of https://github.com/huggingface/trl.git, synced 2025-10-20 18:43:52 +08:00
🗳️ Remove logging_steps parameter for simpler setup (#3612)
Committed by GitHub
parent d6a969ff7d
commit ed9b78a5f7
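The change is mechanical: every documentation example drops the explicit logging_steps argument, so logging falls back to the default interval inherited from transformers.TrainingArguments (500 steps at the time of writing, unless overridden). Below is a minimal sketch of the resulting usage, using only the TRL API already shown in the diff (the CPO variant); the commented-out line is the pre-change form, which remains valid if frequent logging is wanted:

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import CPOConfig, CPOTrainer

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

# Updated docs: no logging_steps, so the Trainer default logging interval applies.
training_args = CPOConfig(output_dir="Qwen2-0.5B-CPO")
# Pre-change docs (still works if you want frequent logging):
# training_args = CPOConfig(output_dir="Qwen2-0.5B-CPO", logging_steps=10)

trainer = CPOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
trainer.train()
```

The CLI examples change the same way: the `--logging_steps` flag is removed from the documented commands, but it can still be passed explicitly on the command line when finer-grained logging is desired.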
@@ -31,7 +31,7 @@ model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
 train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
 
-training_args = CPOConfig(output_dir="Qwen2-0.5B-CPO", logging_steps=10)
+training_args = CPOConfig(output_dir="Qwen2-0.5B-CPO")
 trainer = CPOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
 trainer.train()
 ```

@@ -57,7 +57,6 @@ accelerate launch examples/scripts/cpo.py \
 --model_name_or_path Qwen/Qwen2-0.5B-Instruct \
 --dataset_name trl-lib/ultrafeedback_binarized \
 --num_train_epochs 1 \
---logging_steps 25 \
 --output_dir Qwen2-0.5B-CPO
 ```

@@ -46,7 +46,7 @@ model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
 train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
 
-training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", logging_steps=10)
+training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO")
 trainer = DPOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
 trainer.train()
 ```

@@ -113,7 +113,6 @@ accelerate launch trl/scripts/dpo.py \
 --model_name_or_path Qwen/Qwen2-0.5B-Instruct \
 --dataset_name trl-lib/ultrafeedback_binarized \
 --num_train_epochs 1 \
---logging_steps 25 \
 --output_dir Qwen2-0.5B-DPO
 ```

@@ -195,8 +194,8 @@ First install `unsloth` according to the [official documentation](https://github
 + model = FastLanguageModel.get_peft_model(model)
 train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
 
-- training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", logging_steps=10)
-+ training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", logging_steps=10, bf16=True)
+- training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO")
++ training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", bf16=True)
 trainer = DPOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
 trainer.train()

@@ -36,7 +36,7 @@ dataset = load_dataset("trl-lib/tldr", split="train")
 def reward_len(completions, **kwargs):
     return [-abs(20 - len(completion)) for completion in completions]
 
-training_args = GRPOConfig(output_dir="Qwen2-0.5B-GRPO", logging_steps=10)
+training_args = GRPOConfig(output_dir="Qwen2-0.5B-GRPO")
 trainer = GRPOTrainer(
     model="Qwen/Qwen2-0.5B-Instruct",
     reward_funcs=reward_len,

@@ -294,7 +294,6 @@ def main():
         per_device_train_batch_size=4,
         bf16=True,
         gradient_checkpointing=True,
-        logging_steps=10,
         use_vllm=True,
         vllm_server_host=args.vllm_server_host.replace("ip-", "").replace("-", "."), # from ip-X-X-X-X to X.X.X.X
     )

@@ -85,7 +85,6 @@ config = IterativeSFTConfig(
     per_device_train_batch_size=4,
     gradient_accumulation_steps=4,
     max_steps=1000,
-    logging_steps=10,
     save_steps=100,
     optim="adamw_torch",
     report_to="wandb",

@@ -38,7 +38,7 @@ model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
 train_dataset = load_dataset("trl-lib/kto-mix-14k", split="train")
 
-training_args = KTOConfig(output_dir="Qwen2-0.5B-KTO", logging_steps=10)
+training_args = KTOConfig(output_dir="Qwen2-0.5B-KTO")
 trainer = KTOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
 trainer.train()
 ```

@@ -89,7 +89,6 @@ accelerate launch trl/scripts/kto.py \
 --model_name_or_path Qwen/Qwen2-0.5B-Instruct \
 --dataset_name trl-lib/kto-mix-14k \
 --num_train_epochs 1 \
---logging_steps 25 \
 --output_dir Qwen2-0.5B-KTO
 ```

@@ -36,7 +36,7 @@ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
 judge = PairRMJudge()
 train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")
 
-training_args = NashMDConfig(output_dir="Qwen2-0.5B-NashMD", logging_steps=10)
+training_args = NashMDConfig(output_dir="Qwen2-0.5B-NashMD")
 trainer = NashMDTrainer(
     model=model, judge=judge, args=training_args, processing_class=tokenizer, train_dataset=train_dataset
 )

@@ -125,7 +125,6 @@ python examples/scripts/nash_md.py \
 --judge pair_rm \
 --dataset_name trl-lib/ultrafeedback-prompt \
 --learning_rate 5.0e-7 \
---logging_steps 25 \
 --output_dir Qwen2.5-0.5B-NashMD-PairRM \
 --warmup_ratio 0.1 \
 --push_to_hub

@@ -36,7 +36,7 @@ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
 judge = PairRMJudge()
 train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")
 
-training_args = OnlineDPOConfig(output_dir="Qwen2-0.5B-OnlineDPO", logging_steps=10)
+training_args = OnlineDPOConfig(output_dir="Qwen2-0.5B-OnlineDPO")
 trainer = OnlineDPOTrainer(
     model=model, judge=judge, args=training_args, processing_class=tokenizer, train_dataset=train_dataset
 )

@@ -125,7 +125,6 @@ python examples/scripts/dpo_online.py \
 --judge pair_rm \
 --dataset_name trl-lib/ultrafeedback-prompt \
 --learning_rate 5.0e-7 \
---logging_steps 25 \
 --output_dir Qwen2.5-0.5B-Online-DPO-PairRM \
 --warmup_ratio 0.1 \
 --push_to_hub

@@ -171,7 +170,6 @@ accelerate launch --config_file examples/accelerate_configs/multi_gpu.yaml \
 --max_new_tokens 53 \
 --warmup_ratio 0.1 \
 --missing_eos_penalty 1.0 \
---logging_steps 20 \
 --save_steps 0.1 \
 --push_to_hub

@@ -191,7 +189,6 @@ accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml
 --warmup_ratio 0.1 \
 --missing_eos_penalty 1.0 \
 --bf16 \
---logging_steps 20 \
 --save_steps 0.1 \
 --push_to_hub

@@ -212,7 +209,6 @@ accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml
 --missing_eos_penalty 1.0 \
 --bf16 \
 --gradient_checkpointing \
---logging_steps 20 \
 --save_steps 0.1 \
 --push_to_hub
 ```

@@ -41,7 +41,7 @@ model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
 train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
 
-training_args = ORPOConfig(output_dir="Qwen2-0.5B-ORPO", logging_steps=10)
+training_args = ORPOConfig(output_dir="Qwen2-0.5B-ORPO")
 trainer = ORPOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
 trainer.train()
 ```

@@ -94,7 +94,6 @@ accelerate launch examples/scripts/orpo.py \
 --model_name_or_path Qwen/Qwen2-0.5B-Instruct \
 --dataset_name trl-lib/ultrafeedback_binarized \
 --num_train_epochs 1 \
---logging_steps 25 \
 --output_dir Qwen2-0.5B-ORPO
 ```

@@ -42,7 +42,7 @@ model = AutoModelForTokenClassification.from_pretrained("Qwen/Qwen2-0.5B", num_l
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")
 train_dataset = load_dataset("trl-lib/math_shepherd", split="train[:10%]")
 
-training_args = PRMConfig(output_dir="Qwen2-0.5B-Reward-Math-Sheperd", logging_steps=10)
+training_args = PRMConfig(output_dir="Qwen2-0.5B-Reward-Math-Sheperd")
 trainer = PRMTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
 trainer.train()
 ```

@@ -112,7 +112,6 @@ accelerate launch examples/scripts/prm.py \
 --model_name_or_path Qwen/Qwen2-0.5B \
 --dataset_name trl-lib/math_shepherd \
 --num_train_epochs 1 \
---logging_steps 25 \
 --output_dir Qwen2-0.5B-Reward-Math-Sheperd
 ```

@@ -253,7 +253,6 @@ training_args = SFTConfig(
     gradient_accumulation_steps=4, # Number of steps before performing a backward/update pass to accumulate gradients. multi-image -> gradient_accumulation_steps=1
     gradient_checkpointing=True, # Enable gradient checkpointing to reduce memory usage during training.
     optim="adamw_torch_fused", # Use the fused AdamW optimizer for better performance.
-    logging_steps=10, # Frequency of logging training progress (log every 10 steps).
     save_strategy="epoch", # Save checkpoints at the end of each epoch.
     learning_rate=2e-05, # Learning rate for training.
     bf16=True, # Enable bfloat16 precision for training to save memory and speed up computations.

@@ -39,7 +39,6 @@ training_args = GRPOConfig(
     use_vllm=True,
     bf16=True,
     gradient_checkpointing=True,
-    logging_steps=10,
 )
 
 trainer = GRPOTrainer(

@@ -35,7 +35,7 @@ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
 judge = PairRMJudge()
 train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")
 
-training_args = XPOConfig(output_dir="Qwen2-0.5B-XPO", logging_steps=10)
+training_args = XPOConfig(output_dir="Qwen2-0.5B-XPO")
 trainer = XPOTrainer(
     model=model, judge=judge, args=training_args, processing_class=tokenizer, train_dataset=train_dataset
 )

@@ -124,7 +124,6 @@ python examples/scripts/xpo.py \
 --judge pair_rm \
 --dataset_name trl-lib/ultrafeedback-prompt \
 --learning_rate 5.0e-7 \
---logging_steps 25 \
 --output_dir Qwen2.5-0.5B-XPO-PairRM \
 --warmup_ratio 0.1 \
 --push_to_hub

@@ -75,7 +75,6 @@ if __name__ == "__main__":
         push_to_hub=True,
         hub_model_id=config.hub_model_id,
         output_dir=config.output_dir,
-        logging_steps=500,
         save_steps=1000,
         save_total_limit=2,
     )

@@ -148,7 +148,6 @@ training_args = TrainingArguments(
     label_names=[],
     bf16=script_args.bf16,
     logging_strategy="steps",
-    logging_steps=10,
     optim=script_args.optim,
     lr_scheduler_type=script_args.lr_scheduler_type,
     seed=script_args.seed,

@@ -22,7 +22,6 @@ There were two main steps to the DPO training process:
 accelerate launch examples/research_projects/stack_llama_2/scripts/sft_llama2.py \
 --output_dir="./sft" \
 --max_steps=500 \
---logging_steps=10 \
 --save_steps=10 \
 --per_device_train_batch_size=4 \
 --per_device_eval_batch_size=1 \

@@ -26,7 +26,6 @@ python examples/scripts/bco.py \
 --learning_rate 1e-6 \
 --gradient_checkpointing \
 --gradient_accumulation_steps 1 \
---logging_steps 0.01 \
 --eval_steps 0.2 \
 --save_strategy no \
 --output_dir=bco-aligned-model \

@@ -48,7 +47,6 @@ python examples/scripts/bco.py \
 --learning_rate 1e-6 \
 --gradient_checkpointing \
 --gradient_accumulation_steps 1 \
---logging_steps 0.01 \
 --eval_steps 0.2 \
 --save_strategy no \
 --output_dir=bco-aligned-model-lora \

@@ -24,7 +24,6 @@ python examples/scripts/cpo.py \
 --max_steps 1000 \
 --learning_rate 8e-6 \
 --gradient_accumulation_steps 1 \
---logging_steps 10 \
 --eval_steps 500 \
 --output_dir="gpt2-aligned-cpo" \
 --warmup_steps 150 \

@@ -41,7 +40,6 @@ python examples/scripts/cpo.py \
 --max_steps 1000 \
 --learning_rate 8e-5 \
 --gradient_accumulation_steps 1 \
---logging_steps 10 \
 --eval_steps 500 \
 --output_dir="gpt2-lora-aligned-cpo" \
 --optim rmsprop \

@@ -22,7 +22,6 @@ python examples/scripts/gkd.py \
 --per_device_train_batch_size 4 \
 --gradient_accumulation_steps 8 \
 --output_dir gkd-model \
---logging_steps 10 \
 --num_train_epochs 1 \
 --push_to_hub \
 --gradient_checkpointing

@@ -36,7 +35,6 @@ python examples/scripts/gkd.py \
 --per_device_train_batch_size 4 \
 --gradient_accumulation_steps 8 \
 --output_dir gkd-model \
---logging_steps 10 \
 --num_train_epochs 1 \
 --push_to_hub \
 --gradient_checkpointing \

@@ -24,7 +24,6 @@ python trl/scripts/kto.py \
 --learning_rate 5e-7 \
 --lr_scheduler_type=cosine \
 --gradient_accumulation_steps 1 \
---logging_steps 10 \
 --eval_steps 500 \
 --output_dir=kto-aligned-model \
 --warmup_ratio 0.1 \

@@ -41,7 +40,6 @@ python trl/scripts/kto.py \
 --learning_rate 5e-7 \
 --lr_scheduler_type=cosine \
 --gradient_accumulation_steps 1 \
---logging_steps 10 \
 --eval_steps 500 \
 --output_dir=kto-aligned-model-lora \
 --warmup_ratio 0.1 \

@@ -24,7 +24,6 @@ python examples/scripts/orpo.py \
 --max_steps 1000 \
 --learning_rate 8e-6 \
 --gradient_accumulation_steps 1 \
---logging_steps 10 \
 --eval_steps 500 \
 --output_dir="gpt2-aligned-orpo" \
 --warmup_steps 150 \

@@ -41,7 +40,6 @@ python examples/scripts/orpo.py \
 --max_steps 1000 \
 --learning_rate 8e-5 \
 --gradient_accumulation_steps 1 \
---logging_steps 10 \
 --eval_steps 500 \
 --output_dir="gpt2-lora-aligned-orpo" \
 --optim rmsprop \

@@ -22,7 +22,6 @@ python examples/scripts/prm.py \
 --num_train_epochs 1 \
 --gradient_checkpointing True \
 --learning_rate 1.0e-5 \
---logging_steps 25 \
 --eval_strategy steps \
 --eval_steps 50

@@ -35,7 +34,6 @@ python examples/scripts/prm.py \
 --num_train_epochs 1 \
 --gradient_checkpointing True \
 --learning_rate 1.0e-4 \
---logging_steps 25 \
 --eval_strategy steps \
 --eval_steps 50
 --use_peft \

@@ -22,7 +22,6 @@ python examples/scripts/reward_modeling.py \
 --num_train_epochs 1 \
 --gradient_checkpointing True \
 --learning_rate 1.0e-5 \
---logging_steps 25 \
 --eval_strategy steps \
 --eval_steps 50 \
 --max_length 2048

@@ -36,7 +35,6 @@ python examples/scripts/reward_modeling.py \
 --num_train_epochs 1 \
 --gradient_checkpointing True \
 --learning_rate 1.0e-4 \
---logging_steps 25 \
 --eval_strategy steps \
 --eval_steps 50 \
 --max_length 2048 \

@@ -36,7 +36,6 @@ def main():
     # Train model
     training_args = SFTConfig(
         output_dir=f"{model_id}-codeforces-SFT",
-        logging_steps=10,
         bf16=True,
         use_liger_kernel=True,
         gradient_checkpointing=True,

@@ -27,7 +27,6 @@ accelerate launch \
 --gradient_accumulation_steps=4 \
 --num_train_epochs=4 \
 --optim="adamw_torch_fused" \
---logging_steps=1 \
 --log_level="debug" \
 --log_level_replica="debug" \
 --save_strategy="steps" \

@@ -23,7 +23,6 @@ python trl/scripts/dpo.py \
 --per_device_train_batch_size 2 \
 --gradient_accumulation_steps 8 \
 --gradient_checkpointing \
---logging_steps 25 \
 --eval_strategy steps \
 --eval_steps 50 \
 --output_dir Qwen2-0.5B-DPO \

@@ -40,7 +39,6 @@ python trl/scripts/dpo.py \
 --per_device_train_batch_size 2 \
 --gradient_accumulation_steps 8 \
 --gradient_checkpointing \
---logging_steps 25 \
 --eval_strategy steps \
 --eval_steps 50 \
 --output_dir Qwen2-0.5B-DPO \

@@ -24,7 +24,6 @@ python trl/scripts/kto.py \
 --learning_rate 5e-7 \
 --lr_scheduler_type=cosine \
 --gradient_accumulation_steps 1 \
---logging_steps 10 \
 --eval_steps 500 \
 --output_dir=kto-aligned-model \
 --warmup_ratio 0.1 \

@@ -41,7 +40,6 @@ python trl/scripts/kto.py \
 --learning_rate 5e-7 \
 --lr_scheduler_type=cosine \
 --gradient_accumulation_steps 1 \
---logging_steps 10 \
 --eval_steps 500 \
 --output_dir=kto-aligned-model-lora \
 --warmup_ratio 0.1 \

@@ -24,7 +24,6 @@ python trl/scripts/sft.py \
 --gradient_accumulation_steps 8 \
 --gradient_checkpointing \
 --eos_token '<|im_end|>' \
---logging_steps 25 \
 --eval_strategy steps \
 --eval_steps 100 \
 --output_dir Qwen2-0.5B-SFT \

@@ -41,7 +40,6 @@ python trl/scripts/sft.py \
 --gradient_accumulation_steps 8 \
 --gradient_checkpointing \
 --eos_token '<|im_end|>' \
---logging_steps 25 \
 --eval_strategy steps \
 --eval_steps 100 \
 --use_peft \