Compare commits

...

8 Commits

Author SHA1 Message Date
728a73caa4 Add completion entropy metrics 2025-04-29 14:06:03 +00:00
2bf48478e8 📋 Allow calling trl cli in sft mode with config file (#3380)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-04-28 14:23:42 -07:00
a8cfca6d01 ⚰️ Remove deprecated (#3364) 2025-04-26 11:11:35 -07:00
1bca49515e Better guards for DeepSpeed imports (#3351) 2025-04-26 10:18:11 +02:00
39e96394a9 🎭 Fix train and eval mode checking in GRPOTrainer and SFTTrainer (#3337)
Co-authored-by: Jiaming Ma <jiaming.ma@connect.polyu.hk>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-04-25 17:42:43 -07:00
8e6ed93dfd 🥸🔢 Adding pad_multiple to SFT trainer (#3365) 2025-04-25 18:12:35 -06:00
29c5e05e3a 🔢 Pad to multiple of (#3362) 2025-04-25 09:53:20 -07:00
a9b27f82d6 ⬆️ Bump dev version (#3357) 2025-04-24 16:22:12 -07:00
19 changed files with 221 additions and 385 deletions

View File

@ -69,7 +69,7 @@ To create the package for PyPI.
from setuptools import find_packages, setup
__version__ = "0.17.0" # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
__version__ = "0.18.0.dev0" # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
REQUIRED_PKGS = [
"accelerate>=0.34.0",

View File

@ -13,12 +13,15 @@
# limitations under the License.
import os
import sys
import tempfile
import unittest
from io import StringIO
from unittest.mock import patch
import yaml
@unittest.skipIf(
sys.version_info < (3, 10),
@ -67,6 +70,33 @@ class TestCLI(unittest.TestCase):
with patch("sys.argv", command.split(" ")):
main()
def test_sft_config_file(self):
from trl.cli import main
with tempfile.TemporaryDirectory() as tmp_dir: # Create a temporary directory
output_dir = os.path.join(tmp_dir, "output")
# Create a temporary config file
config_path = os.path.join(tmp_dir, "config.yaml")
config_content = {
"model_name_or_path": "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
"dataset_name": "trl-internal-testing/zen",
"dataset_config": "standard_language_modeling",
"report_to": "none",
"output_dir": output_dir,
"lr_scheduler_type": "cosine_with_restarts",
}
with open(config_path, "w") as config_file:
yaml.dump(config_content, config_file)
# Test the CLI with config file
command = f"trl sft --config {config_path}"
with patch("sys.argv", command.split(" ")):
main()
# Verify that output directory was created
self.assertTrue(os.path.exists(output_dir))
if __name__ == "__main__":
unittest.main()
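
A minimal standalone version of what the new test exercises: write the same options to a YAML file and pass it to `trl sft --config`. This is a sketch only; the model and dataset names below are the tiny internal test fixtures used in the test, any real checkpoint and dataset work the same way, and running it does launch an actual (small) training run.

# Sketch: same flow as test_sft_config_file, but via the installed `trl` console script.
import os
import subprocess
import tempfile

import yaml

with tempfile.TemporaryDirectory() as tmp_dir:
    config = {
        "model_name_or_path": "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
        "dataset_name": "trl-internal-testing/zen",
        "dataset_config": "standard_language_modeling",
        "report_to": "none",
        "output_dir": os.path.join(tmp_dir, "output"),
    }
    config_path = os.path.join(tmp_dir, "config.yaml")
    with open(config_path, "w") as f:
        yaml.dump(config, f)  # any SFTConfig/TrainingArguments field can go in the file

    subprocess.run(["trl", "sft", "--config", config_path], check=True)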

View File

@ -370,7 +370,6 @@ class TrainerArgTester(unittest.TestCase):
packing=True,
max_length=256,
dataset_num_proc=4,
dataset_batch_size=512,
neftune_noise_alpha=0.1,
model_init_kwargs={"trust_remote_code": True},
dataset_kwargs={"append_concat_token": True, "skip_prepare_dataset": True},
@ -381,7 +380,6 @@ class TrainerArgTester(unittest.TestCase):
self.assertEqual(trainer.args.packing, True)
self.assertEqual(trainer.args.max_length, 256)
self.assertEqual(trainer.args.dataset_num_proc, 4)
self.assertEqual(trainer.args.dataset_batch_size, 512)
self.assertEqual(trainer.args.neftune_noise_alpha, 0.1)
self.assertEqual(trainer.args.model_init_kwargs, {"trust_remote_code": True})
self.assertIn("append_concat_token", trainer.args.dataset_kwargs)

View File

@ -95,6 +95,38 @@ class TestPad(unittest.TestCase):
)
self.assertTrue(torch.equal(output, expected))
def test_pad_to_multiple_of_1(self):
x = torch.tensor([1, 2, 3])
y = torch.tensor([4, 5])
# Max length is 3, pad to multiple of 4
output = pad((x, y), padding_value=0, padding_side="right", pad_to_multiple_of=4)
expected = torch.tensor([[1, 2, 3, 0], [4, 5, 0, 0]])
self.assertTrue(torch.equal(output, expected))
def test_pad_to_multiple_of_2(self):
x = torch.tensor([1, 2, 3, 4, 5])
y = torch.tensor([6, 7, 8])
# Max length is 5, pad to multiple of 4 -> pad to 8
output = pad((x, y), padding_value=0, padding_side="right", pad_to_multiple_of=4)
expected = torch.tensor([[1, 2, 3, 4, 5, 0, 0, 0], [6, 7, 8, 0, 0, 0, 0, 0]])
self.assertTrue(torch.equal(output, expected))
def test_pad_to_multiple_of_side_left(self):
x = torch.tensor([1, 2, 3, 4, 5])
y = torch.tensor([6, 7, 8])
# Max length is 5, pad to multiple of 4 -> pad to 8
output = pad((x, y), padding_value=0, padding_side="left", pad_to_multiple_of=4)
expected = torch.tensor([[0, 0, 0, 1, 2, 3, 4, 5], [0, 0, 0, 0, 0, 6, 7, 8]])
self.assertTrue(torch.equal(output, expected))
def test_pad_to_multiple_of_no_extra_padding(self):
x = torch.tensor([1, 2, 3, 4])
y = torch.tensor([5, 6, 7, 8])
# Already multiple of 4
output = pad((x, y), padding_value=0, padding_side="left", pad_to_multiple_of=4)
expected = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
self.assertTrue(torch.equal(output, expected))
@require_peft
class TestGetPEFTConfig(unittest.TestCase):

View File

@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.17.0"
__version__ = "0.18.0.dev0"
from typing import TYPE_CHECKING

View File

@ -46,7 +46,7 @@ def main():
make_vllm_serve_parser(subparsers)
# Parse the arguments
args = parser.parse_args()
args = parser.parse_args_and_config()[0]
if args.command == "chat":
(chat_args,) = parser.parse_args_and_config()

View File

@ -18,7 +18,6 @@ from copy import deepcopy
from dataclasses import dataclass
from typing import TYPE_CHECKING, Literal, Optional, Union
from accelerate.utils import is_deepspeed_available
from packaging import version
from transformers import PreTrainedModel, PreTrainedTokenizer
@ -30,12 +29,10 @@ SUPPORTED_ARCHITECTURES = (
AutoModelForSeq2SeqLMWithValueHead,
)
if is_deepspeed_available():
import deepspeed
if TYPE_CHECKING:
from accelerate import Accelerator
from deepspeed.runtime.engine import DeepSpeedEngine
from torch.nn import Module
from torch.nn.parallel.distributed import DistributedDataParallel
@ -167,6 +164,8 @@ def iter_params(module, recurse=False):
def add_hooks(model: "DeepSpeedEngine") -> None:
"""Adds the optimizer hooks from a DeepSpeed ZeRO-3 model."""
import deepspeed
if not hasattr(model, "optimizer"): # before the first training step, the model has no optimizer
return
if model.optimizer is not None and hasattr(model.optimizer, "parameter_offload"):
@ -214,6 +213,8 @@ def unwrap_model_for_generation(
if not gather_deepspeed3_params:
yield accelerator.unwrap_model(model)
else:
import deepspeed
with deepspeed.zero.GatheredParameters(model.parameters()):
remove_hooks(model)
yield accelerator.unwrap_model(model)
@ -222,8 +223,13 @@ def unwrap_model_for_generation(
yield unwrapped_model
def prepare_deepspeed(model, accelerator):
# Adapted from accelerate: https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473
def prepare_deepspeed(model: "Module", accelerator: "Accelerator"):
"""Prepares the model for DeepSpeed inference or evaluation by initializing it with the appropriate configuration.
Adapted from accelerate: https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473
"""
import deepspeed # local import (instead of top-level) to avoid DS init interfering with other backends (like vllm): https://github.com/deepspeedai/DeepSpeed/issues/7252
deepspeed_plugin = accelerator.state.deepspeed_plugin
config_kwargs = deepcopy(deepspeed_plugin.deepspeed_config)
stage = config_kwargs["zero_optimization"]["stage"]
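
For context, the trainer diffs below (BCOTrainer, DPOTrainer, GKDTrainer, KTOTrainer) all replace their private `_prepare_deepspeed` copies with this shared helper. A hedged sketch of the caller-side pattern, using the tiny test checkpoint from this compare as a stand-in model:

from accelerate import Accelerator
from transformers import AutoModelForCausalLM

from trl.models import prepare_deepspeed

accelerator = Accelerator()
ref_model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")

if accelerator.state.deepspeed_plugin is not None:
    # DeepSpeed run: shard the frozen reference model under ZeRO-3, or fall back to stage 0.
    ref_model = prepare_deepspeed(ref_model, accelerator)
else:
    # Non-DeepSpeed run: let accelerate place the model for evaluation only.
    ref_model = accelerator.prepare_model(ref_model, evaluation_mode=True)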

View File

@ -51,7 +51,7 @@ class ScriptArguments:
type, inplace operation. See https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992.
"""
dataset_name: str = field(metadata={"help": "Dataset name."})
dataset_name: Optional[str] = field(default=None, metadata={"help": "Dataset name."})
dataset_config: Optional[str] = field(
default=None,
metadata={

View File

@ -19,7 +19,6 @@ import textwrap
import warnings
from collections import defaultdict
from contextlib import contextmanager, nullcontext
from copy import deepcopy
from operator import itemgetter
from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
@ -32,7 +31,7 @@ import torch.nn.functional as F
import transformers
from accelerate import PartialState
from accelerate.logging import get_logger
from accelerate.utils import is_deepspeed_available, tqdm
from accelerate.utils import tqdm
from datasets import Dataset
from packaging import version
from torch.utils.data import DataLoader, SequentialSampler
@ -56,7 +55,7 @@ from transformers.utils import is_peft_available
from ..data_utils import maybe_apply_chat_template
from ..import_utils import is_joblib_available
from ..models import PreTrainedModelWrapper, create_reference_model
from ..models import create_reference_model, prepare_deepspeed
from .bco_config import BCOConfig
from .utils import (
DPODataCollatorWithPadding,
@ -83,9 +82,6 @@ if is_sklearn_available():
if is_joblib_available():
import joblib
if is_deepspeed_available():
import deepspeed
if TYPE_CHECKING:
from transformers import PreTrainedModel, PreTrainedTokenizer
@ -712,7 +708,7 @@ class BCOTrainer(Trainer):
)
else:
if self.is_deepspeed_enabled:
self.ref_model = self._prepare_deepspeed(self.ref_model)
self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
else:
self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
@ -846,37 +842,6 @@ class BCOTrainer(Trainer):
return all_embeddings
def _prepare_deepspeed(self, model: PreTrainedModelWrapper):
# Adapted from accelerate: https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473
deepspeed_plugin = self.accelerator.state.deepspeed_plugin
config_kwargs = deepcopy(deepspeed_plugin.deepspeed_config)
if model is not None:
if hasattr(model, "config"):
hidden_size = (
max(model.config.hidden_sizes)
if getattr(model.config, "hidden_sizes", None)
else getattr(model.config, "hidden_size", None)
)
if hidden_size is not None and config_kwargs["zero_optimization"]["stage"] == 3:
# Note that `stage3_prefetch_bucket_size` can produce DeepSpeed messages like: `Invalidate trace cache @ step 0: expected module 1, but got module 0`
# This is expected and is not an error, see: https://github.com/microsoft/DeepSpeed/discussions/4081
config_kwargs.update(
{
"zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
"zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size,
"zero_optimization.stage3_prefetch_bucket_size": 0.9 * hidden_size * hidden_size,
}
)
# If ZeRO-3 is used, we shard both the active and reference model.
# Otherwise, we assume the reference model fits in memory and is initialized on each device with ZeRO disabled (stage 0)
if config_kwargs["zero_optimization"]["stage"] != 3:
config_kwargs["zero_optimization"]["stage"] = 0
model, *_ = deepspeed.initialize(model=model, config=config_kwargs)
model.eval()
return model
def _save_optimizer_and_scheduler(self, output_dir):
output_dir = output_dir if output_dir is not None else self.args.output_dir
super()._save_optimizer_and_scheduler(output_dir)

View File

@ -19,7 +19,7 @@ import pandas as pd
import torch
from accelerate import Accelerator
from accelerate.state import AcceleratorState
from accelerate.utils import gather_object, is_comet_ml_available, is_deepspeed_available, is_wandb_available
from accelerate.utils import gather_object, is_comet_ml_available, is_wandb_available
from rich.console import Console, Group
from rich.live import Live
from rich.panel import Panel
@ -44,9 +44,6 @@ from .judges import BasePairwiseJudge
from .utils import log_table_to_comet_experiment
if is_deepspeed_available():
import deepspeed
if is_comet_ml_available():
pass
@ -115,6 +112,8 @@ class SyncRefModelCallback(TrainerCallback):
def sync_target_model(model, target_model, alpha):
deepspeed_plugin = AcceleratorState().deepspeed_plugin
if deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3:
import deepspeed
with deepspeed.zero.GatheredParameters(
list(model.parameters()) + list(target_model.parameters()), modifier_rank=0
):
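
The hunk above only defers the `deepspeed` import; the gathered-parameters context still wraps an ordinary soft update of the reference model. A self-contained sketch of that kind of update (Polyak-style mixing; under ZeRO-3 it must run inside `deepspeed.zero.GatheredParameters(...)` as shown above so the full parameters are materialized):

import torch

def soft_update(model: torch.nn.Module, target_model: torch.nn.Module, alpha: float) -> None:
    # target <- (1 - alpha) * target + alpha * online, parameter by parameter.
    with torch.no_grad():
        for target_param, param in zip(target_model.parameters(), model.parameters()):
            target_param.data.mul_(1.0 - alpha).add_(param.data, alpha=alpha)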

View File

@ -19,7 +19,6 @@ import textwrap
import warnings
from collections import defaultdict
from contextlib import contextmanager, nullcontext
from copy import deepcopy
from dataclasses import dataclass
from typing import Any, Callable, Literal, Optional, Union
@ -30,7 +29,7 @@ import torch.nn as nn
import torch.nn.functional as F
import transformers
from accelerate import PartialState
from accelerate.utils import is_deepspeed_available, tqdm
from accelerate.utils import tqdm
from datasets import Dataset, IterableDataset
from packaging import version
from torch.utils.data import DataLoader
@ -53,7 +52,7 @@ from transformers.trainer_utils import EvalLoopOutput
from transformers.utils import is_peft_available, is_torch_xpu_available
from ..data_utils import maybe_apply_chat_template, maybe_extract_prompt
from ..models import PreTrainedModelWrapper, create_reference_model
from ..models import create_reference_model, prepare_deepspeed
from ..models.utils import prepare_fsdp
from .callbacks import SyncRefModelCallback
from .dpo_config import DPOConfig, FDivergenceConstants, FDivergenceType
@ -80,9 +79,6 @@ if is_peft_available():
if is_wandb_available():
import wandb
if is_deepspeed_available():
import deepspeed
@dataclass
class DataCollatorForPreference(DataCollatorMixin):
@ -184,7 +180,6 @@ class DPOTrainer(Trainer):
Processing class used to process the data. If provided, will be used to automatically process the inputs
for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
reuse the fine-tuned model.
This supercedes the `tokenizer` argument, which is now deprecated.
model_init (`Callable[[], transformers.PreTrainedModel]`):
The model initializer to use for training. If None is specified, the default model initializer will be used.
compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
@ -510,7 +505,7 @@ class DPOTrainer(Trainer):
)
else:
if self.is_deepspeed_enabled:
self.ref_model = self._prepare_deepspeed(self.ref_model)
self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
elif self.is_fsdp_enabled:
self.ref_model = prepare_fsdp(self.ref_model, self.accelerator)
else:
@ -676,37 +671,6 @@ class DPOTrainer(Trainer):
return output
def _prepare_deepspeed(self, model: PreTrainedModelWrapper):
# Adapted from accelerate: https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473
deepspeed_plugin = self.accelerator.state.deepspeed_plugin
config_kwargs = deepcopy(deepspeed_plugin.deepspeed_config)
if model is not None:
if hasattr(model, "config"):
hidden_size = (
max(model.config.hidden_sizes)
if getattr(model.config, "hidden_sizes", None)
else getattr(model.config, "hidden_size", None)
)
if hidden_size is not None and config_kwargs["zero_optimization"]["stage"] == 3:
# Note that `stage3_prefetch_bucket_size` can produce DeepSpeed messages like: `Invalidate trace cache @ step 0: expected module 1, but got module 0`
# This is expected and is not an error, see: https://github.com/microsoft/DeepSpeed/discussions/4081
config_kwargs.update(
{
"zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
"zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size,
"zero_optimization.stage3_prefetch_bucket_size": 0.9 * hidden_size * hidden_size,
}
)
# If ZeRO-3 is used, we shard both the active and reference model.
# Otherwise, we assume the reference model fits in memory and is initialized on each device with ZeRO disabled (stage 0)
if config_kwargs["zero_optimization"]["stage"] != 3:
config_kwargs["zero_optimization"]["stage"] = 0
model, *_ = deepspeed.initialize(model=model, config=config_kwargs)
model.eval()
return model
def _set_signature_columns_if_needed(self):
# If `self.args.remove_unused_columns` is True, non-signature columns are removed.
# By default, this method sets `self._signature_columns` to the model's expected inputs.

View File

@ -15,13 +15,11 @@
import os
import random
import textwrap
from copy import deepcopy
from typing import Any, Callable, Optional, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from accelerate.utils import is_deepspeed_available
from datasets import Dataset
from transformers import (
AutoModelForCausalLM,
@ -38,7 +36,7 @@ from transformers.trainer_callback import TrainerCallback
from transformers.trainer_utils import EvalPrediction
from transformers.utils import is_peft_available
from ..models import PreTrainedModelWrapper
from ..models import prepare_deepspeed
from ..models.utils import unwrap_model_for_generation
from .gkd_config import GKDConfig
from .sft_trainer import SFTTrainer
@ -51,10 +49,6 @@ from .utils import (
)
if is_deepspeed_available():
import deepspeed
if is_peft_available():
from peft import PeftConfig
@ -124,7 +118,7 @@ class GKDTrainer(SFTTrainer):
disable_dropout_in_model(self.model)
if self.is_deepspeed_enabled:
self.teacher_model = self._prepare_deepspeed(teacher_model)
self.teacher_model = prepare_deepspeed(teacher_model, self.accelerator)
else:
self.teacher_model = self.accelerator.prepare_model(teacher_model, evaluation_mode=True)
@ -311,37 +305,6 @@ class GKDTrainer(SFTTrainer):
loss = super().training_step(model, inputs, num_items_in_batch)
return loss
def _prepare_deepspeed(self, model: PreTrainedModelWrapper):
# Adapted from accelerate: https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473
deepspeed_plugin = self.accelerator.state.deepspeed_plugin
config_kwargs = deepcopy(deepspeed_plugin.deepspeed_config)
if model is not None:
if hasattr(model, "config"):
hidden_size = (
max(model.config.hidden_sizes)
if getattr(model.config, "hidden_sizes", None)
else getattr(model.config, "hidden_size", None)
)
if hidden_size is not None and config_kwargs["zero_optimization"]["stage"] == 3:
# Note that `stage3_prefetch_bucket_size` can produce DeepSpeed messages like: `Invalidate trace cache @ step 0: expected module 1, but got module 0`
# This is expected and is not an error, see: https://github.com/microsoft/DeepSpeed/discussions/4081
config_kwargs.update(
{
"zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
"zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size,
"zero_optimization.stage3_prefetch_bucket_size": 0.9 * hidden_size * hidden_size,
}
)
# If ZeRO-3 is used, we shard both the active and reference model.
# Otherwise, we assume the reference model fits in memory and is initialized on each device with ZeRO disabled (stage 0)
if config_kwargs["zero_optimization"]["stage"] != 3:
config_kwargs["zero_optimization"]["stage"] = 0
model, *_ = deepspeed.initialize(model=model, config=config_kwargs)
model.eval()
return model
def create_model_card(
self,
model_name: Optional[str] = None,

View File

@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from dataclasses import dataclass, field
from typing import Optional, Union
@ -412,83 +412,3 @@ class GRPOConfig(TrainingArguments):
"all prompts are logged."
},
)
# Deprecated parameters
vllm_device: Optional[str] = field(
default=None,
metadata={
"help": "This parameter is deprecated and will be removed in version 0.18.0. To use vLLM, start a vLLM "
"server with the `trl vllm-serve` command."
},
)
vllm_gpu_memory_utilization: Optional[float] = field(
default=None,
metadata={
"help": "This parameter is deprecated and will be removed in version 0.18.0. To control the GPU memory "
"utilization for vLLM, you should now use the `gpu_memory_utilization` parameter in the vLLM server "
"configuration."
},
)
vllm_dtype: Optional[str] = field(
default=None,
metadata={
"help": "This parameter is deprecated and will be removed in version 0.18.0. To control the data type for "
"vLLM generation, you should now use the `dtype` parameter in the vLLM server configuration."
},
)
vllm_max_model_len: Optional[int] = field(
default=None,
metadata={
"help": "This parameter is deprecated and will be removed in version 0.18.0. To control the "
"`max_model_len` for vLLM, you should now use the `max_model_len` parameter in the vLLM server "
"configuration."
},
)
vllm_enable_prefix_caching: Optional[bool] = field(
default=None,
metadata={
"help": "This parameter is deprecated and will be removed in version 0.18.0. To control prefix caching in "
"vLLM, you should now use the `enable_prefix_caching` parameter in the vLLM server configuration."
},
)
def __post_init__(self):
super().__post_init__()
if self.vllm_device is not None:
warnings.warn(
"`vllm_device` is deprecated and will be removed in version 0.18.0. To use vLLM, start a vLLM server "
"with the `trl vllm-serve` command.",
DeprecationWarning,
)
if self.vllm_gpu_memory_utilization is not None:
warnings.warn(
"`vllm_gpu_memory_utilization` is deprecated and will be removed in v0.18. To control the GPU memory "
"utilization for vLLM, you should now use the `gpu_memory_utilization` parameter in the vLLM server "
"configuration.",
DeprecationWarning,
)
if self.vllm_dtype is not None:
warnings.warn(
"`vllm_dtype` is deprecated and will be removed in version 0.18.0. To control the data type for vLLM "
"generation, you should now use the `dtype` parameter in the vLLM server configuration.",
DeprecationWarning,
)
if self.vllm_max_model_len is not None:
warnings.warn(
"`vllm_max_model_len` is deprecated and will be removed in version 0.18.0. To control the "
"`max_model_len` for vLLM, you should now use the `max_model_len` parameter in the vLLM server "
"configuration.",
DeprecationWarning,
)
if self.vllm_enable_prefix_caching is not None:
warnings.warn(
"`vllm_enable_prefix_caching` is deprecated and will be removed in version 0.18.0. To control prefix "
"caching in vLLM, you should now use the `enable_prefix_caching` parameter in the vLLM server "
"configuration.",
DeprecationWarning,
)
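
All of the removed fields point at the same migration: generation options leave `GRPOConfig` and move to the standalone server started with `trl vllm-serve`. A hedged sketch of the resulting setup; the server flags mirror the parameter names in the help strings above (gpu_memory_utilization, dtype, max_model_len, enable_prefix_caching) and the model name is an arbitrary example:

# 1) Start the generation server in a separate process / on separate GPUs, e.g.:
#       trl vllm-serve --model Qwen/Qwen2.5-0.5B-Instruct --gpu_memory_utilization 0.9
# 2) Configure the trainer without any vllm_* fields:
from trl import GRPOConfig

training_args = GRPOConfig(
    output_dir="grpo-output",
    use_vllm=True,  # generation is delegated to the external vLLM server
)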

View File

@ -47,7 +47,7 @@ from transformers.utils import is_datasets_available, is_peft_available
from ..data_utils import apply_chat_template, is_conversational, maybe_apply_chat_template
from ..extras.profiling import profiling_context, profiling_decorator
from ..extras.vllm_client import VLLMClient
from ..import_utils import is_deepspeed_available, is_liger_kernel_available, is_rich_available, is_vllm_available
from ..import_utils import is_liger_kernel_available, is_rich_available, is_vllm_available
from ..models import create_reference_model, prepare_deepspeed, unwrap_model_for_generation
from .callbacks import SyncRefModelCallback
from .grpo_config import GRPOConfig
@ -61,9 +61,6 @@ from .utils import (
)
if is_deepspeed_available():
import deepspeed
if is_peft_available():
from peft import PeftConfig, get_peft_model
@ -175,15 +172,6 @@ class RepeatSampler(Sampler):
return self.num_samples * self.mini_repeat_count * self.repeat_count
class RepeatRandomSampler(RepeatSampler):
def __init__(self, *args, **kwargs):
warnings.warn(
"RepeatRandomSampler is deprecated and will be removed in version 0.18. Use RepeatSampler instead.",
DeprecationWarning,
)
super().__init__(*args, **kwargs)
# torch.nanstd doesn't exist, so we define it here
def nanstd(tensor: torch.Tensor) -> torch.Tensor:
"""
@ -809,11 +797,29 @@ class GRPOTrainer(Trainer):
last_hidden_state = last_hidden_state[:, -logits_to_keep:, :] # (B, logits_to_keep, H)
return last_hidden_state
# Get the per-token log probabilities for the completions for the model and the reference model
@profiling_decorator
def _get_per_token_logps(self, model, input_ids, attention_mask, logits_to_keep, batch_size=None) -> torch.Tensor:
def _get_per_token_logps_and_probs(
self, model, input_ids, attention_mask, logits_to_keep, batch_size=None
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Get both per-token log probabilities and probability distributions for completions in a single forward pass.
Args:
model: The model to generate logits.
input_ids: The input token IDs.
attention_mask: The attention mask.
logits_to_keep: The number of logits to keep (corresponding to completion length).
batch_size: Optional batch size to process inputs in chunks to reduce memory peak.
Returns:
tuple: (per_token_logps, probs)
- per_token_logps: Tensor of shape (batch_size, logits_to_keep) containing log probabilities
- probs: Tensor of shape (batch_size, logits_to_keep, vocab_size) containing full probability distributions
"""
batch_size = batch_size or input_ids.size(0) # Chunk inputs into smaller batches to reduce memory peak
all_logps = []
all_probs = []
for i in range(0, input_ids.size(0), batch_size):
input_ids_batch = input_ids[i : i + batch_size]
attention_mask_batch = attention_mask[i : i + batch_size]
@ -827,19 +833,49 @@ class GRPOTrainer(Trainer):
# For transformers<=4.48, logits_to_keep argument isn't supported, so here we drop logits ourselves.
# See https://github.com/huggingface/trl/issues/2770
logits = logits[:, -logits_to_keep:]
# Divide logits by sampling temperature.
# See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details
logits = logits / self.temperature
logps = selective_log_softmax(logits, input_ids_batch) # compute logprobs for the input tokens
# Divide logits by sampling temperature
scaled_logits = logits / self.temperature
# Compute full probability distributions
probs = torch.softmax(scaled_logits, dim=-1) # (B, seq_len, vocab_size)
all_probs.append(probs)
# Compute log probs for the selected tokens (original functionality)
logps = selective_log_softmax(scaled_logits, input_ids_batch) # (B, seq_len)
all_logps.append(logps)
return torch.cat(all_logps, dim=0)
return torch.cat(all_logps, dim=0), torch.cat(all_probs, dim=0)
@profiling_decorator
def _get_per_token_logps(self, model, input_ids, attention_mask, logits_to_keep, batch_size=None) -> torch.Tensor:
"""
Get the per-token log probabilities for the completions (backward compatible version).
Args:
model: The model to generate logits.
input_ids: The input token IDs.
attention_mask: The attention mask.
logits_to_keep: The number of logits to keep (corresponding to completion length).
batch_size: Optional batch size to process inputs in chunks.
Returns:
Tensor of shape (batch_size, logits_to_keep) containing log probabilities
"""
logps, _ = self._get_per_token_logps_and_probs(model, input_ids, attention_mask, logits_to_keep, batch_size)
return logps
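
`selective_log_softmax(logits, index)` used above is trl's memory-conscious helper; conceptually it just returns, for each position, the log-probability of the token that actually occurs there. A rough, self-contained equivalent (ignoring the chunking and precision tricks of the real helper):

import torch

def naive_selective_log_softmax(logits: torch.Tensor, index: torch.Tensor) -> torch.Tensor:
    # logits: (batch, seq_len, vocab_size); index: (batch, seq_len) token ids.
    # Returns (batch, seq_len): log p(index[b, t]) under the softmax over the vocabulary.
    logps = torch.log_softmax(logits, dim=-1)
    return logps.gather(dim=-1, index=index.unsqueeze(-1)).squeeze(-1)
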
@profiling_decorator
def _move_model_to_vllm(self):
# For DeepSpeed ZeRO-3, we need to gather all parameters before operations
deepspeed_plugin = self.accelerator.state.deepspeed_plugin
zero_stage_3 = deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3
gather_if_zero3 = deepspeed.zero.GatheredParameters if zero_stage_3 else nullcontext
if zero_stage_3:
import deepspeed
gather_if_zero3 = deepspeed.zero.GatheredParameters
else:
gather_if_zero3 = nullcontext
if is_peft_model(self.model):
# With PEFT and DeepSpeed ZeRO Stage 3, we must gather the full model at once before merging, as merging
@ -891,7 +927,7 @@ class GRPOTrainer(Trainer):
# - Completions are generated for each batch without buffering or reuse
# Returns a single local batch in both cases.
mode = "eval" if self.control.should_evaluate else "train"
mode = "train" if self.model.training else "eval"
if mode == "train":
generate_every = self.args.gradient_accumulation_steps * self.num_iterations
if self._step % generate_every == 0 or self._buffered_inputs is None:
@ -911,7 +947,7 @@ class GRPOTrainer(Trainer):
self, inputs: list[dict[str, Union[torch.Tensor, Any]]]
) -> dict[str, Union[torch.Tensor, Any]]:
device = self.accelerator.device
mode = "eval" if self.control.should_evaluate else "train"
mode = "train" if self.model.training else "eval"
prompts = [x["prompt"] for x in inputs]
prompts_text = [maybe_apply_chat_template(example, self.processing_class)["prompt"] for example in inputs]
@ -1170,7 +1206,7 @@ class GRPOTrainer(Trainer):
mean_kl = metrics[0] if self.beta != 0.0 else None
clip_ratio = metrics[-1]
mode = "eval" if self.control.should_evaluate else "train"
mode = "train" if self.model.training else "eval"
if self.beta != 0.0:
self._metrics[mode]["kl"].append(self.accelerator.gather_for_metrics(mean_kl).mean().item())
self._metrics[mode]["clip_ratio"].append(self.accelerator.gather_for_metrics(clip_ratio).mean().item())
@ -1194,8 +1230,25 @@ class GRPOTrainer(Trainer):
attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
logits_to_keep = completion_ids.size(1) # we only need to compute the logits for the completion tokens
per_token_logps = self._get_per_token_logps(model, input_ids, attention_mask, logits_to_keep)
# Get both log probabilities and full probability distributions
per_token_logps, probs = self._get_per_token_logps_and_probs(model, input_ids, attention_mask, logits_to_keep)
# Calculate entropy from the full probability distributions
eps = 1e-8 # Small epsilon to avoid log(0)
entropy_per_token = -torch.sum(probs * torch.log(probs + eps), dim=-1) # (batch_size, seq_len)
# Average entropy across valid completion tokens (masked for padding)
mean_entropy = (entropy_per_token * completion_mask).sum(dim=1) / completion_mask.sum(dim=1).clamp(min=1.0)
# Log entropy metrics
mode = "train" if self.model.training else "eval"
agg_entropy = self.accelerator.gather_for_metrics(mean_entropy)
self._metrics[mode]["completions/entropy/mean"].append(agg_entropy.nanmean().item())
self._metrics[mode]["completions/entropy/min"].append(nanmin(agg_entropy).item())
self._metrics[mode]["completions/entropy/max"].append(nanmax(agg_entropy).item())
self._metrics[mode]["completions/entropy/std"].append(nanstd(agg_entropy).item())
# Continue with the original loss computation...
# Compute the KL divergence between the model and the reference model
if self.beta != 0.0:
ref_per_token_logps = inputs["ref_per_token_logps"]
@ -1226,7 +1279,7 @@ class GRPOTrainer(Trainer):
raise ValueError(f"Unknown loss type: {self.loss_type}")
# Log the metrics
mode = "eval" if self.control.should_evaluate else "train"
mode = "train" if self.model.training else "eval"
if self.beta != 0.0:
mean_kl = (per_token_kl * completion_mask).sum() / completion_mask.sum()
@ -1260,7 +1313,7 @@ class GRPOTrainer(Trainer):
return loss, None, None
def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
mode = "eval" if self.control.should_evaluate else "train"
mode = "train" if self.model.training else "eval"
metrics = {key: sum(val) / len(val) for key, val in self._metrics[mode].items()} # average the metrics
# This method can be called both in training and evaluation. When called in evaluation, the keys in `logs`
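
The new entropy metric added in this file reduces to a masked mean of per-token Shannon entropies over the completion tokens. A self-contained sketch of that reduction with dummy tensors (shapes follow the comments in the diff):

import torch

batch_size, seq_len, vocab_size = 2, 5, 11
probs = torch.softmax(torch.randn(batch_size, seq_len, vocab_size), dim=-1)
completion_mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])  # 1 = real completion token

eps = 1e-8  # avoid log(0), as in the diff
entropy_per_token = -torch.sum(probs * torch.log(probs + eps), dim=-1)  # (batch, seq_len)
mean_entropy = (entropy_per_token * completion_mask).sum(dim=1) / completion_mask.sum(dim=1).clamp(min=1.0)
print(mean_entropy)  # one value per sequence; the trainer gathers these and logs mean/min/max/std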

View File

@ -19,7 +19,6 @@ import textwrap
import warnings
from collections import defaultdict
from contextlib import contextmanager, nullcontext
from copy import deepcopy
from operator import itemgetter
from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
@ -31,7 +30,7 @@ import torch.nn as nn
import torch.nn.functional as F
import transformers
from accelerate import PartialState
from accelerate.utils import is_deepspeed_available, tqdm
from accelerate.utils import tqdm
from datasets import Dataset, concatenate_datasets
from packaging import version
from torch.utils.data import DataLoader, SequentialSampler
@ -54,7 +53,7 @@ from transformers.utils import is_peft_available
from ..data_utils import maybe_apply_chat_template, maybe_extract_prompt, maybe_unpair_preference_dataset
from ..import_utils import is_liger_kernel_available
from ..models import PreTrainedModelWrapper, create_reference_model
from ..models import create_reference_model, prepare_deepspeed
from .kto_config import KTOConfig
from .utils import (
DPODataCollatorWithPadding,
@ -68,9 +67,6 @@ from .utils import (
)
if is_deepspeed_available():
import deepspeed
if is_liger_kernel_available():
from liger_kernel.chunked_loss import LigerFusedLinearKTOLoss
@ -779,7 +775,7 @@ class KTOTrainer(Trainer):
)
else:
if self.is_deepspeed_enabled:
self.ref_model = self._prepare_deepspeed(self.ref_model)
self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
else:
self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
@ -808,37 +804,6 @@ class KTOTrainer(Trainer):
ignore_index=self.label_pad_token_id, beta=self.beta, use_ref_model=(self.ref_model is not None)
)
def _prepare_deepspeed(self, model: PreTrainedModelWrapper):
# Adapted from accelerate: https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473
deepspeed_plugin = self.accelerator.state.deepspeed_plugin
config_kwargs = deepcopy(deepspeed_plugin.deepspeed_config)
if model is not None:
if hasattr(model, "config"):
hidden_size = (
max(model.config.hidden_sizes)
if getattr(model.config, "hidden_sizes", None)
else getattr(model.config, "hidden_size", None)
)
if hidden_size is not None and config_kwargs["zero_optimization"]["stage"] == 3:
# Note that `stage3_prefetch_bucket_size` can produce DeepSpeed messages like: `Invalidate trace cache @ step 0: expected module 1, but got module 0`
# This is expected and is not an error, see: https://github.com/microsoft/DeepSpeed/discussions/4081
config_kwargs.update(
{
"zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
"zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size,
"zero_optimization.stage3_prefetch_bucket_size": 0.9 * hidden_size * hidden_size,
}
)
# If ZeRO-3 is used, we shard both the active and reference model.
# Otherwise, we assume the reference model fits in memory and is initialized on each device with ZeRO disabled (stage 0)
if config_kwargs["zero_optimization"]["stage"] != 3:
config_kwargs["zero_optimization"]["stage"] = 0
model, *_ = deepspeed.initialize(model=model, config=config_kwargs)
model.eval()
return model
@contextmanager
def null_ref_context(self):
"""Context manager for handling null reference model (that is, peft adapter manipulation)."""

View File

@ -19,7 +19,6 @@ import textwrap
import warnings
from collections import defaultdict
from contextlib import nullcontext
from copy import deepcopy
from typing import Any, Callable, Literal, Optional, Union
import numpy as np
@ -30,7 +29,6 @@ import torch.nn as nn
import torch.nn.functional as F
import transformers
from accelerate import PartialState
from accelerate.utils import is_deepspeed_available
from datasets import Dataset
from packaging import version
from torch.utils.data import DataLoader
@ -52,7 +50,6 @@ from transformers.trainer_utils import EvalLoopOutput
from transformers.utils import is_peft_available, is_torch_fx_proxy
from ..data_utils import maybe_apply_chat_template, maybe_extract_prompt
from ..models import PreTrainedModelWrapper
from .orpo_config import ORPOConfig
from .utils import (
DPODataCollatorWithPadding,
@ -75,9 +72,6 @@ if is_peft_available():
if is_wandb_available():
import wandb
if is_deepspeed_available():
import deepspeed
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
@ -358,37 +352,6 @@ class ORPOTrainer(Trainer):
"Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`."
)
def _prepare_deepspeed(self, model: PreTrainedModelWrapper):
# Adapted from accelerate: https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473
deepspeed_plugin = self.accelerator.state.deepspeed_plugin
config_kwargs = deepcopy(deepspeed_plugin.deepspeed_config)
if model is not None:
if hasattr(model, "config"):
hidden_size = (
max(model.config.hidden_sizes)
if getattr(model.config, "hidden_sizes", None)
else getattr(model.config, "hidden_size", None)
)
if hidden_size is not None and config_kwargs["zero_optimization"]["stage"] == 3:
# Note that `stage3_prefetch_bucket_size` can produce DeepSpeed messages like: `Invalidate trace cache @ step 0: expected module 1, but got module 0`
# This is expected and is not an error, see: https://github.com/microsoft/DeepSpeed/discussions/4081
config_kwargs.update(
{
"zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
"zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size,
"zero_optimization.stage3_prefetch_bucket_size": 0.9 * hidden_size * hidden_size,
}
)
# If ZeRO-3 is used, we shard both the active and reference model.
# Otherwise, we assume the reference model fits in memory and is initialized on each device with ZeRO disabled (stage 0)
if config_kwargs["zero_optimization"]["stage"] != 3:
config_kwargs["zero_optimization"]["stage"] = 0
model, *_ = deepspeed.initialize(model=model, config=config_kwargs)
model.eval()
return model
def build_tokenized_answer(self, prompt, answer):
"""
Llama tokenizer does satisfy `enc(a + b) = enc(a) + enc(b)`.

View File

@ -62,6 +62,8 @@ class SFTConfig(TrainingArguments):
continuous sequence. This reduces memory usage by eliminating padding overhead. Currently, this is only
supported with the `flash_attention_2` attention implementation, which can efficiently handle the flattened
batch structure.
pad_to_multiple_of (`int` or `None`, *optional*, defaults to `None`):
If set, the sequences will be padded to a multiple of this value.
eval_packing (`bool` or `None`, *optional*, defaults to `None`):
Whether to pack the eval dataset. If `None`, uses the same value as `packing`.
@ -140,6 +142,10 @@ class SFTConfig(TrainingArguments):
"handle the flattened batch structure."
},
)
pad_to_multiple_of: Optional[int] = field(
default=None,
metadata={"help": "If set, the sequences will be padded to a multiple of this value."},
)
eval_packing: Optional[bool] = field(
default=None,
metadata={"help": "Whether to pack the eval dataset. If `None`, uses the same value as `packing`."},
@ -167,77 +173,19 @@ class SFTConfig(TrainingArguments):
)
# Deprecated parameters
dataset_batch_size: Optional[int] = field(
default=None,
metadata={
"help": "This parameter is deprecated and will be removed in version 0.18.0. You can safely remove this "
"parameter from your configuration."
},
)
num_of_sequences: Optional[int] = field(
default=None,
metadata={
"help": "This parameter is deprecated and will be removed in version 0.18.0. Use `max_length` instead, "
"which specifies the maximum length of the tokenized sequence, unlike `num_of_sequences`, which referred "
"to string sequences."
},
)
chars_per_token: Optional[float] = field(
default=None,
metadata={
"help": "This parameter is deprecated and will be removed in version 0.18.0. If you want to customize the "
"packing length, use `max_length`."
},
)
max_seq_length: Optional[int] = field(
default=None,
metadata={
"help": "This parameter is deprecated and will be removed in version 0.20.0. Use `max_length` instead."
},
)
use_liger: Optional[bool] = field(
default=None,
metadata={
"help": "This parameter is deprecated and will be removed in version 0.18.0. Use `use_liger_kernel` "
"instead."
},
)
def __post_init__(self):
super().__post_init__()
if self.dataset_batch_size is not None:
warnings.warn(
"`dataset_batch_size` is deprecated and will be removed in version 0.18.0. You can safely remove this "
"parameter from your configuration.",
DeprecationWarning,
)
if self.num_of_sequences is not None:
warnings.warn(
"`num_of_sequences` is deprecated and will be removed in version 0.18.0. Use `max_length` instead, "
"which specifies the maximum length of the tokenized sequence, unlike `num_of_sequences`, which "
"referred to string sequences.",
DeprecationWarning,
)
if self.chars_per_token is not None:
warnings.warn(
"`chars_per_token` is deprecated and will be removed in version 0.18.0. If you want to customize the "
"packing length, use `max_length`.",
DeprecationWarning,
)
if self.max_seq_length is not None:
warnings.warn(
"`max_seq_length` is deprecated and will be removed in version 0.20.0. Use `max_length` instead.",
DeprecationWarning,
)
self.max_length = self.max_seq_length
if self.use_liger is not None:
warnings.warn(
"`use_liger` is deprecated and will be removed in version 0.18.0. Use `use_liger_kernel` instead.",
DeprecationWarning,
)
self.use_liger_kernel = self.use_liger

View File

@ -80,7 +80,9 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
Token ID to use for padding.
completion_only_loss (`bool`, *optional*, defaults to `True`):
When the input contains a completion mask (`completion_mask`), the labels are set to -100 for the tokens
that are not in the completion.
that are not in the completion.
pad_to_multiple_of (`int` or `None`, *optional*, defaults to `None`):
If set, the sequences will be padded to a multiple of this value.
return_tensors (`str`, *optional*, defaults to `"pt"`):
Type of Tensor to return. Only `"pt"` is currently supported.
@ -116,6 +118,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
pad_token_id: int
completion_only_loss: bool = True
pad_to_multiple_of: Optional[int] = None
return_tensors: str = "pt"
def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
@ -128,11 +131,22 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
# Pad
output = {}
output["input_ids"] = pad(input_ids, padding_value=self.pad_token_id, padding_side="right")
output["attention_mask"] = pad(attention_mask, padding_value=0, padding_side="right")
output["labels"] = pad(labels, padding_value=-100, padding_side="right")
output["input_ids"] = pad(
input_ids,
padding_value=self.pad_token_id,
padding_side="right",
pad_to_multiple_of=self.pad_to_multiple_of,
)
output["attention_mask"] = pad(
attention_mask, padding_value=0, padding_side="right", pad_to_multiple_of=self.pad_to_multiple_of
)
output["labels"] = pad(
labels, padding_value=-100, padding_side="right", pad_to_multiple_of=self.pad_to_multiple_of
)
if self.completion_only_loss and "completion_mask" in examples[0]:
completion_mask = pad(completion_mask, padding_value=0, padding_side="right")
completion_mask = pad(
completion_mask, padding_value=0, padding_side="right", pad_to_multiple_of=self.pad_to_multiple_of
)
output["labels"][completion_mask == 0] = -100 # mask everything that is not in the completion
return output
@ -315,7 +329,9 @@ class SFTTrainer(Trainer):
f"`processing_class` ({processing_class.__class__.__name__}). Ensure that the `pad_token` exists "
"in the vocabulary before using it as a padding token."
)
data_collator = DataCollatorForLanguageModeling(pad_token_id, self.completion_only_loss)
data_collator = DataCollatorForLanguageModeling(
pad_token_id, self.completion_only_loss, args.pad_to_multiple_of
)
# Dataset
preprocess_dataset = args.dataset_kwargs is None or not args.dataset_kwargs.get("skip_prepare_dataset", False)
@ -650,7 +666,7 @@ class SFTTrainer(Trainer):
"""
Compute training loss and additionally compute token accuracies
"""
mode = "eval" if self.control.should_evaluate else "train"
mode = "train" if self.model.training else "eval"
(loss, outputs) = super().compute_loss(
model, inputs, return_outputs=True, num_items_in_batch=num_items_in_batch
)
@ -695,7 +711,7 @@ class SFTTrainer(Trainer):
return (loss, outputs) if return_outputs else loss
def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
mode = "eval" if self.control.should_evaluate else "train"
mode = "train" if self.model.training else "eval"
metrics = {key: sum(val) / len(val) for key, val in self._metrics[mode].items()} # average the metrics
# This method can be called both in training and evaluation. When called in evaluation, the keys in `logs`

View File

@ -415,7 +415,12 @@ class RewardDataCollatorWithPadding:
return batch
def pad(tensors: list[torch.Tensor], padding_value: int = 0, padding_side: str = "right") -> torch.Tensor:
def pad(
tensors: list[torch.Tensor],
padding_value: int = 0,
padding_side: str = "right",
pad_to_multiple_of: Optional[int] = None,
) -> torch.Tensor:
"""
Pads a list of tensors to the same shape along the first dimension.
@ -426,6 +431,8 @@ def pad(tensors: list[torch.Tensor], padding_value: int = 0, padding_side: str =
Value to use for padding. Default is 0.
padding_side (`str`):
Side on which to add padding. Must be 'left' or 'right'. Default is 'right'.
pad_to_multiple_of (`int`, *optional*, defaults to `None`):
If set, the sequences will be padded to a multiple of the provided value.
Returns:
`torch.Tensor`:
@ -446,18 +453,25 @@ def pad(tensors: list[torch.Tensor], padding_value: int = 0, padding_side: str =
# Determine the maximum shape for each dimension
output_shape = np.max([t.shape for t in tensors], 0).tolist()
# Apply pad_to_multiple_of to the first (sequence) dimension
if pad_to_multiple_of is not None:
remainder = output_shape[0] % pad_to_multiple_of
if remainder != 0:
output_shape[0] += pad_to_multiple_of - remainder
# Create an output tensor filled with the padding value
output = torch.full((len(tensors), *output_shape), padding_value, dtype=tensors[0].dtype, device=tensors[0].device)
for i, t in enumerate(tensors):
# Determine the slice for the sequence dimension
if padding_side == "left":
seq_slice = slice(output_shape[0] - t.shape[0], output_shape[0])
seq_start = output_shape[0] - t.shape[0]
elif padding_side == "right":
seq_slice = slice(0, t.shape[0])
seq_start = 0
else:
raise ValueError("padding_side must be 'left' or 'right'")
# Define the slices
seq_slice = slice(seq_start, seq_start + t.shape[0])
slices = (seq_slice,) + tuple(slice(0, s) for s in t.shape[1:])
output[i][slices] = t
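
Putting the new argument together: the target length is the batch maximum rounded up to the next multiple of `pad_to_multiple_of` (max length 5, remainder 1, so 5 + 3 = 8), and `padding_side` decides whether the padding goes after or before each sequence. A quick usage sketch that mirrors the tests above:

import torch

from trl.trainer.utils import pad

x = torch.tensor([1, 2, 3, 4, 5])
y = torch.tensor([6, 7, 8])

# Right padding: sequences stay left-aligned and are padded out to length 8.
print(pad([x, y], padding_value=0, padding_side="right", pad_to_multiple_of=4))
# tensor([[1, 2, 3, 4, 5, 0, 0, 0],
#         [6, 7, 8, 0, 0, 0, 0, 0]])

# Left padding: sequences are right-aligned instead.
print(pad([x, y], padding_value=0, padding_side="left", pad_to_multiple_of=4))
# tensor([[0, 0, 0, 1, 2, 3, 4, 5],
#         [0, 0, 0, 0, 0, 6, 7, 8]])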