mirror of https://github.com/huggingface/trl.git
synced 2025-10-21 02:53:59 +08:00

Compare commits: v0.17.0 ... add-entrop (8 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 728a73caa4 | |
| | 2bf48478e8 | |
| | a8cfca6d01 | |
| | 1bca49515e | |
| | 39e96394a9 | |
| | 8e6ed93dfd | |
| | 29c5e05e3a | |
| | a9b27f82d6 | |
setup.py (2 changed lines)
@@ -69,7 +69,7 @@ To create the package for PyPI.
from setuptools import find_packages, setup


__version__ = "0.17.0"  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
__version__ = "0.18.0.dev0"  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)

REQUIRED_PKGS = [
    "accelerate>=0.34.0",
@@ -13,12 +13,15 @@
# limitations under the License.

import os
import sys
import tempfile
import unittest
from io import StringIO
from unittest.mock import patch

import yaml


@unittest.skipIf(
    sys.version_info < (3, 10),

@@ -67,6 +70,33 @@ class TestCLI(unittest.TestCase):
        with patch("sys.argv", command.split(" ")):
            main()

    def test_sft_config_file(self):
        from trl.cli import main

        with tempfile.TemporaryDirectory() as tmp_dir: # Create a temporary directory
            output_dir = os.path.join(tmp_dir, "output")

            # Create a temporary config file
            config_path = os.path.join(tmp_dir, "config.yaml")
            config_content = {
                "model_name_or_path": "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
                "dataset_name": "trl-internal-testing/zen",
                "dataset_config": "standard_language_modeling",
                "report_to": "none",
                "output_dir": output_dir,
                "lr_scheduler_type": "cosine_with_restarts",
            }
            with open(config_path, "w") as config_file:
                yaml.dump(config_content, config_file)

            # Test the CLI with config file
            command = f"trl sft --config {config_path}"
            with patch("sys.argv", command.split(" ")):
                main()

            # Verify that output directory was created
            self.assertTrue(os.path.exists(output_dir))


if __name__ == "__main__":
    unittest.main()
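Editor's note: the config file written by `test_sft_config_file` above is equivalent to the following YAML, which the new `trl sft --config config.yaml` path consumes. This is only an illustration; the values simply mirror the dict built in the test.

```yaml
# config.yaml — mirrors the dict built in test_sft_config_file
model_name_or_path: trl-internal-testing/tiny-Qwen2ForCausalLM-2.5
dataset_name: trl-internal-testing/zen
dataset_config: standard_language_modeling
report_to: none
output_dir: output
lr_scheduler_type: cosine_with_restarts
```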
@@ -370,7 +370,6 @@ class TrainerArgTester(unittest.TestCase):
            packing=True,
            max_length=256,
            dataset_num_proc=4,
            dataset_batch_size=512,
            neftune_noise_alpha=0.1,
            model_init_kwargs={"trust_remote_code": True},
            dataset_kwargs={"append_concat_token": True, "skip_prepare_dataset": True},

@@ -381,7 +380,6 @@ class TrainerArgTester(unittest.TestCase):
        self.assertEqual(trainer.args.packing, True)
        self.assertEqual(trainer.args.max_length, 256)
        self.assertEqual(trainer.args.dataset_num_proc, 4)
        self.assertEqual(trainer.args.dataset_batch_size, 512)
        self.assertEqual(trainer.args.neftune_noise_alpha, 0.1)
        self.assertEqual(trainer.args.model_init_kwargs, {"trust_remote_code": True})
        self.assertIn("append_concat_token", trainer.args.dataset_kwargs)
@@ -95,6 +95,38 @@ class TestPad(unittest.TestCase):
        )
        self.assertTrue(torch.equal(output, expected))

    def test_pad_to_multiple_of_1(self):
        x = torch.tensor([1, 2, 3])
        y = torch.tensor([4, 5])
        # Max length is 3, pad to multiple of 4
        output = pad((x, y), padding_value=0, padding_side="right", pad_to_multiple_of=4)
        expected = torch.tensor([[1, 2, 3, 0], [4, 5, 0, 0]])
        self.assertTrue(torch.equal(output, expected))

    def test_pad_to_multiple_of_2(self):
        x = torch.tensor([1, 2, 3, 4, 5])
        y = torch.tensor([6, 7, 8])
        # Max length is 3, pad to multiple of 4
        output = pad((x, y), padding_value=0, padding_side="right", pad_to_multiple_of=4)
        expected = torch.tensor([[1, 2, 3, 4, 5, 0, 0, 0], [6, 7, 8, 0, 0, 0, 0, 0]])
        self.assertTrue(torch.equal(output, expected))

    def test_pad_to_multiple_of_side_left(self):
        x = torch.tensor([1, 2, 3, 4, 5])
        y = torch.tensor([6, 7, 8])
        # Max length is 3, pad to multiple of 4
        output = pad((x, y), padding_value=0, padding_side="left", pad_to_multiple_of=4)
        expected = torch.tensor([[0, 0, 0, 1, 2, 3, 4, 5], [0, 0, 0, 0, 0, 6, 7, 8]])
        self.assertTrue(torch.equal(output, expected))

    def test_pad_to_multiple_of_no_extra_padding(self):
        x = torch.tensor([1, 2, 3, 4])
        y = torch.tensor([5, 6, 7, 8])
        # Already multiple of 4
        output = pad((x, y), padding_value=0, padding_side="left", pad_to_multiple_of=4)
        expected = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
        self.assertTrue(torch.equal(output, expected))


@require_peft
class TestGetPEFTConfig(unittest.TestCase):
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.17.0"
__version__ = "0.18.0.dev0"

from typing import TYPE_CHECKING
@@ -46,7 +46,7 @@ def main():
    make_vllm_serve_parser(subparsers)

    # Parse the arguments
    args = parser.parse_args()
    args = parser.parse_args_and_config()[0]

    if args.command == "chat":
        (chat_args,) = parser.parse_args_and_config()
@@ -18,7 +18,6 @@ from copy import deepcopy
from dataclasses import dataclass
from typing import TYPE_CHECKING, Literal, Optional, Union

from accelerate.utils import is_deepspeed_available
from packaging import version
from transformers import PreTrainedModel, PreTrainedTokenizer

@@ -30,12 +29,10 @@ SUPPORTED_ARCHITECTURES = (
    AutoModelForSeq2SeqLMWithValueHead,
)

if is_deepspeed_available():
    import deepspeed

if TYPE_CHECKING:
    from accelerate import Accelerator
    from deepspeed.runtime.engine import DeepSpeedEngine
    from torch.nn import Module
    from torch.nn.parallel.distributed import DistributedDataParallel


@@ -167,6 +164,8 @@ def iter_params(module, recurse=False):

def add_hooks(model: "DeepSpeedEngine") -> None:
    """Adds the optimizer hooks from a DeepSpeed ZeRO-3 model."""
    import deepspeed

    if not hasattr(model, "optimizer"): # before the first training step, the model has no optimizer
        return
    if model.optimizer is not None and hasattr(model.optimizer, "parameter_offload"):

@@ -214,6 +213,8 @@ def unwrap_model_for_generation(
    if not gather_deepspeed3_params:
        yield accelerator.unwrap_model(model)
    else:
        import deepspeed

        with deepspeed.zero.GatheredParameters(model.parameters()):
            remove_hooks(model)
            yield accelerator.unwrap_model(model)

@@ -222,8 +223,13 @@ def unwrap_model_for_generation(
        yield unwrapped_model


def prepare_deepspeed(model, accelerator):
    # Adapted from accelerate: https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473
def prepare_deepspeed(model: "Module", accelerator: "Accelerator"):
    """Prepares the model for DeepSpeed inference or evaluation by initializing it with the appropriate configuration.

    Adapted from accelerate: https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473
    """
    import deepspeed  # local import (instead of top-level) to avoid DS init interfering with other backends (like vllm): https://github.com/deepspeedai/DeepSpeed/issues/7252

    deepspeed_plugin = accelerator.state.deepspeed_plugin
    config_kwargs = deepcopy(deepspeed_plugin.deepspeed_config)
    stage = config_kwargs["zero_optimization"]["stage"]
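Editor's note: with `prepare_deepspeed` now exposed as a shared helper, the per-trainer `_prepare_deepspeed` methods deleted in the diffs below become redundant. A minimal sketch of the call pattern the trainers switch to (names such as `prepare_ref_model`, `ref_model` and `is_deepspeed_enabled` are placeholders for this illustration):

```python
from accelerate import Accelerator
from trl.models import prepare_deepspeed

def prepare_ref_model(ref_model, accelerator: Accelerator, is_deepspeed_enabled: bool):
    """Sketch of how the trainers below prepare a frozen reference/teacher model."""
    if is_deepspeed_enabled:
        # Initialize the frozen model through the shared DeepSpeed helper
        return prepare_deepspeed(ref_model, accelerator)
    # Outside DeepSpeed, fall back to accelerate's evaluation-mode preparation
    return accelerator.prepare_model(ref_model, evaluation_mode=True)
```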
@@ -51,7 +51,7 @@ class ScriptArguments:
        type, inplace operation. See https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992.
    """

    dataset_name: str = field(metadata={"help": "Dataset name."})
    dataset_name: Optional[str] = field(default=None, metadata={"help": "Dataset name."})
    dataset_config: Optional[str] = field(
        default=None,
        metadata={
@@ -19,7 +19,6 @@ import textwrap
import warnings
from collections import defaultdict
from contextlib import contextmanager, nullcontext
from copy import deepcopy
from operator import itemgetter
from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union

@@ -32,7 +31,7 @@ import torch.nn.functional as F
import transformers
from accelerate import PartialState
from accelerate.logging import get_logger
from accelerate.utils import is_deepspeed_available, tqdm
from accelerate.utils import tqdm
from datasets import Dataset
from packaging import version
from torch.utils.data import DataLoader, SequentialSampler

@@ -56,7 +55,7 @@ from transformers.utils import is_peft_available

from ..data_utils import maybe_apply_chat_template
from ..import_utils import is_joblib_available
from ..models import PreTrainedModelWrapper, create_reference_model
from ..models import create_reference_model, prepare_deepspeed
from .bco_config import BCOConfig
from .utils import (
    DPODataCollatorWithPadding,

@@ -83,9 +82,6 @@ if is_sklearn_available():
if is_joblib_available():
    import joblib

if is_deepspeed_available():
    import deepspeed

if TYPE_CHECKING:
    from transformers import PreTrainedModel, PreTrainedTokenizer

@@ -712,7 +708,7 @@ class BCOTrainer(Trainer):
            )
        else:
            if self.is_deepspeed_enabled:
                self.ref_model = self._prepare_deepspeed(self.ref_model)
                self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
            else:
                self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)

@@ -846,37 +842,6 @@ class BCOTrainer(Trainer):

        return all_embeddings

    def _prepare_deepspeed(self, model: PreTrainedModelWrapper):
        # Adapted from accelerate: https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473
        deepspeed_plugin = self.accelerator.state.deepspeed_plugin
        config_kwargs = deepcopy(deepspeed_plugin.deepspeed_config)

        if model is not None:
            if hasattr(model, "config"):
                hidden_size = (
                    max(model.config.hidden_sizes)
                    if getattr(model.config, "hidden_sizes", None)
                    else getattr(model.config, "hidden_size", None)
                )
                if hidden_size is not None and config_kwargs["zero_optimization"]["stage"] == 3:
                    # Note that `stage3_prefetch_bucket_size` can produce DeepSpeed messages like: `Invalidate trace cache @ step 0: expected module 1, but got module 0`
                    # This is expected and is not an error, see: https://github.com/microsoft/DeepSpeed/discussions/4081
                    config_kwargs.update(
                        {
                            "zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
                            "zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size,
                            "zero_optimization.stage3_prefetch_bucket_size": 0.9 * hidden_size * hidden_size,
                        }
                    )

        # If ZeRO-3 is used, we shard both the active and reference model.
        # Otherwise, we assume the reference model fits in memory and is initialized on each device with ZeRO disabled (stage 0)
        if config_kwargs["zero_optimization"]["stage"] != 3:
            config_kwargs["zero_optimization"]["stage"] = 0
        model, *_ = deepspeed.initialize(model=model, config=config_kwargs)
        model.eval()
        return model

    def _save_optimizer_and_scheduler(self, output_dir):
        output_dir = output_dir if output_dir is not None else self.args.output_dir
        super()._save_optimizer_and_scheduler(output_dir)
@@ -19,7 +19,7 @@ import pandas as pd
import torch
from accelerate import Accelerator
from accelerate.state import AcceleratorState
from accelerate.utils import gather_object, is_comet_ml_available, is_deepspeed_available, is_wandb_available
from accelerate.utils import gather_object, is_comet_ml_available, is_wandb_available
from rich.console import Console, Group
from rich.live import Live
from rich.panel import Panel

@@ -44,9 +44,6 @@ from .judges import BasePairwiseJudge
from .utils import log_table_to_comet_experiment


if is_deepspeed_available():
    import deepspeed

if is_comet_ml_available():
    pass

@@ -115,6 +112,8 @@ class SyncRefModelCallback(TrainerCallback):
    def sync_target_model(model, target_model, alpha):
        deepspeed_plugin = AcceleratorState().deepspeed_plugin
        if deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3:
            import deepspeed

            with deepspeed.zero.GatheredParameters(
                list(model.parameters()) + list(target_model.parameters()), modifier_rank=0
            ):
@@ -19,7 +19,6 @@ import textwrap
import warnings
from collections import defaultdict
from contextlib import contextmanager, nullcontext
from copy import deepcopy
from dataclasses import dataclass
from typing import Any, Callable, Literal, Optional, Union

@@ -30,7 +29,7 @@ import torch.nn as nn
import torch.nn.functional as F
import transformers
from accelerate import PartialState
from accelerate.utils import is_deepspeed_available, tqdm
from accelerate.utils import tqdm
from datasets import Dataset, IterableDataset
from packaging import version
from torch.utils.data import DataLoader

@@ -53,7 +52,7 @@ from transformers.trainer_utils import EvalLoopOutput
from transformers.utils import is_peft_available, is_torch_xpu_available

from ..data_utils import maybe_apply_chat_template, maybe_extract_prompt
from ..models import PreTrainedModelWrapper, create_reference_model
from ..models import create_reference_model, prepare_deepspeed
from ..models.utils import prepare_fsdp
from .callbacks import SyncRefModelCallback
from .dpo_config import DPOConfig, FDivergenceConstants, FDivergenceType

@@ -80,9 +79,6 @@ if is_peft_available():
if is_wandb_available():
    import wandb

if is_deepspeed_available():
    import deepspeed


@dataclass
class DataCollatorForPreference(DataCollatorMixin):

@@ -184,7 +180,6 @@ class DPOTrainer(Trainer):
            Processing class used to process the data. If provided, will be used to automatically process the inputs
            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
            reuse the fine-tuned model.
            This supercedes the `tokenizer` argument, which is now deprecated.
        model_init (`Callable[[], transformers.PreTrainedModel]`):
            The model initializer to use for training. If None is specified, the default model initializer will be used.
        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):

@@ -510,7 +505,7 @@ class DPOTrainer(Trainer):
            )
        else:
            if self.is_deepspeed_enabled:
                self.ref_model = self._prepare_deepspeed(self.ref_model)
                self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
            elif self.is_fsdp_enabled:
                self.ref_model = prepare_fsdp(self.ref_model, self.accelerator)
            else:

@@ -676,37 +671,6 @@ class DPOTrainer(Trainer):

        return output

    def _prepare_deepspeed(self, model: PreTrainedModelWrapper):
        # Adapted from accelerate: https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473
        deepspeed_plugin = self.accelerator.state.deepspeed_plugin
        config_kwargs = deepcopy(deepspeed_plugin.deepspeed_config)

        if model is not None:
            if hasattr(model, "config"):
                hidden_size = (
                    max(model.config.hidden_sizes)
                    if getattr(model.config, "hidden_sizes", None)
                    else getattr(model.config, "hidden_size", None)
                )
                if hidden_size is not None and config_kwargs["zero_optimization"]["stage"] == 3:
                    # Note that `stage3_prefetch_bucket_size` can produce DeepSpeed messages like: `Invalidate trace cache @ step 0: expected module 1, but got module 0`
                    # This is expected and is not an error, see: https://github.com/microsoft/DeepSpeed/discussions/4081
                    config_kwargs.update(
                        {
                            "zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
                            "zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size,
                            "zero_optimization.stage3_prefetch_bucket_size": 0.9 * hidden_size * hidden_size,
                        }
                    )

        # If ZeRO-3 is used, we shard both the active and reference model.
        # Otherwise, we assume the reference model fits in memory and is initialized on each device with ZeRO disabled (stage 0)
        if config_kwargs["zero_optimization"]["stage"] != 3:
            config_kwargs["zero_optimization"]["stage"] = 0
        model, *_ = deepspeed.initialize(model=model, config=config_kwargs)
        model.eval()
        return model

    def _set_signature_columns_if_needed(self):
        # If `self.args.remove_unused_columns` is True, non-signature columns are removed.
        # By default, this method sets `self._signature_columns` to the model's expected inputs.
@@ -15,13 +15,11 @@
import os
import random
import textwrap
from copy import deepcopy
from typing import Any, Callable, Optional, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from accelerate.utils import is_deepspeed_available
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,

@@ -38,7 +36,7 @@ from transformers.trainer_callback import TrainerCallback
from transformers.trainer_utils import EvalPrediction
from transformers.utils import is_peft_available

from ..models import PreTrainedModelWrapper
from ..models import prepare_deepspeed
from ..models.utils import unwrap_model_for_generation
from .gkd_config import GKDConfig
from .sft_trainer import SFTTrainer

@@ -51,10 +49,6 @@ from .utils import (
)


if is_deepspeed_available():
    import deepspeed


if is_peft_available():
    from peft import PeftConfig

@@ -124,7 +118,7 @@ class GKDTrainer(SFTTrainer):
        disable_dropout_in_model(self.model)

        if self.is_deepspeed_enabled:
            self.teacher_model = self._prepare_deepspeed(teacher_model)
            self.teacher_model = prepare_deepspeed(teacher_model, self.accelerator)
        else:
            self.teacher_model = self.accelerator.prepare_model(teacher_model, evaluation_mode=True)

@@ -311,37 +305,6 @@ class GKDTrainer(SFTTrainer):
        loss = super().training_step(model, inputs, num_items_in_batch)
        return loss

    def _prepare_deepspeed(self, model: PreTrainedModelWrapper):
        # Adapted from accelerate: https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473
        deepspeed_plugin = self.accelerator.state.deepspeed_plugin
        config_kwargs = deepcopy(deepspeed_plugin.deepspeed_config)

        if model is not None:
            if hasattr(model, "config"):
                hidden_size = (
                    max(model.config.hidden_sizes)
                    if getattr(model.config, "hidden_sizes", None)
                    else getattr(model.config, "hidden_size", None)
                )
                if hidden_size is not None and config_kwargs["zero_optimization"]["stage"] == 3:
                    # Note that `stage3_prefetch_bucket_size` can produce DeepSpeed messages like: `Invalidate trace cache @ step 0: expected module 1, but got module 0`
                    # This is expected and is not an error, see: https://github.com/microsoft/DeepSpeed/discussions/4081
                    config_kwargs.update(
                        {
                            "zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
                            "zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size,
                            "zero_optimization.stage3_prefetch_bucket_size": 0.9 * hidden_size * hidden_size,
                        }
                    )

        # If ZeRO-3 is used, we shard both the active and reference model.
        # Otherwise, we assume the reference model fits in memory and is initialized on each device with ZeRO disabled (stage 0)
        if config_kwargs["zero_optimization"]["stage"] != 3:
            config_kwargs["zero_optimization"]["stage"] = 0
        model, *_ = deepspeed.initialize(model=model, config=config_kwargs)
        model.eval()
        return model

    def create_model_card(
        self,
        model_name: Optional[str] = None,
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings

from dataclasses import dataclass, field
from typing import Optional, Union

@@ -412,83 +412,3 @@ class GRPOConfig(TrainingArguments):
            "all prompts are logged."
        },
    )

    # Deprecated parameters
    vllm_device: Optional[str] = field(
        default=None,
        metadata={
            "help": "This parameter is deprecated and will be removed in version 0.18.0. To use vLLM, start a vLLM "
            "server with the `trl vllm-serve` command."
        },
    )
    vllm_gpu_memory_utilization: Optional[float] = field(
        default=None,
        metadata={
            "help": "This parameter is deprecated and will be removed in version 0.18.0. To control the GPU memory "
            "utilization for vLLM, you should now use the `gpu_memory_utilization` parameter in the vLLM server "
            "configuration."
        },
    )
    vllm_dtype: Optional[str] = field(
        default=None,
        metadata={
            "help": "This parameter is deprecated and will be removed in version 0.18.0. To control the data type for "
            "vLLM generation, you should now use the `dtype` parameter in the vLLM server configuration."
        },
    )
    vllm_max_model_len: Optional[int] = field(
        default=None,
        metadata={
            "help": "This parameter is deprecated and will be removed in version 0.18.0. To control the "
            "`max_model_len` for vLLM, you should now use the `max_model_len` parameter in the vLLM server "
            "configuration."
        },
    )
    vllm_enable_prefix_caching: Optional[bool] = field(
        default=None,
        metadata={
            "help": "This parameter is deprecated and will be removed in version 0.18.0. To control prefix caching in "
            "vLLM, you should now use the `enable_prefix_caching` parameter in the vLLM server configuration."
        },
    )

    def __post_init__(self):
        super().__post_init__()

        if self.vllm_device is not None:
            warnings.warn(
                "`vllm_device` is deprecated and will be removed in version 0.18.0. To use vLLM, start a vLLM server "
                "with the `trl vllm-serve` command.",
                DeprecationWarning,
            )

        if self.vllm_gpu_memory_utilization is not None:
            warnings.warn(
                "`vllm_gpu_memory_utilization` is deprecated and will be removed in v0.18. To control the GPU memory "
                "utilization for vLLM, you should now use the `gpu_memory_utilization` parameter in the vLLM server "
                "configuration.",
                DeprecationWarning,
            )

        if self.vllm_dtype is not None:
            warnings.warn(
                "`vllm_dtype` is deprecated and will be removed in version 0.18.0. To control the data type for vLLM "
                "generation, you should now use the `dtype` parameter in the vLLM server configuration.",
                DeprecationWarning,
            )

        if self.vllm_max_model_len is not None:
            warnings.warn(
                "`vllm_max_model_len` is deprecated and will be removed in version 0.18.0. To control the "
                "`max_model_len` for vLLM, you should now use the `max_model_len` parameter in the vLLM server "
                "configuration.",
                DeprecationWarning,
            )

        if self.vllm_enable_prefix_caching is not None:
            warnings.warn(
                "`vllm_enable_prefix_caching` is deprecated and will be removed in version 0.18.0. To control prefix "
                "caching in vLLM, you should now use the `enable_prefix_caching` parameter in the vLLM server "
                "configuration.",
                DeprecationWarning,
            )
@@ -47,7 +47,7 @@ from transformers.utils import is_datasets_available, is_peft_available
from ..data_utils import apply_chat_template, is_conversational, maybe_apply_chat_template
from ..extras.profiling import profiling_context, profiling_decorator
from ..extras.vllm_client import VLLMClient
from ..import_utils import is_deepspeed_available, is_liger_kernel_available, is_rich_available, is_vllm_available
from ..import_utils import is_liger_kernel_available, is_rich_available, is_vllm_available
from ..models import create_reference_model, prepare_deepspeed, unwrap_model_for_generation
from .callbacks import SyncRefModelCallback
from .grpo_config import GRPOConfig

@@ -61,9 +61,6 @@ from .utils import (
)


if is_deepspeed_available():
    import deepspeed

if is_peft_available():
    from peft import PeftConfig, get_peft_model

@@ -175,15 +172,6 @@ class RepeatSampler(Sampler):
        return self.num_samples * self.mini_repeat_count * self.repeat_count


class RepeatRandomSampler(RepeatSampler):
    def __init__(self, *args, **kwargs):
        warnings.warn(
            "RepeatRandomSampler is deprecated and will be removed in version 0.18. Use RepeatSampler instead.",
            DeprecationWarning,
        )
        super().__init__(*args, **kwargs)


# torch.nanstd doesn't exist, so we define it here
def nanstd(tensor: torch.Tensor) -> torch.Tensor:
    """
@@ -809,11 +797,29 @@ class GRPOTrainer(Trainer):
        last_hidden_state = last_hidden_state[:, -logits_to_keep:, :]  # (B, logits_to_keep, H)
        return last_hidden_state

    # Get the per-token log probabilities for the completions for the model and the reference model
    @profiling_decorator
    def _get_per_token_logps(self, model, input_ids, attention_mask, logits_to_keep, batch_size=None) -> torch.Tensor:
    def _get_per_token_logps_and_probs(
        self, model, input_ids, attention_mask, logits_to_keep, batch_size=None
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Get both per-token log probabilities and probability distributions for completions in a single forward pass.

        Args:
            model: The model to generate logits.
            input_ids: The input token IDs.
            attention_mask: The attention mask.
            logits_to_keep: The number of logits to keep (corresponding to completion length).
            batch_size: Optional batch size to process inputs in chunks to reduce memory peak.

        Returns:
            tuple: (per_token_logps, probs)
                - per_token_logps: Tensor of shape (batch_size, logits_to_keep) containing log probabilities
                - probs: Tensor of shape (batch_size, logits_to_keep, vocab_size) containing full probability distributions
        """
        batch_size = batch_size or input_ids.size(0)  # Chunk inputs into smaller batches to reduce memory peak
        all_logps = []
        all_probs = []

        for i in range(0, input_ids.size(0), batch_size):
            input_ids_batch = input_ids[i : i + batch_size]
            attention_mask_batch = attention_mask[i : i + batch_size]

@@ -827,19 +833,49 @@ class GRPOTrainer(Trainer):
            # For transformers<=4.48, logits_to_keep argument isn't supported, so here we drop logits ourselves.
            # See https://github.com/huggingface/trl/issues/2770
            logits = logits[:, -logits_to_keep:]
            # Divide logits by sampling temperature.
            # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details
            logits = logits / self.temperature
            logps = selective_log_softmax(logits, input_ids_batch)  # compute logprobs for the input tokens

            # Divide logits by sampling temperature
            scaled_logits = logits / self.temperature

            # Compute full probability distributions
            probs = torch.softmax(scaled_logits, dim=-1)  # (B, seq_len, vocab_size)
            all_probs.append(probs)

            # Compute log probs for the selected tokens (original functionality)
            logps = selective_log_softmax(scaled_logits, input_ids_batch)  # (B, seq_len)
            all_logps.append(logps)
        return torch.cat(all_logps, dim=0)

        return torch.cat(all_logps, dim=0), torch.cat(all_probs, dim=0)
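Editor's note: for readers unfamiliar with `selective_log_softmax`, it simply gathers the log-probability of each target token from the full log-softmax. A minimal equivalent is sketched below (an illustration only, not TRL's implementation, which additionally chunks the computation for memory):

```python
import torch

def selective_log_softmax_sketch(logits: torch.Tensor, index: torch.Tensor) -> torch.Tensor:
    # logits: (B, T, V); index: (B, T) token ids whose log-probabilities we want
    logps = torch.log_softmax(logits, dim=-1)  # (B, T, V)
    # Pick out the log-probability assigned to each target token
    return logps.gather(dim=-1, index=index.unsqueeze(-1)).squeeze(-1)  # (B, T)
```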
    @profiling_decorator
    def _get_per_token_logps(self, model, input_ids, attention_mask, logits_to_keep, batch_size=None) -> torch.Tensor:
        """
        Get the per-token log probabilities for the completions (backward compatible version).

        Args:
            model: The model to generate logits.
            input_ids: The input token IDs.
            attention_mask: The attention mask.
            logits_to_keep: The number of logits to keep (corresponding to completion length).
            batch_size: Optional batch size to process inputs in chunks.

        Returns:
            Tensor of shape (batch_size, logits_to_keep) containing log probabilities
        """
        logps, _ = self._get_per_token_logps_and_probs(model, input_ids, attention_mask, logits_to_keep, batch_size)
        return logps

    @profiling_decorator
    def _move_model_to_vllm(self):
        # For DeepSpeed ZeRO-3, we need to gather all parameters before operations
        deepspeed_plugin = self.accelerator.state.deepspeed_plugin
        zero_stage_3 = deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3
        gather_if_zero3 = deepspeed.zero.GatheredParameters if zero_stage_3 else nullcontext
        if zero_stage_3:
            import deepspeed

            gather_if_zero3 = deepspeed.zero.GatheredParameters
        else:
            gather_if_zero3 = nullcontext

        if is_peft_model(self.model):
            # With PEFT and DeepSpeed ZeRO Stage 3, we must gather the full model at once before merging, as merging
@@ -891,7 +927,7 @@ class GRPOTrainer(Trainer):
        # - Completions are generated for each batch without buffering or reuse
        # Returns a single local batch in both cases.

        mode = "eval" if self.control.should_evaluate else "train"
        mode = "train" if self.model.training else "eval"
        if mode == "train":
            generate_every = self.args.gradient_accumulation_steps * self.num_iterations
            if self._step % generate_every == 0 or self._buffered_inputs is None:

@@ -911,7 +947,7 @@ class GRPOTrainer(Trainer):
        self, inputs: list[dict[str, Union[torch.Tensor, Any]]]
    ) -> dict[str, Union[torch.Tensor, Any]]:
        device = self.accelerator.device
        mode = "eval" if self.control.should_evaluate else "train"
        mode = "train" if self.model.training else "eval"

        prompts = [x["prompt"] for x in inputs]
        prompts_text = [maybe_apply_chat_template(example, self.processing_class)["prompt"] for example in inputs]

@@ -1170,7 +1206,7 @@ class GRPOTrainer(Trainer):
        mean_kl = metrics[0] if self.beta != 0.0 else None
        clip_ratio = metrics[-1]

        mode = "eval" if self.control.should_evaluate else "train"
        mode = "train" if self.model.training else "eval"
        if self.beta != 0.0:
            self._metrics[mode]["kl"].append(self.accelerator.gather_for_metrics(mean_kl).mean().item())
        self._metrics[mode]["clip_ratio"].append(self.accelerator.gather_for_metrics(clip_ratio).mean().item())
@@ -1194,8 +1230,25 @@ class GRPOTrainer(Trainer):
        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
        logits_to_keep = completion_ids.size(1)  # we only need to compute the logits for the completion tokens

        per_token_logps = self._get_per_token_logps(model, input_ids, attention_mask, logits_to_keep)
        # Get both log probabilities and full probability distributions
        per_token_logps, probs = self._get_per_token_logps_and_probs(model, input_ids, attention_mask, logits_to_keep)

        # Calculate entropy from the full probability distributions
        eps = 1e-8  # Small epsilon to avoid log(0)
        entropy_per_token = -torch.sum(probs * torch.log(probs + eps), dim=-1)  # (batch_size, seq_len)

        # Average entropy across valid completion tokens (masked for padding)
        mean_entropy = (entropy_per_token * completion_mask).sum(dim=1) / completion_mask.sum(dim=1).clamp(min=1.0)

        # Log entropy metrics
        mode = "train" if self.model.training else "eval"
        agg_entropy = self.accelerator.gather_for_metrics(mean_entropy)
        self._metrics[mode]["completions/entropy/mean"].append(agg_entropy.nanmean().item())
        self._metrics[mode]["completions/entropy/min"].append(nanmin(agg_entropy).item())
        self._metrics[mode]["completions/entropy/max"].append(nanmax(agg_entropy).item())
        self._metrics[mode]["completions/entropy/std"].append(nanstd(agg_entropy).item())

        # Continue with the original loss computation...
        # Compute the KL divergence between the model and the reference model
        if self.beta != 0.0:
            ref_per_token_logps = inputs["ref_per_token_logps"]
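Editor's note: the entropy bookkeeping added above is the Shannon entropy of each next-token distribution, averaged over non-padded completion tokens. A self-contained toy version of the same computation (shapes and the mask chosen purely for illustration):

```python
import torch

B, T, V = 2, 4, 10                                   # batch, completion length, vocab (toy sizes)
logits = torch.randn(B, T, V)
probs = torch.softmax(logits, dim=-1)                # per-token probability distributions
eps = 1e-8                                           # avoid log(0)
entropy_per_token = -(probs * torch.log(probs + eps)).sum(dim=-1)            # (B, T)
completion_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])                 # 0 = padding
mean_entropy = (entropy_per_token * completion_mask).sum(dim=1) / completion_mask.sum(dim=1).clamp(min=1.0)
print(mean_entropy.shape)  # torch.Size([2]) — one averaged entropy value per sequence
```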
@@ -1226,7 +1279,7 @@ class GRPOTrainer(Trainer):
            raise ValueError(f"Unknown loss type: {self.loss_type}")

        # Log the metrics
        mode = "eval" if self.control.should_evaluate else "train"
        mode = "train" if self.model.training else "eval"

        if self.beta != 0.0:
            mean_kl = (per_token_kl * completion_mask).sum() / completion_mask.sum()

@@ -1260,7 +1313,7 @@ class GRPOTrainer(Trainer):
        return loss, None, None

    def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
        mode = "eval" if self.control.should_evaluate else "train"
        mode = "train" if self.model.training else "eval"
        metrics = {key: sum(val) / len(val) for key, val in self._metrics[mode].items()}  # average the metrics

        # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs`
@@ -19,7 +19,6 @@ import textwrap
import warnings
from collections import defaultdict
from contextlib import contextmanager, nullcontext
from copy import deepcopy
from operator import itemgetter
from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union

@@ -31,7 +30,7 @@ import torch.nn as nn
import torch.nn.functional as F
import transformers
from accelerate import PartialState
from accelerate.utils import is_deepspeed_available, tqdm
from accelerate.utils import tqdm
from datasets import Dataset, concatenate_datasets
from packaging import version
from torch.utils.data import DataLoader, SequentialSampler

@@ -54,7 +53,7 @@ from transformers.utils import is_peft_available

from ..data_utils import maybe_apply_chat_template, maybe_extract_prompt, maybe_unpair_preference_dataset
from ..import_utils import is_liger_kernel_available
from ..models import PreTrainedModelWrapper, create_reference_model
from ..models import create_reference_model, prepare_deepspeed
from .kto_config import KTOConfig
from .utils import (
    DPODataCollatorWithPadding,

@@ -68,9 +67,6 @@ from .utils import (
)


if is_deepspeed_available():
    import deepspeed

if is_liger_kernel_available():
    from liger_kernel.chunked_loss import LigerFusedLinearKTOLoss

@@ -779,7 +775,7 @@ class KTOTrainer(Trainer):
            )
        else:
            if self.is_deepspeed_enabled:
                self.ref_model = self._prepare_deepspeed(self.ref_model)
                self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
            else:
                self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)

@@ -808,37 +804,6 @@ class KTOTrainer(Trainer):
            ignore_index=self.label_pad_token_id, beta=self.beta, use_ref_model=(self.ref_model is not None)
        )

    def _prepare_deepspeed(self, model: PreTrainedModelWrapper):
        # Adapted from accelerate: https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473
        deepspeed_plugin = self.accelerator.state.deepspeed_plugin
        config_kwargs = deepcopy(deepspeed_plugin.deepspeed_config)

        if model is not None:
            if hasattr(model, "config"):
                hidden_size = (
                    max(model.config.hidden_sizes)
                    if getattr(model.config, "hidden_sizes", None)
                    else getattr(model.config, "hidden_size", None)
                )
                if hidden_size is not None and config_kwargs["zero_optimization"]["stage"] == 3:
                    # Note that `stage3_prefetch_bucket_size` can produce DeepSpeed messages like: `Invalidate trace cache @ step 0: expected module 1, but got module 0`
                    # This is expected and is not an error, see: https://github.com/microsoft/DeepSpeed/discussions/4081
                    config_kwargs.update(
                        {
                            "zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
                            "zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size,
                            "zero_optimization.stage3_prefetch_bucket_size": 0.9 * hidden_size * hidden_size,
                        }
                    )

        # If ZeRO-3 is used, we shard both the active and reference model.
        # Otherwise, we assume the reference model fits in memory and is initialized on each device with ZeRO disabled (stage 0)
        if config_kwargs["zero_optimization"]["stage"] != 3:
            config_kwargs["zero_optimization"]["stage"] = 0
        model, *_ = deepspeed.initialize(model=model, config=config_kwargs)
        model.eval()
        return model

    @contextmanager
    def null_ref_context(self):
        """Context manager for handling null reference model (that is, peft adapter manipulation)."""
@@ -19,7 +19,6 @@ import textwrap
import warnings
from collections import defaultdict
from contextlib import nullcontext
from copy import deepcopy
from typing import Any, Callable, Literal, Optional, Union

import numpy as np

@@ -30,7 +29,6 @@ import torch.nn as nn
import torch.nn.functional as F
import transformers
from accelerate import PartialState
from accelerate.utils import is_deepspeed_available
from datasets import Dataset
from packaging import version
from torch.utils.data import DataLoader

@@ -52,7 +50,6 @@ from transformers.trainer_utils import EvalLoopOutput
from transformers.utils import is_peft_available, is_torch_fx_proxy

from ..data_utils import maybe_apply_chat_template, maybe_extract_prompt
from ..models import PreTrainedModelWrapper
from .orpo_config import ORPOConfig
from .utils import (
    DPODataCollatorWithPadding,

@@ -75,9 +72,6 @@ if is_peft_available():
if is_wandb_available():
    import wandb

if is_deepspeed_available():
    import deepspeed

if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

@@ -358,37 +352,6 @@ class ORPOTrainer(Trainer):
                "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`."
            )

    def _prepare_deepspeed(self, model: PreTrainedModelWrapper):
        # Adapted from accelerate: https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473
        deepspeed_plugin = self.accelerator.state.deepspeed_plugin
        config_kwargs = deepcopy(deepspeed_plugin.deepspeed_config)

        if model is not None:
            if hasattr(model, "config"):
                hidden_size = (
                    max(model.config.hidden_sizes)
                    if getattr(model.config, "hidden_sizes", None)
                    else getattr(model.config, "hidden_size", None)
                )
                if hidden_size is not None and config_kwargs["zero_optimization"]["stage"] == 3:
                    # Note that `stage3_prefetch_bucket_size` can produce DeepSpeed messages like: `Invalidate trace cache @ step 0: expected module 1, but got module 0`
                    # This is expected and is not an error, see: https://github.com/microsoft/DeepSpeed/discussions/4081
                    config_kwargs.update(
                        {
                            "zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
                            "zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size,
                            "zero_optimization.stage3_prefetch_bucket_size": 0.9 * hidden_size * hidden_size,
                        }
                    )

        # If ZeRO-3 is used, we shard both the active and reference model.
        # Otherwise, we assume the reference model fits in memory and is initialized on each device with ZeRO disabled (stage 0)
        if config_kwargs["zero_optimization"]["stage"] != 3:
            config_kwargs["zero_optimization"]["stage"] = 0
        model, *_ = deepspeed.initialize(model=model, config=config_kwargs)
        model.eval()
        return model

    def build_tokenized_answer(self, prompt, answer):
        """
        Llama tokenizer does satisfy `enc(a + b) = enc(a) + enc(b)`.
@@ -62,6 +62,8 @@ class SFTConfig(TrainingArguments):
            continuous sequence. This reduces memory usage by eliminating padding overhead. Currently, this is only
            supported with the `flash_attention_2` attention implementation, which can efficiently handle the flattened
            batch structure.
        pad_to_multiple_of (`int` or `None`, *optional*, defaults to `None`):
            If set, the sequences will be padded to a multiple of this value.
        eval_packing (`bool` or `None`, *optional*, defaults to `None`):
            Whether to pack the eval dataset. If `None`, uses the same value as `packing`.

@@ -140,6 +142,10 @@ class SFTConfig(TrainingArguments):
            "handle the flattened batch structure."
        },
    )
    pad_to_multiple_of: Optional[int] = field(
        default=None,
        metadata={"help": "If set, the sequences will be padded to a multiple of this value."},
    )
    eval_packing: Optional[bool] = field(
        default=None,
        metadata={"help": "Whether to pack the eval dataset. If `None`, uses the same value as `packing`."},
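Editor's note: a quick usage sketch of the new option. Setting `pad_to_multiple_of` in `SFTConfig` makes the default collator round every padded batch length up to a multiple of the given value, which suits kernels that prefer fixed-size buckets (the output directory below is just a placeholder):

```python
from trl import SFTConfig

# Pad each batch up to a multiple of 8 tokens; passed to SFTTrainer via `args=training_args`
training_args = SFTConfig(output_dir="out", pad_to_multiple_of=8)
```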
@@ -167,77 +173,19 @@ class SFTConfig(TrainingArguments):
    )

    # Deprecated parameters
    dataset_batch_size: Optional[int] = field(
        default=None,
        metadata={
            "help": "This parameter is deprecated and will be removed in version 0.18.0. You can safely remove this "
            "parameter from your configuration."
        },
    )
    num_of_sequences: Optional[int] = field(
        default=None,
        metadata={
            "help": "This parameter is deprecated and will be removed in version 0.18.0. Use `max_length` instead, "
            "which specifies the maximum length of the tokenized sequence, unlike `num_of_sequences`, which referred "
            "to string sequences."
        },
    )
    chars_per_token: Optional[float] = field(
        default=None,
        metadata={
            "help": "This parameter is deprecated and will be removed in version 0.18.0. If you want to customize the "
            "packing length, use `max_length`."
        },
    )
    max_seq_length: Optional[int] = field(
        default=None,
        metadata={
            "help": "This parameter is deprecated and will be removed in version 0.20.0. Use `max_length` instead."
        },
    )
    use_liger: Optional[bool] = field(
        default=None,
        metadata={
            "help": "This parameter is deprecated and will be removed in version 0.18.0. Use `use_liger_kernel` "
            "instead."
        },
    )

    def __post_init__(self):
        super().__post_init__()

        if self.dataset_batch_size is not None:
            warnings.warn(
                "`dataset_batch_size` is deprecated and will be removed in version 0.18.0. You can safely remove this "
                "parameter from your configuration.",
                DeprecationWarning,
            )

        if self.num_of_sequences is not None:
            warnings.warn(
                "`num_of_sequences` is deprecated and will be removed in version 0.18.0. Use `max_length` instead, "
                "which specifies the maximum length of the tokenized sequence, unlike `num_of_sequences`, which "
                "referred to string sequences.",
                DeprecationWarning,
            )

        if self.chars_per_token is not None:
            warnings.warn(
                "`chars_per_token` is deprecated and will be removed in version 0.18.0. If you want to customize the "
                "packing length, use `max_length`.",
                DeprecationWarning,
            )

        if self.max_seq_length is not None:
            warnings.warn(
                "`max_seq_length` is deprecated and will be removed in version 0.20.0. Use `max_length` instead.",
                DeprecationWarning,
            )
            self.max_length = self.max_seq_length

        if self.use_liger is not None:
            warnings.warn(
                "`use_liger` is deprecated and will be removed in version 0.18.0. Use `use_liger_kernel` instead.",
                DeprecationWarning,
            )
            self.use_liger_kernel = self.use_liger
@@ -80,7 +80,9 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
            Token ID to use for padding.
        completion_only_loss (`bool`, *optional*, defaults to `True`):
            When the input contains a completion mask (`completion_mask`), the labels are set to -100 for the tokens
            that are not in the completion.
            that are no in the completion.
        pad_to_multiple_of (`int` or `None`, *optional*, defaults to `None`):
            If set, the sequences will be padded to a multiple of this value.
        return_tensors (`str`, *optional*, defaults to `"pt"`):
            Type of Tensor to return. Only `"pt"` is currently supported.

@@ -116,6 +118,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):

    pad_token_id: int
    completion_only_loss: bool = True
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:

@@ -128,11 +131,22 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):

        # Pad
        output = {}
        output["input_ids"] = pad(input_ids, padding_value=self.pad_token_id, padding_side="right")
        output["attention_mask"] = pad(attention_mask, padding_value=0, padding_side="right")
        output["labels"] = pad(labels, padding_value=-100, padding_side="right")
        output["input_ids"] = pad(
            input_ids,
            padding_value=self.pad_token_id,
            padding_side="right",
            pad_to_multiple_of=self.pad_to_multiple_of,
        )
        output["attention_mask"] = pad(
            attention_mask, padding_value=0, padding_side="right", pad_to_multiple_of=self.pad_to_multiple_of
        )
        output["labels"] = pad(
            labels, padding_value=-100, padding_side="right", pad_to_multiple_of=self.pad_to_multiple_of
        )
        if self.completion_only_loss and "completion_mask" in examples[0]:
            completion_mask = pad(completion_mask, padding_value=0, padding_side="right")
            completion_mask = pad(
                completion_mask, padding_value=0, padding_side="right", pad_to_multiple_of=self.pad_to_multiple_of
            )
            output["labels"][completion_mask == 0] = -100  # mask everything that is not in the completion

        return output
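Editor's note: a hedged sketch of the collator with the new argument. The import path and the example format (`input_ids` lists per example) are assumptions based on the surrounding hunks and the SFT data pipeline, not confirmed by this compare view:

```python
from trl.trainer.sft_trainer import DataCollatorForLanguageModeling  # assumed location

# pad_token_id=0 is a placeholder; pad_to_multiple_of=8 rounds the batch length up to a multiple of 8
collator = DataCollatorForLanguageModeling(pad_token_id=0, completion_only_loss=True, pad_to_multiple_of=8)
examples = [{"input_ids": [1, 2, 3]}, {"input_ids": [4, 5]}]  # assumed example format
batch = collator(examples)  # input_ids / attention_mask / labels padded on the right
```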
@@ -315,7 +329,9 @@ class SFTTrainer(Trainer):
                    f"`processing_class` ({processing_class.__class__.__name__}). Ensure that the `pad_token` exists "
                    "in the vocabulary before using it as a padding token."
                )
            data_collator = DataCollatorForLanguageModeling(pad_token_id, self.completion_only_loss)
            data_collator = DataCollatorForLanguageModeling(
                pad_token_id, self.completion_only_loss, args.pad_to_multiple_of
            )

        # Dataset
        preprocess_dataset = args.dataset_kwargs is None or not args.dataset_kwargs.get("skip_prepare_dataset", False)

@@ -650,7 +666,7 @@ class SFTTrainer(Trainer):
        """
        Compute training loss and additionally compute token accuracies
        """
        mode = "eval" if self.control.should_evaluate else "train"
        mode = "train" if self.model.training else "eval"
        (loss, outputs) = super().compute_loss(
            model, inputs, return_outputs=True, num_items_in_batch=num_items_in_batch
        )

@@ -695,7 +711,7 @@ class SFTTrainer(Trainer):
        return (loss, outputs) if return_outputs else loss

    def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
        mode = "eval" if self.control.should_evaluate else "train"
        mode = "train" if self.model.training else "eval"
        metrics = {key: sum(val) / len(val) for key, val in self._metrics[mode].items()}  # average the metrics

        # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs`
@@ -415,7 +415,12 @@ class RewardDataCollatorWithPadding:
        return batch


def pad(tensors: list[torch.Tensor], padding_value: int = 0, padding_side: str = "right") -> torch.Tensor:
def pad(
    tensors: list[torch.Tensor],
    padding_value: int = 0,
    padding_side: str = "right",
    pad_to_multiple_of: Optional[int] = None,
) -> torch.Tensor:
    """
    Pads a list of tensors to the same shape along the first dimension.

@@ -426,6 +431,8 @@ def pad(tensors: list[torch.Tensor], padding_value: int = 0, padding_side: str =
            Value to use for padding. Default is 0.
        padding_side (`str`):
            Side on which to add padding. Must be 'left' or 'right'. Default is 'right'.
        pad_to_multiple_of (`int`, *optional*, defaults to `None`):
            If set will pad the sequence to a multiple of the provided value.

    Returns:
        `torch.Tensor`:

@@ -446,18 +453,25 @@ def pad(tensors: list[torch.Tensor], padding_value: int = 0, padding_side: str =
    # Determine the maximum shape for each dimension
    output_shape = np.max([t.shape for t in tensors], 0).tolist()

    # Apply pad_to_multiple_of to the first (sequence) dimension
    if pad_to_multiple_of is not None:
        remainder = output_shape[0] % pad_to_multiple_of
        if remainder != 0:
            output_shape[0] += pad_to_multiple_of - remainder

    # Create an output tensor filled with the padding value
    output = torch.full((len(tensors), *output_shape), padding_value, dtype=tensors[0].dtype, device=tensors[0].device)

    for i, t in enumerate(tensors):
        # Determine the slice for the sequence dimension
        if padding_side == "left":
            seq_slice = slice(output_shape[0] - t.shape[0], output_shape[0])
            seq_start = output_shape[0] - t.shape[0]
        elif padding_side == "right":
            seq_slice = slice(0, t.shape[0])
            seq_start = 0
        else:
            raise ValueError("padding_side must be 'left' or 'right'")

        # Define the slices
        seq_slice = slice(seq_start, seq_start + t.shape[0])
        slices = (seq_slice,) + tuple(slice(0, s) for s in t.shape[1:])
        output[i][slices] = t
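Editor's note: putting the new `pad_to_multiple_of` parameter together, a quick usage sketch of `pad` mirroring the unit tests earlier in this compare:

```python
import torch
from trl.trainer.utils import pad

x, y = torch.tensor([1, 2, 3]), torch.tensor([4, 5])
out = pad((x, y), padding_value=0, padding_side="right", pad_to_multiple_of=4)
print(out)  # tensor([[1, 2, 3, 0], [4, 5, 0, 0]]) — max length 3 rounded up to a multiple of 4
```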