diff --git a/tests/test_dataset_formatting.py b/tests/test_dataset_formatting.py index 80f65f964..59c8d717c 100644 --- a/tests/test_dataset_formatting.py +++ b/tests/test_dataset_formatting.py @@ -14,6 +14,7 @@ from typing import Callable +import pytest from datasets import Dataset, load_dataset from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer @@ -23,6 +24,7 @@ from trl.models.utils import ChatMlSpecialTokens, clone_chat_template, setup_cha from .testing_utils import TrlTestCase +@pytest.mark.filterwarnings("ignore::DeprecationWarning") class TestDatasetFormatting(TrlTestCase): def setup_method(self): self.llama_tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-MistralForCausalLM-0.1") diff --git a/trl/extras/dataset_formatting.py b/trl/extras/dataset_formatting.py index 92e932801..c4bf0e97f 100644 --- a/trl/extras/dataset_formatting.py +++ b/trl/extras/dataset_formatting.py @@ -13,6 +13,7 @@ # limitations under the License. import logging +import warnings from typing import Callable, Literal, Optional import datasets @@ -41,7 +42,17 @@ def conversations_formatting_function( r""" return a callable function that takes in a "messages" dataset and returns a formatted dataset, based on the tokenizer apply chat template to the dataset along with the schema of the list of functions in the tools list. + + .. deprecated:: 0.24.0 + `conversations_formatting_function` is deprecated and will be removed in version 0.27. + Please use `tokenizer.apply_chat_template()` directly instead. """ + warnings.warn( + "`conversations_formatting_function` is deprecated and will be removed in TRL 0.27. 
" + "Please use `tokenizer.apply_chat_template()` directly instead.", + DeprecationWarning, + stacklevel=2, + ) def format_dataset(examples): if isinstance(examples[messages_field][0], list): @@ -61,7 +72,17 @@ def instructions_formatting_function(tokenizer: AutoTokenizer): r""" return a callable function that takes in an "instructions" dataset and returns a formatted dataset, based on the tokenizer apply chat template to the dataset + + .. deprecated:: 0.24.0 + `instructions_formatting_function` is deprecated and will be removed in version 0.27. + Please use `tokenizer.apply_chat_template()` directly instead. """ + warnings.warn( + "`instructions_formatting_function` is deprecated and will be removed in TRL 0.27. " + "Please use `tokenizer.apply_chat_template()` directly instead.", + DeprecationWarning, + stacklevel=2, + ) def format_dataset(examples): if isinstance(examples["prompt"], list): @@ -99,7 +120,18 @@ def get_formatting_func_from_dataset( Returns: Callable: Formatting function if the dataset format is supported else None + + .. deprecated:: 0.24.0 + `get_formatting_func_from_dataset` is deprecated and will be removed in version 0.27. + Please use `tokenizer.apply_chat_template()` directly instead. """ + warnings.warn( + "`get_formatting_func_from_dataset` is deprecated and will be removed in TRL 0.27. " + "Please use `tokenizer.apply_chat_template()` directly instead.", + DeprecationWarning, + stacklevel=2, + ) + if isinstance(dataset, Dataset): if "messages" in dataset.features: if dataset.features["messages"] == FORMAT_MAPPING["chatml"]: