Deprecate unused dataset_formatting module (#4242)

Co-authored-by: behroozazarkhalili <ermiaazarkhalili>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
This commit is contained in:
Behrooz Azarkhalili
2025-10-10 08:16:18 -07:00
committed by GitHub
parent bcd059a384
commit 039d526d24
2 changed files with 34 additions and 0 deletions

View File

@ -14,6 +14,7 @@
from typing import Callable
import pytest
from datasets import Dataset, load_dataset
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
@ -23,6 +24,7 @@ from trl.models.utils import ChatMlSpecialTokens, clone_chat_template, setup_cha
from .testing_utils import TrlTestCase
@pytest.mark.filterwarnings("ignore::FutureWarning")
class TestDatasetFormatting(TrlTestCase):
def setup_method(self):
self.llama_tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-MistralForCausalLM-0.1")

View File

@ -13,6 +13,7 @@
# limitations under the License.
import logging
import warnings
from typing import Callable, Literal, Optional
import datasets
@ -41,7 +42,17 @@ def conversations_formatting_function(
r"""
Return a callable that takes a "messages" dataset and returns a formatted dataset by applying the tokenizer's
chat template to the dataset, along with the schema of the functions in the tools list.
.. deprecated:: 0.24.0
`conversations_formatting_function` is deprecated and will be removed in version 0.27.
Please use `tokenizer.apply_chat_template()` directly instead.
"""
warnings.warn(
"`conversations_formatting_function` is deprecated and will be removed in TRL 0.27. "
"Please use `tokenizer.apply_chat_template()` directly instead.",
DeprecationWarning,
stacklevel=2,
)
def format_dataset(examples):
if isinstance(examples[messages_field][0], list):
@ -61,7 +72,17 @@ def instructions_formatting_function(tokenizer: AutoTokenizer):
r"""
Return a callable that takes an "instructions" dataset and returns a formatted dataset by applying the
tokenizer's chat template to the dataset.
.. deprecated:: 0.24.0
`instructions_formatting_function` is deprecated and will be removed in version 0.27.
Please use `tokenizer.apply_chat_template()` directly instead.
"""
warnings.warn(
"`instructions_formatting_function` is deprecated and will be removed in TRL 0.27. "
"Please use `tokenizer.apply_chat_template()` directly instead.",
DeprecationWarning,
stacklevel=2,
)
def format_dataset(examples):
if isinstance(examples["prompt"], list):
@ -99,7 +120,18 @@ def get_formatting_func_from_dataset(
Returns:
Callable: Formatting function if the dataset format is supported else None
.. deprecated:: 0.24.0
`get_formatting_func_from_dataset` is deprecated and will be removed in version 0.27.
Please use `tokenizer.apply_chat_template()` directly instead.
"""
warnings.warn(
"`get_formatting_func_from_dataset` is deprecated and will be removed in TRL 0.27. "
"Please use `tokenizer.apply_chat_template()` directly instead.",
DeprecationWarning,
stacklevel=2,
)
if isinstance(dataset, Dataset):
if "messages" in dataset.features:
if dataset.features["messages"] == FORMAT_MAPPING["chatml"]: