Introduction of a method evaluation suite. We generally face the problem that there is little knowledge of which PEFT methods perform best. To this end, we decided to build an evaluation suite with defined tasks and shared hyper-parameters that can be extended with new tasks and new method configurations over time. To keep results comparable, we decided not to incorporate user-submitted results, but we encourage users to inspect the results, suggest new experiments, and improve the configuration of methods if they're deemed unfavorable. As of now there is only one task, based on the MetaMathQA dataset, which has the benefit of being complex while still fitting on a consumer GPU.

Notable changes in this squash:

* Add default training params. The experiment-specific training params use the default training params but can override any parameter if needed. This makes it easier to apply a change to all experiments (say, changing the base model) without editing each individual training_parameters.json; see the sketch below.
* Add the possibility to change the attn implementation. However, both flash attention 2 and flex attention are slower on my system, so we stay with the default None (-> SDPA).
* Refactor to use GenerationConfig. This makes it easier to use, say, the static cache, which is the new default, as it's faster (apart from the first pass).
* Better parsing of answers, e.g. 1/2 == 0.5.
* Keep the adapter file by default after a train run, but add --clean to delete it. Keeping the adapter can be useful if the user wants to run further tests with the trained model.

---------

Co-authored-by: Benjamin Bossan <benjamin.bossan@gmail.com>
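To make the default/override mechanism concrete, here is a minimal sketch; the helper name and file paths are hypothetical, assuming shared defaults in a default_training_params.json that each experiment's training_parameters.json can partially override:

    import json

    def load_training_params(default_path: str, experiment_path: str) -> dict:
        # Load the shared defaults, then let the experiment override any subset of keys.
        with open(default_path) as f:
            params = json.load(f)
        with open(experiment_path) as f:
            params.update(json.load(f))  # experiment-specific values take precedence
        return params

    # e.g. params = load_training_params("default_training_params.json", "experiments/lora/training_parameters.json")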
# Copyright 2025-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
All utilities related to data handling.
"""

from functools import partial
from typing import Callable

import datasets
import numpy as np
from datasets import Dataset, load_dataset


# with a token limit of 768 for query + response, we have to exclude all texts with length > 1304; this leaves 93.8% of
# the dataset
CHAR_LIMIT = 1300
# train/valid/test split -- note that evaluation takes quite long, so don't choose too large sizes for the valid set,
# since it's run multiple times during training; test is only run once at the end and thus can be larger
VALID_SIZE = 50


def get_filtered_dataset(*, ds: datasets.Dataset, print_fn: Callable[..., None]) -> Dataset:
    """Return the filtered dataset, with long queries removed.

    We determined that 99% of queries have 529 or fewer characters. Characters roughly correspond to tokens, so this is
    a good proxy. We cannot use tokens directly, as that depends on the tokenizer, which can be different for each
    model, but we want the same filter for each model.

    """
    char_lengths = [len(f"{q} {r}") for q, r in zip(ds["query"], ds["response"])]
    idx_filtered = [i for i, length in enumerate(char_lengths) if length <= CHAR_LIMIT]
    print_fn(f"Filtered dataset: {100 * len(idx_filtered) / len(ds):.1f}% of the original dataset")
    return ds.select(idx_filtered)


def get_train_valid_test_datasets(
    *, tokenizer, query_template: str, print_fn: Callable[..., None]
) -> tuple[Dataset, Dataset, Dataset]:
    """
    Return the train, valid, and test splits of the dataset.

    We cannot use ds.train_test_split(..., stratify_by_column="type") as it gives:

    > ValueError: Stratifying by column is only supported for ClassLabel column, and column type is Value.

    even after calling ds_filtered.class_encode_column("type"). Instead, the validation split is taken as a seeded
    random subset of the GSM8K train set.
    """
    metamath = load_dataset("meta-math/MetaMathQA")["train"]
    metamath = get_filtered_dataset(ds=metamath, print_fn=print_fn)

    # gsm8k does not need to be filtered as query and response are short enough
    gsm8k = load_dataset("openai/gsm8k", "main")
    gsm8k = gsm8k.rename_columns({"question": "query", "answer": "response"})
    gsm8k_train = gsm8k["train"]
    gsm8k_test = gsm8k["test"]

    np.random.seed(0)
    indices = np.arange(len(gsm8k_train))
    np.random.shuffle(indices)
    idx_valid = indices[:VALID_SIZE]

    ds_train = metamath
    ds_valid = gsm8k_train.select(idx_valid)
    ds_test = gsm8k_test

    print_fn(f"Train size: {len(ds_train)}")
    print_fn(f"Valid size: {len(ds_valid)}")
    print_fn(f"Test size: {len(ds_test)}")

    tokenize_with_answer_ = partial(tokenize_with_answer, tokenizer=tokenizer, template=query_template)
    tokenize_wo_answer_ = partial(tokenize_wo_answer, tokenizer=tokenizer, template=query_template)
    ds_train = ds_train.map(tokenize_with_answer_, batched=True).remove_columns(["type", "query", "original_question"])
    ds_valid = ds_valid.map(tokenize_wo_answer_, batched=True).remove_columns(["query"])
    ds_test = ds_test.map(tokenize_wo_answer_, batched=True).remove_columns(["query"])

    return ds_train, ds_valid, ds_test


def tokenize_with_answer(samples, tokenizer, template):
    queries = [template.format(query=sample) + answer for sample, answer in zip(samples["query"], samples["response"])]
    tokenized = tokenizer(queries)
    # truncate to the model's maximum length; input_ids and attention_mask must stay aligned
    tokenized["input_ids"] = [input_ids[: tokenizer.model_max_length] for input_ids in tokenized["input_ids"]]
    tokenized["attention_mask"] = [
        attention_mask[: tokenizer.model_max_length] for attention_mask in tokenized["attention_mask"]
    ]
    return tokenized


def tokenize_wo_answer(samples, tokenizer, template):
    queries = [template.format(query=sample) for sample in samples["query"]]
    tokenized = tokenizer(queries)
    # truncate to the model's maximum length; input_ids and attention_mask must stay aligned
    tokenized["input_ids"] = [input_ids[: tokenizer.model_max_length] for input_ids in tokenized["input_ids"]]
    tokenized["attention_mask"] = [
        attention_mask[: tokenizer.model_max_length] for attention_mask in tokenized["attention_mask"]
    ]
    return tokenized
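

# Minimal usage sketch (illustrative only): the model checkpoint and the prompt
# template below are assumptions, not values defined by this module.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")  # hypothetical choice
    query_template = "Question: {query}\nAnswer: "  # hypothetical; the suite defines its own template
    ds_train, ds_valid, ds_test = get_train_valid_test_datasets(
        tokenizer=tokenizer, query_template=query_template, print_fn=print
    )
    print(ds_train[0].keys())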