Introduction of a method evaluation suite. We generally face the problem that there is little knowledge of which PEFT methods perform best. To this end, we decided to build an evaluation suite with defined tasks and shared hyper-parameters that can be extended with new tasks and new method configurations over time. To keep results comparable, we decided not to incorporate user-submitted results, but we encourage users to inspect the results, suggest new experiments, and improve the configuration of methods if they're deemed unfavorable. As of now there is only one task, based on the MetaMathQA dataset, which has the benefit of being complex while still fitting on a consumer GPU.

Notable changes in this squash:

* Add default training params. The experiment-specific training params use the default training params but can override any parameter if needed. This makes it easier to apply a change to all experiments (say, changing the base model) without editing each individual training_parameters.json; see the sketch below.
* Add the possibility to change the attn implementation. However, both flash attention 2 and flex attention are slower on my system, so we stay with the default None (-> SDPA).
* Refactor to use GenerationConfig. This makes it easier to use, say, the static cache, which is the new default, as it's faster (apart from the first pass).
* Better parsing of answers, e.g. 1/2 == 0.5.
* Keep the adapter file by default after a train run, but add --clean to delete it. Keeping the adapter can be useful if the user wants to run further tests with the trained model.

---------

Co-authored-by: Benjamin Bossan <benjamin.bossan@gmail.com>
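To make the default/override mechanism concrete, here is a minimal sketch; the helper name and file paths are hypothetical, assuming shared defaults in a default_training_params.json that each experiment's training_parameters.json can partially override:

    import json

    def load_training_params(default_path: str, experiment_path: str) -> dict:
        # Load the shared defaults, then let the experiment override any subset of keys.
        with open(default_path) as f:
            params = json.load(f)
        with open(experiment_path) as f:
            params.update(json.load(f))  # experiment-specific values take precedence
        return params

    # e.g. params = load_training_params("default_training_params.json", "experiments/lora/training_parameters.json")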
# Copyright 2025-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
All utilities related to data handling.
"""

from functools import partial
from typing import Callable

import datasets
import numpy as np
from datasets import Dataset, load_dataset


# with a token limit of 768 for query + response, we have to exclude all texts with length > 1304; this leaves 93.8% of
# the dataset
CHAR_LIMIT = 1300
# train/valid/test split -- note that evaluation takes quite long, so don't choose too large sizes for the valid set,
# since it's run multiple times during training; test is only run once at the end and thus can be larger
VALID_SIZE = 50


def get_filtered_dataset(*, ds: datasets.Dataset, print_fn: Callable[..., None]) -> Dataset:
    """Return the filtered dataset, with long queries removed.

    We determined that 99% of queries have 529 or fewer characters. Characters roughly correspond to tokens, so this is
    a good proxy. We cannot use tokens directly, as that depends on the tokenizer, which can be different for each
    model, but we want the same filter for each model.

    """
    char_lengths = [len(f"{q} {r}") for q, r in zip(ds["query"], ds["response"])]
    idx_filtered = [i for i, length in enumerate(char_lengths) if length <= CHAR_LIMIT]
    print_fn(f"Filtered dataset: {100 * len(idx_filtered) / len(ds):.1f}% of the original dataset")
    return ds.select(idx_filtered)


def get_train_valid_test_datasets(
    *, tokenizer, query_template: str, print_fn: Callable[..., None]
) -> tuple[Dataset, Dataset, Dataset]:
    """
    Return the train, valid, and test splits of the dataset.

    We cannot use ds.train_test_split(..., stratify_by_column="type") as it gives:

    > ValueError: Stratifying by column is only supported for ClassLabel column, and column type is Value.

    even after calling ds_filtered.class_encode_column("type"). Instead, the validation split is taken as a seeded
    random subset of the GSM8K train set.
    """
    metamath = load_dataset("meta-math/MetaMathQA")["train"]
    metamath = get_filtered_dataset(ds=metamath, print_fn=print_fn)

    # gsm8k does not need to be filtered as query and response are short enough
    gsm8k = load_dataset("openai/gsm8k", "main")
    gsm8k = gsm8k.rename_columns({"question": "query", "answer": "response"})
    gsm8k_train = gsm8k["train"]
    gsm8k_test = gsm8k["test"]

    np.random.seed(0)
    indices = np.arange(len(gsm8k_train))
    np.random.shuffle(indices)
    idx_valid = indices[:VALID_SIZE]

    ds_train = metamath
    ds_valid = gsm8k_train.select(idx_valid)
    ds_test = gsm8k_test

    print_fn(f"Train size: {len(ds_train)}")
    print_fn(f"Valid size: {len(ds_valid)}")
    print_fn(f"Test size: {len(ds_test)}")

    tokenize_with_answer_ = partial(tokenize_with_answer, tokenizer=tokenizer, template=query_template)
    tokenize_wo_answer_ = partial(tokenize_wo_answer, tokenizer=tokenizer, template=query_template)
    ds_train = ds_train.map(tokenize_with_answer_, batched=True).remove_columns(["type", "query", "original_question"])
    ds_valid = ds_valid.map(tokenize_wo_answer_, batched=True).remove_columns(["query"])
    ds_test = ds_test.map(tokenize_wo_answer_, batched=True).remove_columns(["query"])

    return ds_train, ds_valid, ds_test


def tokenize_with_answer(samples, tokenizer, template):
    queries = [template.format(query=sample) + answer for sample, answer in zip(samples["query"], samples["response"])]
    tokenized = tokenizer(queries)
    # truncate to the model's maximum length; input_ids and attention_mask must stay aligned
    tokenized["input_ids"] = [input_ids[: tokenizer.model_max_length] for input_ids in tokenized["input_ids"]]
    tokenized["attention_mask"] = [
        attention_mask[: tokenizer.model_max_length] for attention_mask in tokenized["attention_mask"]
    ]
    return tokenized


def tokenize_wo_answer(samples, tokenizer, template):
    queries = [template.format(query=sample) for sample in samples["query"]]
    tokenized = tokenizer(queries)
    # truncate to the model's maximum length; input_ids and attention_mask must stay aligned
    tokenized["input_ids"] = [input_ids[: tokenizer.model_max_length] for input_ids in tokenized["input_ids"]]
    tokenized["attention_mask"] = [
        attention_mask[: tokenizer.model_max_length] for attention_mask in tokenized["attention_mask"]
    ]
    return tokenized
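

# Minimal usage sketch (illustrative only): the model checkpoint and the prompt
# template below are assumptions, not values defined by this module.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")  # hypothetical choice
    query_template = "Question: {query}\nAnswer: "  # hypothetical; the suite defines its own template
    ds_train, ds_valid, ds_test = get_train_valid_test_datasets(
        tokenizer=tokenizer, query_template=query_template, print_fn=print
    )
    print(ds_train[0].keys())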