mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-10-20 12:54:18 +08:00
[v1] add v1 launcher (#9236)
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@ -1,82 +0,0 @@
|
||||
# Copyright 2025 the LlamaFactory team.
|
||||
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
import datasets
|
||||
|
||||
|
||||
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
|
||||
|
||||
_DESCRIPTION = "BELLE multiturn chat dataset."
|
||||
|
||||
_CITATION = """\
|
||||
@article{belle2023exploring,
|
||||
title={Exploring the Impact of Instruction Data Scaling on Large Language Models},
|
||||
author={Yunjie Ji, Yong Deng, Yan Gong, Yiping Peng, Qiang Niu, Lei Zhang, Baochang Ma, Xiangang Li},
|
||||
journal={arXiv preprint arXiv:2303.14742},
|
||||
year={2023}
|
||||
}
|
||||
"""
|
||||
|
||||
_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M"
|
||||
_LICENSE = "gpl-3.0"
|
||||
_URL = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"
|
||||
|
||||
|
||||
class BelleMultiturn(datasets.GeneratorBasedBuilder):
|
||||
VERSION = datasets.Version("0.0.0")
|
||||
|
||||
def _info(self):
|
||||
features = datasets.Features(
|
||||
{"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
|
||||
)
|
||||
return datasets.DatasetInfo(
|
||||
description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
|
||||
)
|
||||
|
||||
def _split_generators(self, dl_manager: datasets.DownloadManager):
|
||||
file_path = dl_manager.download(_URL)
|
||||
return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]
|
||||
|
||||
def _generate_examples(self, filepath: str):
|
||||
with open(filepath, encoding="utf-8") as f:
|
||||
for key, row in enumerate(f):
|
||||
data = json.loads(row)
|
||||
conversations = []
|
||||
prompt = data["instruction"].strip()
|
||||
response = data["output"].strip()
|
||||
|
||||
assist_idx = prompt.rfind("Assistant:")
|
||||
human_idx = prompt.rfind("Human:")
|
||||
query = prompt[human_idx + 6 : assist_idx].strip()
|
||||
prompt = prompt[:human_idx].strip()
|
||||
conversations.insert(0, {"from": "gpt", "value": response})
|
||||
conversations.insert(0, {"from": "human", "value": query})
|
||||
|
||||
while prompt.rfind("Assistant:") != -1:
|
||||
assist_idx = prompt.rfind("Assistant:")
|
||||
human_idx = prompt.rfind("Human:")
|
||||
if human_idx != -1:
|
||||
old_query = prompt[human_idx + 6 : assist_idx].strip()
|
||||
old_resp = prompt[assist_idx + 10 :].strip()
|
||||
conversations.insert(0, {"from": "gpt", "value": old_resp})
|
||||
conversations.insert(0, {"from": "human", "value": old_query})
|
||||
else:
|
||||
break
|
||||
prompt = prompt[:human_idx].strip()
|
||||
|
||||
yield key, {"conversations": conversations}
|
@ -143,14 +143,6 @@
|
||||
"hf_hub_url": "BelleGroup/school_math_0.25M",
|
||||
"ms_hub_url": "AI-ModelScope/school_math_0.25M"
|
||||
},
|
||||
"belle_multiturn": {
|
||||
"script_url": "belle_multiturn",
|
||||
"formatting": "sharegpt"
|
||||
},
|
||||
"ultra_chat": {
|
||||
"script_url": "ultra_chat",
|
||||
"formatting": "sharegpt"
|
||||
},
|
||||
"open_platypus": {
|
||||
"hf_hub_url": "garage-bAInd/Open-Platypus",
|
||||
"ms_hub_url": "AI-ModelScope/Open-Platypus"
|
||||
@ -583,16 +575,6 @@
|
||||
"system": "system"
|
||||
}
|
||||
},
|
||||
"hh_rlhf_en": {
|
||||
"script_url": "hh_rlhf_en",
|
||||
"ranking": true,
|
||||
"columns": {
|
||||
"prompt": "instruction",
|
||||
"chosen": "chosen",
|
||||
"rejected": "rejected",
|
||||
"history": "history"
|
||||
}
|
||||
},
|
||||
"nectar_rm": {
|
||||
"hf_hub_url": "AstraMindAI/RLAIF-Nectar",
|
||||
"ms_hub_url": "AI-ModelScope/RLAIF-Nectar",
|
||||
|
@ -1,98 +0,0 @@
|
||||
# Copyright 2025 the LlamaFactory team.
|
||||
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
import datasets
|
||||
|
||||
|
||||
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
|
||||
_DESCRIPTION = "Human preference data about helpfulness and harmlessness."
|
||||
_CITATION = ""
|
||||
_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf"
|
||||
_LICENSE = "mit"
|
||||
_URL = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf/resolve/main/"
|
||||
_URLS = {
|
||||
"train": [
|
||||
_URL + "harmless-base/train.jsonl.gz",
|
||||
_URL + "helpful-base/train.jsonl.gz",
|
||||
_URL + "helpful-online/train.jsonl.gz",
|
||||
_URL + "helpful-rejection-sampled/train.jsonl.gz",
|
||||
],
|
||||
"test": [
|
||||
_URL + "harmless-base/test.jsonl.gz",
|
||||
_URL + "helpful-base/test.jsonl.gz",
|
||||
_URL + "helpful-online/test.jsonl.gz",
|
||||
_URL + "helpful-rejection-sampled/test.jsonl.gz",
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
class HhRlhfEn(datasets.GeneratorBasedBuilder):
|
||||
VERSION = datasets.Version("0.0.0")
|
||||
|
||||
def _info(self) -> datasets.DatasetInfo:
|
||||
features = datasets.Features(
|
||||
{
|
||||
"instruction": datasets.Value("string"),
|
||||
"chosen": datasets.Value("string"),
|
||||
"rejected": datasets.Value("string"),
|
||||
"history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
|
||||
}
|
||||
)
|
||||
return datasets.DatasetInfo(
|
||||
description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
|
||||
)
|
||||
|
||||
def _split_generators(self, dl_manager: datasets.DownloadManager):
|
||||
file_path = dl_manager.download_and_extract(_URLS)
|
||||
return [
|
||||
datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": file_path["train"]}),
|
||||
datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepaths": file_path["test"]}),
|
||||
]
|
||||
|
||||
def _generate_examples(self, filepaths: list[str]):
|
||||
key = 0
|
||||
for filepath in filepaths:
|
||||
with open(filepath, encoding="utf-8") as f:
|
||||
for row in f:
|
||||
data = json.loads(row)
|
||||
chosen = data["chosen"]
|
||||
rejected = data["rejected"]
|
||||
|
||||
assist_idx = rejected.rfind("\n\nAssistant: ")
|
||||
r_reject = rejected[assist_idx + 13 :].strip()
|
||||
assist_idx = chosen.rfind("\n\nAssistant: ")
|
||||
r_accept = chosen[assist_idx + 13 :].strip()
|
||||
|
||||
human_idx = chosen.rfind("\n\nHuman: ")
|
||||
query = chosen[human_idx + 9 : assist_idx].strip()
|
||||
prompt = chosen[:human_idx]
|
||||
history = []
|
||||
|
||||
while prompt.rfind("\n\nAssistant: ") != -1:
|
||||
assist_idx = prompt.rfind("\n\nAssistant: ")
|
||||
human_idx = prompt.rfind("\n\nHuman: ")
|
||||
if human_idx != -1:
|
||||
old_query = prompt[human_idx + 9 : assist_idx].strip()
|
||||
old_resp = prompt[assist_idx + 13 :].strip()
|
||||
history.insert(0, (old_query, old_resp))
|
||||
else:
|
||||
break
|
||||
prompt = prompt[:human_idx]
|
||||
|
||||
yield key, {"instruction": query, "chosen": r_accept, "rejected": r_reject, "history": history}
|
||||
key += 1
|
@ -1,74 +0,0 @@
|
||||
# Copyright 2025 the LlamaFactory team.
|
||||
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
import datasets
|
||||
|
||||
|
||||
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
|
||||
|
||||
_DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
|
||||
|
||||
_CITATION = """\
|
||||
@misc{UltraChat,
|
||||
author = {Ding, Ning and Chen, Yulin and Xu, Bokai and Hu, Shengding and others},
|
||||
title = {UltraChat: A Large-scale Auto-generated Multi-round Dialogue Data},
|
||||
year = {2023},
|
||||
publisher = {GitHub},
|
||||
journal = {GitHub repository},
|
||||
howpublished = {\\url{https://github.com/thunlp/ultrachat}},
|
||||
}
|
||||
"""
|
||||
|
||||
_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/stingning/ultrachat"
|
||||
_LICENSE = "cc-by-nc-4.0"
|
||||
_BASE_DATA_URL = f"{_HF_ENDPOINT}/datasets/stingning/ultrachat/resolve/main/train_{{idx}}.jsonl"
|
||||
|
||||
|
||||
class UltraChat(datasets.GeneratorBasedBuilder):
|
||||
VERSION = datasets.Version("0.0.0")
|
||||
|
||||
def _info(self):
|
||||
features = datasets.Features(
|
||||
{"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
|
||||
)
|
||||
return datasets.DatasetInfo(
|
||||
description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
|
||||
)
|
||||
|
||||
def _split_generators(self, dl_manager: datasets.DownloadManager):
|
||||
file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)] # multiple shards
|
||||
return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": file_paths})]
|
||||
|
||||
def _generate_examples(self, filepaths: list[str]):
|
||||
for filepath in filepaths:
|
||||
with open(filepath, encoding="utf-8") as f:
|
||||
for row in f:
|
||||
try:
|
||||
data = json.loads(row)
|
||||
except Exception:
|
||||
continue
|
||||
key: int = data["id"]
|
||||
content: list[str] = data["data"]
|
||||
if len(content) % 2 == 1:
|
||||
content.pop(-1)
|
||||
if len(content) < 2:
|
||||
continue
|
||||
conversations = [
|
||||
{"from": "human" if i % 2 == 0 else "gpt", "value": content[i]} for i in range(len(content))
|
||||
]
|
||||
yield key, {"conversations": conversations}
|
8
data/v1_sft_demo.yaml
Normal file
8
data/v1_sft_demo.yaml
Normal file
@ -0,0 +1,8 @@
|
||||
identity:
|
||||
file_name: identity.json
|
||||
converter: alpaca
|
||||
alpaca_en_demo:
|
||||
file_name: alpaca_en_demo.json
|
||||
dataset_dir: ~/data
|
||||
converter: alpaca
|
||||
num_samples: 500
|
Reference in New Issue
Block a user