Merge branch 'main' into add_dataset_sample_num

Former-commit-id: 26300127c45f24e63b91f1b0cc73e46c3a936a91
This commit is contained in:
seanzhang-zhichen
2024-05-24 15:57:47 +08:00
committed by GitHub
27 changed files with 756 additions and 513 deletions

View File

@ -7,7 +7,7 @@
"hf_hub_url": "Hugging Face 的数据集仓库地址(若指定,则忽略 script_url 和 file_name",
"ms_hub_url": "ModelScope 的数据集仓库地址(若指定,则忽略 script_url 和 file_name",
"script_url": "包含数据加载脚本的本地文件夹名称(若指定,则忽略 file_name",
"file_name": "该目录下数据集文件的名称(若上述参数未指定,则此项必需)",
"file_name": "该目录下数据集文件夹或文件的名称(若上述参数未指定,则此项必需)",
"formatting": "数据集格式可选默认alpaca可以为 alpaca 或 sharegpt",
"ranking": "是否为偏好数据集可选默认False",
"subset": "数据集子集的名称可选默认None",

View File

@ -34,7 +34,8 @@ class HhRlhfEn(datasets.GeneratorBasedBuilder):
features = datasets.Features(
{
"instruction": datasets.Value("string"),
"output": datasets.Sequence(datasets.Value("string")),
"chosen": datasets.Value("string"),
"rejected": datasets.Value("string"),
"history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
}
)