!236 update custom dataset info and codecheck related

Merge pull request !236 from 金勇旭/master
This commit is contained in:
金勇旭
2025-06-18 06:38:01 +00:00
committed by i-robot
parent d6d874b0ec
commit 14e75e69e2
3 changed files with 27 additions and 14 deletions

View File

@ -291,7 +291,7 @@ Pairwise格式示例数据如下
"file_name(选填)": "dataset.json",
"split(选填)": "train",
"num_samples(选填)": xxx,
"columns": {
"columns(选填)": {
"prompt": "instruction",
"query": "input",
"response": "output",
@ -321,10 +321,19 @@ Pairwise格式示例数据如下
"split(选填)": "train",
"num_samples(选填)": xxx,
"formatting": "sharegpt",
"columns": {
"columns(选填)": {
"messages": "conversations",
"system": "system",
"tools": "tools"
},
"tags(选填)": {
"role_tag": "from",
"content_tag": "value",
"user_tag": "human",
"assistant_tag": "gpt",
"observation_tag": "observation",
"function_tag": "function_call",
"system_tag": "system"
}
}
}
@ -342,7 +351,7 @@ Pairwise格式示例数据如下
"split(选填)": "train",
"num_samples(选填)": xxx,
"formatting(必填)": "text",
"columns": {
"columns(选填)": {
"text_column": "text_key"
}
}

View File

@ -1,6 +1,9 @@
# Copyright 2025 the LlamaFactory team.
# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
# Copyright (c) 2025 Huawei Technologies Co., Ltd.
#
# This code is inspired by the HuggingFace's Transformers library.
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava/processing_llava.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@ -57,8 +60,9 @@ class MMPluginMixin:
audio_token: Optional[str]
expand_mm_tokens: bool = True
@staticmethod
def _preprocess_image(
self, image: "ImageObject", image_max_pixels: int, image_min_pixels: int, **kwargs
image: "ImageObject", image_max_pixels: int, image_min_pixels: int, **kwargs
) -> "ImageObject":
r"""
Pre-processes a single image.
@ -78,9 +82,8 @@ class MMPluginMixin:
return image
def _get_video_sample_indices(
self, video_stream: "Stream", video_fps: float, video_maxlen: int, **kwargs
) -> List[int]:
@staticmethod
def _get_video_sample_indices(video_stream: "Stream", video_fps: float, video_maxlen: int, **kwargs) -> List[int]:
r"""
Computes video sample indices according to fps.
"""

View File

@ -64,8 +64,9 @@ class InstructionDatasetAttr:
function_tag: Optional[str] = "function_call"
system_tag: Optional[str] = "system"
def set_attr(self, key: str, obj: Dict[str, Any], default: Optional[Any] = None) -> None:
setattr(self, key, obj.get(key, default))
def set_attr(self, key: str, obj: Dict[str, Any]) -> None:
    """Copy ``obj[key]`` onto this instance, but only when ``key`` is present.

    When ``key`` is absent from ``obj`` the attribute is left untouched, so
    whatever value the instance already holds (for example a class-level
    default) is preserved instead of being overwritten.

    Args:
        key: Attribute name to set; also the key looked up in ``obj``.
        obj: Mapping that may supply a value for ``key``.
    """
    if key in obj:
        # Presence is already established, so index directly rather than
        # doing a second defaulted lookup with ``.get``.
        setattr(self, key, obj[key])
def get_dataset_attr(dataset: Optional[str], dataset_info) -> "InstructionDatasetAttr":
@ -105,12 +106,12 @@ def get_dataset_attr(dataset: Optional[str], dataset_info) -> "InstructionDatase
dataset_attr.is_path = True
if "file_name" in dataset_info[dataset]:
dataset_attr.set_attr("file_name", dataset_info[dataset], default=None)
dataset_attr.set_attr("file_name", dataset_info[dataset])
if "ranking" in dataset_info[dataset]:
dataset_attr.set_attr("ranking", dataset_info[dataset], default=False)
dataset_attr.set_attr("formatting", dataset_info[dataset], default="alpaca")
dataset_attr.set_attr("ranking", dataset_info[dataset])
dataset_attr.set_attr("formatting", dataset_info[dataset])
dataset_attr.set_attr("subset", dataset_info[dataset])
dataset_attr.set_attr("split", dataset_info[dataset], default="train")
dataset_attr.set_attr("split", dataset_info[dataset])
dataset_attr.set_attr("folder", dataset_info[dataset])
dataset_attr.set_attr("num_samples", dataset_info[dataset])