!236 Update custom dataset info and make codecheck-related changes
Merge pull request !236 from 金勇旭/master
This commit is contained in:
@ -291,7 +291,7 @@ Pairwise格式示例数据如下:
|
||||
"file_name(选填)": "dataset.json",
|
||||
"split(选填)": "train",
|
||||
"num_samples(选填)": xxx,
|
||||
"columns": {
|
||||
"columns(选填)": {
|
||||
"prompt": "instruction",
|
||||
"query": "input",
|
||||
"response": "output",
|
||||
@ -321,10 +321,19 @@ Pairwise格式示例数据如下:
|
||||
"split(选填)": "train",
|
||||
"num_samples(选填)": xxx,
|
||||
"formatting": "sharegpt",
|
||||
"columns": {
|
||||
"columns(选填)": {
|
||||
"messages": "conversations",
|
||||
"system": "system",
|
||||
"tools": "tools"
|
||||
},
|
||||
"tags(选填)": {
|
||||
"role_tag": "from",
|
||||
"content_tag": "value",
|
||||
"user_tag": "human",
|
||||
"assistant_tag": "gpt",
|
||||
"observation_tag": "observation",
|
||||
"function_tag": "function_call",
|
||||
"system_tag": "system"
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -342,7 +351,7 @@ Pairwise格式示例数据如下:
|
||||
"split(选填)": "train",
|
||||
"num_samples(选填)": xxx,
|
||||
"formatting(必填)": "text",
|
||||
"columns": {
|
||||
"columns(选填)": {
|
||||
"text_column": "text_key"
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,9 @@
|
||||
# Copyright 2025 the LlamaFactory team.
|
||||
# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
#
|
||||
# This code is inspired by the HuggingFace Transformers library.
|
||||
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava/processing_llava.py
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
@ -57,8 +60,9 @@ class MMPluginMixin:
|
||||
audio_token: Optional[str]
|
||||
expand_mm_tokens: bool = True
|
||||
|
||||
@staticmethod
|
||||
def _preprocess_image(
|
||||
self, image: "ImageObject", image_max_pixels: int, image_min_pixels: int, **kwargs
|
||||
image: "ImageObject", image_max_pixels: int, image_min_pixels: int, **kwargs
|
||||
) -> "ImageObject":
|
||||
r"""
|
||||
Pre-processes a single image.
|
||||
@ -78,9 +82,8 @@ class MMPluginMixin:
|
||||
|
||||
return image
|
||||
|
||||
def _get_video_sample_indices(
|
||||
self, video_stream: "Stream", video_fps: float, video_maxlen: int, **kwargs
|
||||
) -> List[int]:
|
||||
@staticmethod
|
||||
def _get_video_sample_indices(video_stream: "Stream", video_fps: float, video_maxlen: int, **kwargs) -> List[int]:
|
||||
r"""
|
||||
Computes video sample indices according to fps.
|
||||
"""
|
||||
|
@ -64,8 +64,9 @@ class InstructionDatasetAttr:
|
||||
function_tag: Optional[str] = "function_call"
|
||||
system_tag: Optional[str] = "system"
|
||||
|
||||
def set_attr(self, key: str, obj: Dict[str, Any], default: Optional[Any] = None) -> None:
|
||||
setattr(self, key, obj.get(key, default))
|
||||
def set_attr(self, key: str, obj: Dict[str, Any]) -> None:
|
||||
if key in obj:
|
||||
setattr(self, key, obj.get(key))
|
||||
|
||||
|
||||
def get_dataset_attr(dataset: Optional[str], dataset_info) -> "InstructionDatasetAttr":
|
||||
@ -105,12 +106,12 @@ def get_dataset_attr(dataset: Optional[str], dataset_info) -> "InstructionDatase
|
||||
dataset_attr.is_path = True
|
||||
|
||||
if "file_name" in dataset_info[dataset]:
|
||||
dataset_attr.set_attr("file_name", dataset_info[dataset], default=None)
|
||||
dataset_attr.set_attr("file_name", dataset_info[dataset])
|
||||
if "ranking" in dataset_info[dataset]:
|
||||
dataset_attr.set_attr("ranking", dataset_info[dataset], default=False)
|
||||
dataset_attr.set_attr("formatting", dataset_info[dataset], default="alpaca")
|
||||
dataset_attr.set_attr("ranking", dataset_info[dataset])
|
||||
dataset_attr.set_attr("formatting", dataset_info[dataset])
|
||||
dataset_attr.set_attr("subset", dataset_info[dataset])
|
||||
dataset_attr.set_attr("split", dataset_info[dataset], default="train")
|
||||
dataset_attr.set_attr("split", dataset_info[dataset])
|
||||
dataset_attr.set_attr("folder", dataset_info[dataset])
|
||||
dataset_attr.set_attr("num_samples", dataset_info[dataset])
|
||||
|
||||
|
Reference in New Issue
Block a user