!236 update custom dataset info and codecheck related

Merge pull request !236 from 金勇旭/master
2025-06-18 06:38:01 +00:00
parent d6d874b0ec
commit 14e75e69e2
3 changed files with 27 additions and 14 deletions
--- a/docs/zh/basic_tutorial/train/datasets.md
+++ b/docs/zh/basic_tutorial/train/datasets.md
@@ -291,7 +291,7 @@ Pairwise格式示例数据如下：
    "file_name(选填)": "dataset.json",
    "split(选填)": "train",
    "num_samples(选填)": xxx,
-    "columns": {
+    "columns(选填)": {
      "prompt": "instruction",
      "query": "input",
      "response": "output",
@@ -321,10 +321,19 @@ Pairwise格式示例数据如下：
    "split(选填)": "train",
    "num_samples(选填)": xxx,
    "formatting": "sharegpt",
-    "columns": {
+    "columns(选填)": {
      "messages": "conversations",
      "system": "system",
      "tools": "tools"
+    },
+    "tags(选填)": {
+      "role_tag": "from",
+      "content_tag": "value",
+      "user_tag": "human",
+      "assistant_tag": "gpt",
+      "observation_tag": "observation",
+      "function_tag": "function_call",
+      "system_tag": "system"
    }
  }
 }
@@ -342,7 +351,7 @@ Pairwise格式示例数据如下：
    "split(选填)": "train",
    "num_samples(选填)": xxx,
    "formatting(必填)": "text",
-    "columns": {
+    "columns(选填)": {
      "text_column": "text_key"
    }
  }
--- a/src/openmind/flow/datasets/mm_plugin.py
+++ b/src/openmind/flow/datasets/mm_plugin.py
@@ -1,6 +1,9 @@
-# Copyright 2025 the LlamaFactory team.
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
 # Copyright (c) 2025 Huawei Technologies Co., Ltd.
 #
+# This code is inspired by the HuggingFace's Transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava/processing_llava.py
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -57,8 +60,9 @@ class MMPluginMixin:
    audio_token: Optional[str]
    expand_mm_tokens: bool = True

+    @staticmethod
    def _preprocess_image(
-        self, image: "ImageObject", image_max_pixels: int, image_min_pixels: int, **kwargs
+        image: "ImageObject", image_max_pixels: int, image_min_pixels: int, **kwargs
    ) -> "ImageObject":
        r"""
        Pre-processes a single image.
@@ -78,9 +82,8 @@ class MMPluginMixin:

        return image

-    def _get_video_sample_indices(
-        self, video_stream: "Stream", video_fps: float, video_maxlen: int, **kwargs
-    ) -> List[int]:
+    @staticmethod
+    def _get_video_sample_indices(video_stream: "Stream", video_fps: float, video_maxlen: int, **kwargs) -> List[int]:
        r"""
        Computes video sample indices according to fps.
        """
--- a/src/openmind/flow/datasets/parser.py
+++ b/src/openmind/flow/datasets/parser.py
@@ -64,8 +64,9 @@ class InstructionDatasetAttr:
    function_tag: Optional[str] = "function_call"
    system_tag: Optional[str] = "system"

-    def set_attr(self, key: str, obj: Dict[str, Any], default: Optional[Any] = None) -> None:
-        setattr(self, key, obj.get(key, default))
+    def set_attr(self, key: str, obj: Dict[str, Any]) -> None:
+        if key in obj:
+            setattr(self, key, obj.get(key))


 def get_dataset_attr(dataset: Optional[str], dataset_info) -> "InstructionDatasetAttr":
@@ -105,12 +106,12 @@ def get_dataset_attr(dataset: Optional[str], dataset_info) -> "InstructionDatase
        dataset_attr.is_path = True

    if "file_name" in dataset_info[dataset]:
-        dataset_attr.set_attr("file_name", dataset_info[dataset], default=None)
+        dataset_attr.set_attr("file_name", dataset_info[dataset])
    if "ranking" in dataset_info[dataset]:
-        dataset_attr.set_attr("ranking", dataset_info[dataset], default=False)
-    dataset_attr.set_attr("formatting", dataset_info[dataset], default="alpaca")
+        dataset_attr.set_attr("ranking", dataset_info[dataset])
+    dataset_attr.set_attr("formatting", dataset_info[dataset])
    dataset_attr.set_attr("subset", dataset_info[dataset])
-    dataset_attr.set_attr("split", dataset_info[dataset], default="train")
+    dataset_attr.set_attr("split", dataset_info[dataset])
    dataset_attr.set_attr("folder", dataset_info[dataset])
    dataset_attr.set_attr("num_samples", dataset_info[dataset])