[tool] feat: support local gsm8k dataset in example/data_preprocess (#3362)

Author: X. HU
Date: 2025-09-09 22:29:56 +08:00
Committed by: GitHub
Parent: 5c46f4f437
Commit: dfa3933ac4
16 changed files with 91 additions and 40 deletions
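In practice, the commit renames the output-directory flag and adds an optional input flag across the preprocessing scripts. A minimal usage sketch (the raw-dataset path below is illustrative; the flags themselves come from the diffs that follow):

# New spelling for the directory that receives the preprocessed parquet files:
python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k

# New: preprocess a raw GSM8K copy already on disk instead of downloading
# openai/gsm8k from the Hugging Face Hub (path is illustrative):
python3 examples/data_preprocess/gsm8k.py --local_dataset_path /path/to/raw/gsm8k --local_save_dir ~/data/gsm8k

# The old spelling is still accepted but prints a deprecation warning:
python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k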


@@ -89,7 +89,7 @@ jobs:
 - name: Prepare gsm8k dataset with tool
   run: |
     ray stop --force
-    python3 examples/data_preprocess/gsm8k_multiturn_w_tool.py --local_dir $HOME/data/gsm8k_verl_sgl_multi_turn_preprocessed
+    python3 examples/data_preprocess/gsm8k_multiturn_w_tool.py --local_save_dir $HOME/data/gsm8k_verl_sgl_multi_turn_preprocessed
 - name: Running GSM8K with tool E2E training tests on 8 L20 GPUs with rmpad using function rm and save ckpt with sglang
   run: |
     ray stop --force


@@ -115,7 +115,7 @@ The following steps outline how to set up the environment and run the SPIN recip
 # wandb login

 # Download the GSM8K dataset
-python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k # Adjusted path
+python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k # Adjusted path

 # Download the base model (Example: Qwen2.5-3B-Instruct)
 huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct


@@ -381,7 +381,7 @@ PPO
 GPUS_PER_NODE=8
 MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
-python3 examples/data_preprocess/gsm8k.py --local_dir data/gsm8k
+python3 examples/data_preprocess/gsm8k.py --local_save_dir data/gsm8k
 python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
 ENGINE=vllm #sglang
@@ -436,7 +436,7 @@ GRPO
 GPUS_PER_NODE=8
 MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
 # MODEL_PATH=Qwen/Qwen2-7B-Instruct
-python3 examples/data_preprocess/gsm8k.py --local_dir data/gsm8k
+python3 examples/data_preprocess/gsm8k.py --local_save_dir data/gsm8k
 python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
 ENGINE=vllm #sglang
@@ -716,7 +716,7 @@ slurm_script.sh
 echo "Starting data preprocessing..."
 docker exec "${CONTAINER_NAME}" \
-  python3 "examples/data_preprocess/gsm8k.py" "--local_dir" "../data/gsm8k"
+  python3 "examples/data_preprocess/gsm8k.py" "--local_save_dir" "../data/gsm8k"

 echo "Starting data preprocessing..."
 docker exec "${CONTAINER_NAME}" \


@@ -100,7 +100,7 @@ vllm & vllm-ascend
 .. code-block:: bash

-   python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k
+   python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k

 2. Run training


@@ -46,7 +46,7 @@ Step 1: Prepare dataset
 .. code:: bash

    cd examples/data_preprocess
-   python3 gsm8k.py --local_dir ~/data/gsm8k
+   python3 gsm8k.py --local_save_dir ~/data/gsm8k

 Step 2: Download Model
 ----------------------


@@ -107,7 +107,7 @@ Step 2: Prepare dataset
 git clone https://github.com/volcengine/verl.git
 cd examples/data_preprocess
-python3 gsm8k.py --local_dir ~/data/gsm8k
+python3 gsm8k.py --local_save_dir ~/data/gsm8k

 Step 3: Submit a job with SkyPilot
@@ -342,7 +342,7 @@ Once the fleet is created, define a Ray cluster task, e.g. in ``ray-cluster.dsta
 - pip install hf_transfer hf_xet
 - |
   if [ $DSTACK_NODE_RANK = 0 ]; then
-    python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k
+    python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k
     python3 -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen2.5-7B-Instruct')"
     ray start --head --port=6379;
   else
@@ -741,7 +741,7 @@ slurm_script.sh
 echo "Starting data preprocessing..."
 docker exec "${CONTAINER_NAME}" \
-  python3 "examples/data_preprocess/gsm8k.py" "--local_dir" "../data/gsm8k"
+  python3 "examples/data_preprocess/gsm8k.py" "--local_save_dir" "../data/gsm8k"

 echo "Starting data preprocessing..."
 docker exec "${CONTAINER_NAME}" \


@@ -46,7 +46,7 @@ We preprocess the dataset in parquet format so that (1) it contains necessary fi
 .. code-block:: bash

-   python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k
+   python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k

 Step 2: Download a model for post-training
 -------------------------------------------


@@ -34,14 +34,22 @@ def extract_solution(solution_str):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--local_dir", default="~/data/gsm8k")
+    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
     parser.add_argument("--hdfs_dir", default=None)
+    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
+    parser.add_argument(
+        "--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset."
+    )

     args = parser.parse_args()
+    local_dataset_path = args.local_dataset_path

     data_source = "openai/gsm8k"

-    dataset = datasets.load_dataset(data_source, "main")
+    if local_dataset_path is not None:
+        dataset = datasets.load_dataset(local_dataset_path, "main")
+    else:
+        dataset = datasets.load_dataset(data_source, "main")

     train_dataset = dataset["train"]
     test_dataset = dataset["test"]
@@ -81,13 +89,17 @@ if __name__ == "__main__":
     train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
     test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

-    local_dir = args.local_dir
     hdfs_dir = args.hdfs_dir
+    local_save_dir = args.local_dir
+    if local_save_dir is not None:
+        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
+    else:
+        local_save_dir = args.local_save_dir

-    train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
-    test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
+    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
+    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

     if hdfs_dir is not None:
         makedirs(hdfs_dir)
-        copy(src=local_dir, dst=hdfs_dir)
+        copy(src=local_save_dir, dst=hdfs_dir)
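
As the hunk above shows, a non-None --local_dir takes precedence over --local_save_dir after printing the deprecation warning, and --hdfs_dir optionally mirrors the saved parquet files. A quick sanity check of that fallback (output path is illustrative):

python3 examples/data_preprocess/gsm8k.py --local_dir /tmp/gsm8k
# prints: Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.
# then writes /tmp/gsm8k/train.parquet and /tmp/gsm8k/test.parquet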


@@ -36,13 +36,22 @@ def extract_solution(solution_str):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--local_dir", default="~/data/gsm8k")
+    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
     parser.add_argument("--hdfs_dir", default=None)
+    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
+    parser.add_argument(
+        "--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset."
+    )

     args = parser.parse_args()
+    local_dataset_path = args.local_dataset_path

     data_source = "openai/gsm8k"

-    dataset = datasets.load_dataset(data_source, "main")
+    if local_dataset_path is not None:
+        dataset = datasets.load_dataset(local_dataset_path, "main")
+    else:
+        dataset = datasets.load_dataset(data_source, "main")

     train_dataset = dataset["train"]
     test_dataset = dataset["test"]
@@ -95,12 +104,16 @@ if __name__ == "__main__":
     train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
     test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

-    local_dir = args.local_dir
     hdfs_dir = args.hdfs_dir
+    local_save_dir = args.local_dir
+    if local_save_dir is not None:
+        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
+    else:
+        local_save_dir = args.local_save_dir

-    train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
-    test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
+    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
+    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

     if hdfs_dir is not None:
         makedirs(hdfs_dir)
-        copy(src=local_dir, dst=hdfs_dir)
+        copy(src=local_save_dir, dst=hdfs_dir)


@@ -36,13 +36,22 @@ def extract_solution(solution_str):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--local_dir", default="~/data/gsm8k")
+    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
     parser.add_argument("--hdfs_dir", default=None)
+    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
+    parser.add_argument(
+        "--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset."
+    )

     args = parser.parse_args()
+    local_dataset_path = args.local_dataset_path

     data_source = "openai/gsm8k"

-    dataset = datasets.load_dataset(data_source, "main")
+    if local_dataset_path is not None:
+        dataset = datasets.load_dataset(local_dataset_path, "main")
+    else:
+        dataset = datasets.load_dataset(data_source, "main")

     train_dataset = dataset["train"]
     test_dataset = dataset["test"]
@@ -105,12 +114,16 @@ if __name__ == "__main__":
     train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
     test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

-    local_dir = args.local_dir
     hdfs_dir = args.hdfs_dir
+    local_save_dir = args.local_dir
+    if local_save_dir is not None:
+        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
+    else:
+        local_save_dir = args.local_save_dir

-    train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
-    test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
+    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
+    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

     if hdfs_dir is not None:
         makedirs(hdfs_dir)
-        copy(src=local_dir, dst=hdfs_dir)
+        copy(src=local_save_dir, dst=hdfs_dir)


@@ -36,13 +36,22 @@ def extract_solution(solution_str):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--local_dir", default="~/data/gsm8k")
+    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
     parser.add_argument("--hdfs_dir", default=None)
+    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
+    parser.add_argument(
+        "--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset."
+    )

     args = parser.parse_args()
+    local_dataset_path = args.local_dataset_path

     data_source = "openai/gsm8k"

-    dataset = datasets.load_dataset(data_source, "main")
+    if local_dataset_path is not None:
+        dataset = datasets.load_dataset(local_dataset_path, "main")
+    else:
+        dataset = datasets.load_dataset(data_source, "main")

     train_dataset = dataset["train"]
     test_dataset = dataset["test"]
@@ -106,12 +115,16 @@ if __name__ == "__main__":
     train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
     test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

-    local_dir = args.local_dir
     hdfs_dir = args.hdfs_dir
+    local_save_dir = args.local_dir
+    if local_save_dir is not None:
+        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
+    else:
+        local_save_dir = args.local_save_dir

-    train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
-    test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
+    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
+    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

     if hdfs_dir is not None:
         makedirs(hdfs_dir)
-        copy(src=local_dir, dst=hdfs_dir)
+        copy(src=local_save_dir, dst=hdfs_dir)


@@ -5,7 +5,7 @@ set -x
 # Example usage:
 #
 # python3 examples/data_preprocess/math_dataset.py --local_dir ~/data/math
-# python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k
+# python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k

 gsm8k_train_path=$HOME/data/gsm8k/train.parquet
 gsm8k_test_path=$HOME/data/gsm8k/test.parquet


@@ -102,7 +102,7 @@ entropy_checkpointing=true # This enables entropy recomputation specifically for
 # ------------------------------------- train/val data preparation ---------------------------------------
 if [ "$first_time_dataset_prep" = true ]; then
   echo "Preprocessing GSM8K dataset..."
-  python examples/data_preprocess/gsm8k.py --local_dir /data/gsm8k/
+  python examples/data_preprocess/gsm8k.py --local_save_dir /data/gsm8k/
 fi

 gsm8k_train_path=/data/gsm8k/train.parquet


@@ -106,7 +106,7 @@ entropy_checkpointing=true # This enables entropy recomputation specifically for
 # ------------------------------------- train/val data preparation ---------------------------------------
 if [ "$first_time_dataset_prep" = true ]; then
   echo "Preprocessing GSM8K dataset..."
-  python examples/data_preprocess/gsm8k.py --local_dir /data/gsm8k/
+  python examples/data_preprocess/gsm8k.py --local_save_dir /data/gsm8k/
 fi

 gsm8k_train_path=/data/gsm8k/train.parquet


@@ -113,7 +113,7 @@ The following steps outline how to set up the environment and run the SPIN recip
 # wandb login

 # Download the GSM8K dataset
-python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k # Adjusted path
+python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k # Adjusted path

 # Download the base model (Example: Qwen2.5-3B-Instruct)
 huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct


@@ -5,7 +5,7 @@ set -x
 # Example usage:
 #
 # python3 examples/data_preprocess/math_dataset.py --local_dir ~/data/math
-# python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k
+# python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k

 gsm8k_train_path=$HOME/data/math/train.parquet
 gsm8k_test_path=$HOME/data/math/test.parquet