mirror of
https://github.com/volcengine/verl.git
synced 2025-10-20 13:43:50 +08:00
[tool] feat: support local gsm8k dataset in example/data_preprocess (#3362)
This commit is contained in:
@ -89,7 +89,7 @@ jobs:
|
||||
- name: Prepare gsm8k dataset with tool
|
||||
run: |
|
||||
ray stop --force
|
||||
python3 examples/data_preprocess/gsm8k_multiturn_w_tool.py --local_dir $HOME/data/gsm8k_verl_sgl_multi_turn_preprocessed
|
||||
python3 examples/data_preprocess/gsm8k_multiturn_w_tool.py --local_save_dir $HOME/data/gsm8k_verl_sgl_multi_turn_preprocessed
|
||||
- name: Running GSM8K with tool E2E training tests on 8 L20 GPUs with rmpad using function rm and save ckpt with sglang
|
||||
run: |
|
||||
ray stop --force
|
||||
|
@ -115,7 +115,7 @@ The following steps outline how to set up the environment and run the SPIN recip
|
||||
# wandb login
|
||||
|
||||
# Download the GSM8K dataset
|
||||
python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k # Adjusted path
|
||||
python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k # Adjusted path
|
||||
|
||||
# Download the base model (Example: Qwen2.5-3B-Instruct)
|
||||
huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct
|
||||
|
@ -381,7 +381,7 @@ PPO
|
||||
|
||||
GPUS_PER_NODE=8
|
||||
MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
|
||||
python3 examples/data_preprocess/gsm8k.py --local_dir data/gsm8k
|
||||
python3 examples/data_preprocess/gsm8k.py --local_save_dir data/gsm8k
|
||||
python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
|
||||
ENGINE=vllm #sglang
|
||||
|
||||
@ -436,7 +436,7 @@ GRPO
|
||||
GPUS_PER_NODE=8
|
||||
MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
|
||||
# MODEL_PATH=Qwen/Qwen2-7B-Instruct
|
||||
python3 examples/data_preprocess/gsm8k.py --local_dir data/gsm8k
|
||||
python3 examples/data_preprocess/gsm8k.py --local_save_dir data/gsm8k
|
||||
python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
|
||||
ENGINE=vllm #sglang
|
||||
|
||||
@ -716,7 +716,7 @@ slurm_script.sh
|
||||
|
||||
echo "Starting data preprocessing..."
|
||||
docker exec "${CONTAINER_NAME}" \
|
||||
python3 "examples/data_preprocess/gsm8k.py" "--local_dir" "../data/gsm8k"
|
||||
python3 "examples/data_preprocess/gsm8k.py" "--local_save_dir" "../data/gsm8k"
|
||||
|
||||
echo "Starting data preprocessing..."
|
||||
docker exec "${CONTAINER_NAME}" \
|
||||
|
@ -100,7 +100,7 @@ vllm & vllm-ascend
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k
|
||||
python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k
|
||||
|
||||
2.执行训练
|
||||
|
||||
|
@ -46,7 +46,7 @@ Step 1: Prepare dataset
|
||||
.. code:: bash
|
||||
|
||||
cd examples/data_preprocess
|
||||
python3 gsm8k.py --local_dir ~/data/gsm8k
|
||||
python3 gsm8k.py --local_save_dir ~/data/gsm8k
|
||||
|
||||
Step 2: Download Model
|
||||
----------------------
|
||||
|
@ -107,7 +107,7 @@ Step 2: Prepare dataset
|
||||
|
||||
git clone https://github.com/volcengine/verl.git
|
||||
cd examples/data_preprocess
|
||||
python3 gsm8k.py --local_dir ~/data/gsm8k
|
||||
python3 gsm8k.py --local_save_dir ~/data/gsm8k
|
||||
|
||||
|
||||
Step 3: Submit a job with SkyPilot
|
||||
@ -342,7 +342,7 @@ Once the fleet is created, define a Ray cluster task, e.g. in ``ray-cluster.dsta
|
||||
- pip install hf_transfer hf_xet
|
||||
- |
|
||||
if [ $DSTACK_NODE_RANK = 0 ]; then
|
||||
python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k
|
||||
python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k
|
||||
python3 -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen2.5-7B-Instruct')"
|
||||
ray start --head --port=6379;
|
||||
else
|
||||
@ -741,7 +741,7 @@ slurm_script.sh
|
||||
|
||||
echo "Starting data preprocessing..."
|
||||
docker exec "${CONTAINER_NAME}" \
|
||||
python3 "examples/data_preprocess/gsm8k.py" "--local_dir" "../data/gsm8k"
|
||||
python3 "examples/data_preprocess/gsm8k.py" "--local_save_dir" "../data/gsm8k"
|
||||
|
||||
echo "Starting data preprocessing..."
|
||||
docker exec "${CONTAINER_NAME}" \
|
||||
|
@ -46,7 +46,7 @@ We preprocess the dataset in parquet format so that (1) it contains necessary fi
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k
|
||||
python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k
|
||||
|
||||
Step 2: Download a model for post-training
|
||||
-------------------------------------------
|
||||
|
@ -34,14 +34,22 @@ def extract_solution(solution_str):
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--local_dir", default="~/data/gsm8k")
|
||||
parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
|
||||
parser.add_argument("--hdfs_dir", default=None)
|
||||
parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
|
||||
parser.add_argument(
|
||||
"--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset."
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
local_dataset_path = args.local_dataset_path
|
||||
|
||||
data_source = "openai/gsm8k"
|
||||
|
||||
dataset = datasets.load_dataset(data_source, "main")
|
||||
if local_dataset_path is not None:
|
||||
dataset = datasets.load_dataset(local_dataset_path, "main")
|
||||
else:
|
||||
dataset = datasets.load_dataset(data_source, "main")
|
||||
|
||||
train_dataset = dataset["train"]
|
||||
test_dataset = dataset["test"]
|
||||
@ -81,13 +89,17 @@ if __name__ == "__main__":
|
||||
train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
|
||||
test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)
|
||||
|
||||
local_dir = args.local_dir
|
||||
hdfs_dir = args.hdfs_dir
|
||||
local_save_dir = args.local_dir
|
||||
if local_save_dir is not None:
|
||||
print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
|
||||
else:
|
||||
local_save_dir = args.local_save_dir
|
||||
|
||||
train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
|
||||
test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
|
||||
train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
|
||||
test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))
|
||||
|
||||
if hdfs_dir is not None:
|
||||
makedirs(hdfs_dir)
|
||||
|
||||
copy(src=local_dir, dst=hdfs_dir)
|
||||
copy(src=local_save_dir, dst=hdfs_dir)
|
||||
|
@ -36,13 +36,22 @@ def extract_solution(solution_str):
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--local_dir", default="~/data/gsm8k")
|
||||
parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
|
||||
parser.add_argument("--hdfs_dir", default=None)
|
||||
parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
|
||||
parser.add_argument(
|
||||
"--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset."
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
local_dataset_path = args.local_dataset_path
|
||||
|
||||
data_source = "openai/gsm8k"
|
||||
dataset = datasets.load_dataset(data_source, "main")
|
||||
|
||||
if local_dataset_path is not None:
|
||||
dataset = datasets.load_dataset(local_dataset_path, "main")
|
||||
else:
|
||||
dataset = datasets.load_dataset(data_source, "main")
|
||||
|
||||
train_dataset = dataset["train"]
|
||||
test_dataset = dataset["test"]
|
||||
@ -95,12 +104,16 @@ if __name__ == "__main__":
|
||||
train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
|
||||
test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)
|
||||
|
||||
local_dir = args.local_dir
|
||||
hdfs_dir = args.hdfs_dir
|
||||
local_save_dir = args.local_dir
|
||||
if local_save_dir is not None:
|
||||
print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
|
||||
else:
|
||||
local_save_dir = args.local_save_dir
|
||||
|
||||
train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
|
||||
test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
|
||||
train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
|
||||
test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))
|
||||
|
||||
if hdfs_dir is not None:
|
||||
makedirs(hdfs_dir)
|
||||
copy(src=local_dir, dst=hdfs_dir)
|
||||
copy(src=local_save_dir, dst=hdfs_dir)
|
||||
|
@ -36,13 +36,22 @@ def extract_solution(solution_str):
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--local_dir", default="~/data/gsm8k")
|
||||
parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
|
||||
parser.add_argument("--hdfs_dir", default=None)
|
||||
parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
|
||||
parser.add_argument(
|
||||
"--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset."
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
local_dataset_path = args.local_dataset_path
|
||||
|
||||
data_source = "openai/gsm8k"
|
||||
dataset = datasets.load_dataset(data_source, "main")
|
||||
|
||||
if local_dataset_path is not None:
|
||||
dataset = datasets.load_dataset(local_dataset_path, "main")
|
||||
else:
|
||||
dataset = datasets.load_dataset(data_source, "main")
|
||||
|
||||
train_dataset = dataset["train"]
|
||||
test_dataset = dataset["test"]
|
||||
@ -105,12 +114,16 @@ if __name__ == "__main__":
|
||||
train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
|
||||
test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)
|
||||
|
||||
local_dir = args.local_dir
|
||||
hdfs_dir = args.hdfs_dir
|
||||
local_save_dir = args.local_dir
|
||||
if local_save_dir is not None:
|
||||
print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
|
||||
else:
|
||||
local_save_dir = args.local_save_dir
|
||||
|
||||
train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
|
||||
test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
|
||||
train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
|
||||
test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))
|
||||
|
||||
if hdfs_dir is not None:
|
||||
makedirs(hdfs_dir)
|
||||
copy(src=local_dir, dst=hdfs_dir)
|
||||
copy(src=local_save_dir, dst=hdfs_dir)
|
||||
|
@ -36,13 +36,22 @@ def extract_solution(solution_str):
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--local_dir", default="~/data/gsm8k")
|
||||
parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
|
||||
parser.add_argument("--hdfs_dir", default=None)
|
||||
parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
|
||||
parser.add_argument(
|
||||
"--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset."
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
local_dataset_path = args.local_dataset_path
|
||||
|
||||
data_source = "openai/gsm8k"
|
||||
dataset = datasets.load_dataset(data_source, "main")
|
||||
|
||||
if local_dataset_path is not None:
|
||||
dataset = datasets.load_dataset(local_dataset_path, "main")
|
||||
else:
|
||||
dataset = datasets.load_dataset(data_source, "main")
|
||||
|
||||
train_dataset = dataset["train"]
|
||||
test_dataset = dataset["test"]
|
||||
@ -106,12 +115,16 @@ if __name__ == "__main__":
|
||||
train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
|
||||
test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)
|
||||
|
||||
local_dir = args.local_dir
|
||||
hdfs_dir = args.hdfs_dir
|
||||
local_save_dir = args.local_dir
|
||||
if local_save_dir is not None:
|
||||
print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
|
||||
else:
|
||||
local_save_dir = args.local_save_dir
|
||||
|
||||
train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
|
||||
test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
|
||||
train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
|
||||
test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))
|
||||
|
||||
if hdfs_dir is not None:
|
||||
makedirs(hdfs_dir)
|
||||
copy(src=local_dir, dst=hdfs_dir)
|
||||
copy(src=local_save_dir, dst=hdfs_dir)
|
||||
|
@ -5,7 +5,7 @@ set -x
|
||||
# Example usage:
|
||||
#
|
||||
# python3 examples/data_preprocess/math_dataset.py --local_dir ~/data/math
|
||||
# python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k
|
||||
# python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k
|
||||
|
||||
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
|
||||
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
|
||||
|
@ -102,7 +102,7 @@ entropy_checkpointing=true # This enables entropy recomputation specifically for
|
||||
# ------------------------------------- train/val data preparation ---------------------------------------
|
||||
if [ "$first_time_dataset_prep" = true ]; then
|
||||
echo "Preprocessing GSM8K dataset..."
|
||||
python examples/data_preprocess/gsm8k.py --local_dir /data/gsm8k/
|
||||
python examples/data_preprocess/gsm8k.py --local_save_dir /data/gsm8k/
|
||||
fi
|
||||
|
||||
gsm8k_train_path=/data/gsm8k/train.parquet
|
||||
|
@ -106,7 +106,7 @@ entropy_checkpointing=true # This enables entropy recomputation specifically for
|
||||
# ------------------------------------- train/val data preparation ---------------------------------------
|
||||
if [ "$first_time_dataset_prep" = true ]; then
|
||||
echo "Preprocessing GSM8K dataset..."
|
||||
python examples/data_preprocess/gsm8k.py --local_dir /data/gsm8k/
|
||||
python examples/data_preprocess/gsm8k.py --local_save_dir /data/gsm8k/
|
||||
fi
|
||||
|
||||
gsm8k_train_path=/data/gsm8k/train.parquet
|
||||
|
@ -113,7 +113,7 @@ The following steps outline how to set up the environment and run the SPIN recip
|
||||
# wandb login
|
||||
|
||||
# Download the GSM8K dataset
|
||||
python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k # Adjusted path
|
||||
python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k # Adjusted path
|
||||
|
||||
# Download the base model (Example: Qwen2.5-3B-Instruct)
|
||||
huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct
|
||||
|
@ -5,7 +5,7 @@ set -x
|
||||
# Example usage:
|
||||
#
|
||||
# python3 examples/data_preprocess/math_dataset.py --local_dir ~/data/math
|
||||
# python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k
|
||||
# python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k
|
||||
|
||||
gsm8k_train_path=$HOME/data/math/train.parquet
|
||||
gsm8k_test_path=$HOME/data/math/test.parquet
|
||||
|
Reference in New Issue
Block a user