diff --git a/.github/workflows/e2e_eval_aime24.yml b/.github/workflows/.deprecate/e2e_eval_aime24.yml similarity index 100% rename from .github/workflows/e2e_eval_aime24.yml rename to .github/workflows/.deprecate/e2e_eval_aime24.yml diff --git a/.github/workflows/e2e_spin.yml b/.github/workflows/.deprecate/e2e_spin.yml similarity index 100% rename from .github/workflows/e2e_spin.yml rename to .github/workflows/.deprecate/e2e_spin.yml diff --git a/.github/workflows/e2e_sppo.yml b/.github/workflows/.deprecate/e2e_sppo.yml similarity index 94% rename from .github/workflows/e2e_sppo.yml rename to .github/workflows/.deprecate/e2e_sppo.yml index 684618116..2dacbb2ca 100644 --- a/.github/workflows/e2e_sppo.yml +++ b/.github/workflows/.deprecate/e2e_sppo.yml @@ -95,7 +95,7 @@ jobs: pip3 install -e .[test,gpu,sglang] - name: Prepare MATH dataset run: | - python3 examples/data_preprocess/math_dataset.py + python3 examples/data_preprocess/math_dataset.py --local_dataset_path $HOME/models/hf_data/DigitalLearningGmbH/MATH-lighteval - name: Running the E2E test with the SPPO algorithm run: | ray stop --force diff --git a/.github/workflows/model.yml b/.github/workflows/model.yml index f08702d06..54856b8d9 100644 --- a/.github/workflows/model.yml +++ b/.github/workflows/model.yml @@ -171,14 +171,14 @@ jobs: run: | pip3 install --no-deps -e .[test] pip install --upgrade "huggingface_hub[cli]" - - name: Download model config files - run: | - hf download Qwen/Qwen2.5-7B config.json --local-dir $HOME/configs/Qwen/Qwen2.5-7B - hf download Qwen/Qwen3-8B config.json --local-dir $HOME/configs/Qwen/Qwen3-8B - hf download deepseek-ai/deepseek-coder-1.3b-instruct config.json --local-dir $HOME/configs/deepseek-ai/deepseek-coder-1.3b-instruct - hf download Qwen/Qwen2-57B-A14B config.json --local-dir $HOME/configs/Qwen/Qwen2-57B-A14B - hf download Qwen/Qwen3-30B-A3B config.json --local-dir $HOME/configs/Qwen/Qwen3-30B-A3B - hf download deepseek-ai/DeepSeek-V3-Base config.json --local-dir $HOME/configs/deepseek-ai/DeepSeek-V3-Base +# - name: Download model config files +# run: | +# hf download Qwen/Qwen2.5-7B config.json --local-dir $HOME/configs/Qwen/Qwen2.5-7B +# hf download Qwen/Qwen3-8B config.json --local-dir $HOME/configs/Qwen/Qwen3-8B +# hf download deepseek-ai/deepseek-coder-1.3b-instruct config.json --local-dir $HOME/configs/deepseek-ai/deepseek-coder-1.3b-instruct +# hf download Qwen/Qwen2-57B-A14B config.json --local-dir $HOME/configs/Qwen/Qwen2-57B-A14B +# hf download Qwen/Qwen3-30B-A3B config.json --local-dir $HOME/configs/Qwen/Qwen3-30B-A3B +# hf download deepseek-ai/DeepSeek-V3-Base config.json --local-dir $HOME/configs/deepseek-ai/DeepSeek-V3-Base - name: Running mcore config converter tests on 8 L20 GPUs run: | torchrun --nproc_per_node=8 tests/special_distributed/test_mcore_config_converter.py diff --git a/.github/workflows/sgl.yml b/.github/workflows/sgl.yml index 3b6aff843..b43571187 100644 --- a/.github/workflows/sgl.yml +++ b/.github/workflows/sgl.yml @@ -129,8 +129,8 @@ jobs: python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - name: Test the latest SGLang Rollout async with agent loop run: | - huggingface-cli download verl-team/gsm8k-v0.4.1 --repo-type dataset --local-dir ~/verl-data/gsm8k ROLLOUT_NAME=sglang pytest -svvv tests/experimental/agent_loop +# huggingface-cli download verl-team/gsm8k-v0.4.1 --repo-type dataset --local-dir ~/verl-data/gsm8k - name: Test the latest SGLang run: | cd tests/workers/rollout diff --git a/.github/workflows/type-coverage-check.yml b/.github/workflows/type-coverage-check.yml index 5d7bcd571..aa8c03a54 100644 --- a/.github/workflows/type-coverage-check.yml +++ b/.github/workflows/type-coverage-check.yml @@ -20,8 +20,9 @@ jobs: - name: Install dependencies run: | - pip install gitpython - pip install -e .[sglang] + pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cpu + pip3 install -r requirements.txt + pip3 install -e . --no-deps - name: Run type annotation coverage check run: | python3 tests/special_sanity/type_coverage_check.py diff --git a/examples/data_preprocess/math_dataset.py b/examples/data_preprocess/math_dataset.py index 343a83436..b23a032fb 100644 --- a/examples/data_preprocess/math_dataset.py +++ b/examples/data_preprocess/math_dataset.py @@ -31,16 +31,28 @@ def extract_solution(solution_str): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--local_dir", default="~/data/math") + parser.add_argument("--local_dir", default=None) parser.add_argument("--hdfs_dir", default=None) + parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.") + parser.add_argument( + "--local_save_dir", default="~/data/math", help="The save directory for the preprocessed dataset." + ) args = parser.parse_args() + local_dataset_path = args.local_dataset_path # 'lighteval/MATH' is no longer available on huggingface. # Use mirror repo: DigitalLearningGmbH/MATH-lighteval data_source = "DigitalLearningGmbH/MATH-lighteval" print(f"Loading the {data_source} dataset from huggingface...", flush=True) - dataset = datasets.load_dataset(data_source, trust_remote_code=True) + if local_dataset_path is not None: + dataset = datasets.load_dataset( + local_dataset_path, + ) + else: + dataset = datasets.load_dataset( + data_source, + ) train_dataset = dataset["train"] test_dataset = dataset["test"] @@ -70,7 +82,13 @@ if __name__ == "__main__": train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True) test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True) - local_dir = os.path.expanduser(args.local_dir) + local_save_dir = args.local_dir + if local_save_dir is not None: + print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.") + else: + local_save_dir = args.local_save_dir + + local_dir = os.path.expanduser(local_save_dir) hdfs_dir = args.hdfs_dir train_dataset.to_parquet(os.path.join(local_dir, "train.parquet")) diff --git a/requirements-cuda.txt b/requirements-cuda.txt new file mode 100644 index 000000000..7bfe8efeb --- /dev/null +++ b/requirements-cuda.txt @@ -0,0 +1 @@ +flash-attn \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 64dc7f585..9d4f236ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,6 @@ accelerate codetiming datasets dill -flash-attn hydra-core liger-kernel numpy<2.0.0 diff --git a/tests/special_distributed/test_fsdp_ckpt.py b/tests/special_distributed/test_fsdp_ckpt.py index e59064c1e..d4cd0d51e 100644 --- a/tests/special_distributed/test_fsdp_ckpt.py +++ b/tests/special_distributed/test_fsdp_ckpt.py @@ -49,7 +49,7 @@ def test_fsdp_ckpt(strategy="fsdp"): local_rank, rank, world_size = initialize_global_process_group() device_mesh = init_device_mesh("cuda", mesh_shape=(world_size,), mesh_dim_names=("dp",)) - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + model_name = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct") config = Qwen2Config(num_hidden_layers=1) with torch.device("cuda"): diff --git a/tests/special_distributed/test_mcore_config_converter.py b/tests/special_distributed/test_mcore_config_converter.py index 2eaea9bdb..18b2d3a04 100644 --- a/tests/special_distributed/test_mcore_config_converter.py +++ b/tests/special_distributed/test_mcore_config_converter.py @@ -89,7 +89,7 @@ def test_mcore_config_converter(): ) for model_name in TEST_MODELS: print(f"testing {model_name}") - hf_config = AutoConfig.from_pretrained(os.path.expanduser(f"~/configs/{model_name}/config.json")) + hf_config = AutoConfig.from_pretrained(os.path.expanduser(f"~/models/configs/{model_name}/config.json")) hf_config = modify_hf_config(model_name, hf_config) tf_config = hf_to_mcore_config(hf_config, torch.bfloat16) check_config_converter_results(tf_config, hf_config)