Mirror of https://github.com/volcengine/verl.git, synced 2025-10-20 13:43:50 +08:00

[ci] refactor: setup testing guidance (#1958)
.github/workflows/check-pr-title.yml (35 lines changed, vendored)

@@ -1,5 +1,34 @@
 # .github/workflows/check-pr-title.yml
 name: Check PR Title

# # Tests layout

# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...

# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments

# Accelerators for tests
# - By default, tests run with GPUs available, except for those under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.

# # Workflow layout

# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#    - `cpu_unit_tests.yml`: runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
#    - `gpu_unit_tests.yml`: runs pytest on all test scripts without the `on_cpu.py` suffix
#    - Since the CPU/GPU unit test workflows run all tests under `tests` by default, please make sure tests are manually excluded from them when:
#      - a new workflow yaml is added to `.github/workflows`
#      - new tests are added to a workflow mentioned in 2.

on:
  pull_request:

@@ -18,6 +47,6 @@ jobs:
          python-version: '3.11'

      - name: Run PR title checker
-       run: python3 tests/sanity/check_pr_title.py
+       run: python3 tests/special_sanity/check_pr_title.py
        env:
          PR_TITLE: ${{ github.event.pull_request.title }}
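For reference, here is a minimal sketch of what a PR-title checker invoked this way could look like. This is an illustrative reconstruction, not the actual contents of `tests/special_sanity/check_pr_title.py`, and the allowed prefixes below are assumptions:

```python
# Hypothetical sketch of a PR title checker driven by the PR_TITLE env var.
# The real tests/special_sanity/check_pr_title.py and its prefix list may differ.
import os
import sys

# Illustrative prefix allowlist (an assumption, not verl's actual policy).
ALLOWED_PREFIXES = ("[ci]", "[feat]", "[fix]", "[doc]", "[refactor]")


def main() -> int:
    title = os.environ.get("PR_TITLE", "")
    if title.startswith(ALLOWED_PREFIXES):
        print(f"PR title OK: {title!r}")
        return 0
    print(f"Invalid PR title {title!r}; expected a prefix from {ALLOWED_PREFIXES}", file=sys.stderr)
    return 1


if __name__ == "__main__":
    sys.exit(main())
```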
.github/workflows/checkpoint_converter.yml (35 lines changed, vendored)

@@ -1,3 +1,36 @@
# # Tests layout

# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...

# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments

# Accelerators for tests
# - By default, tests run with GPUs available, except for those under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.

# # Workflow layout

# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#    - `cpu_unit_tests.yml`: runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
#    - `gpu_unit_tests.yml`: runs pytest on all test scripts without the `on_cpu.py` suffix
#    - Since the CPU/GPU unit test workflows run all tests under `tests` by default, please make sure tests are manually excluded from them when:
#      - a new workflow yaml is added to `.github/workflows`
#      - new tests are added to a workflow mentioned in 2.

name: checkpoint_converter
# latest version: Megatron-LM core_r0.11.0 https://github.com/NVIDIA/Megatron-LM/tree/core_r0.11.0

@@ -27,7 +60,7 @@ on:
      - ".github/workflows/checkpoint_converter.yml"
      - ".github/workflows/e2e_ppo_trainer_megatron.yml"
      - "examples/data_preprocess/gsm8k.py"
-     - "tests/e2e/run_ppo_trainer_megatron.sh"
+     - "tests/special_e2e/run_ppo_trainer_megatron.sh"
      - "verl/trainer/main_ppo.py"
      - "verl/trainer/config/ppo_megatron_trainer.yaml"
.github/workflows/cpu_unit_tests.yml (56 lines changed, vendored)

@@ -1,3 +1,35 @@
# # Tests layout

# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...

# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments

# Accelerators for tests
# - By default, tests run with GPUs available, except for those under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.

# # Workflow layout

# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#    - `cpu_unit_tests.yml`: runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
#    - `gpu_unit_tests.yml`: runs pytest on all test scripts without the `on_cpu.py` suffix
#    - Since the CPU/GPU unit test workflows run all tests under `tests` by default, please make sure tests are manually excluded from them when:
#      - a new workflow yaml is added to `.github/workflows`
#      - new tests are added to a workflow mentioned in 2.

name: cpu_unit_tests

on:

@@ -40,20 +72,12 @@ jobs:
          python-version: ${{ matrix.python-version }}
      - name: Install the current repository
        run: |
-         pip install -e .[test]
-     - name: Running data proto test
+         pip install -e .[test,prime,geo]
+         pip install --upgrade "ray>=2.40.0" pillow
+     - name: Running CPU unit tests
        run: |
-         cd tests
-         pytest -s -x test_protocol.py
-     - name: running utils cpu tests
-       run: |
-         cd tests/utils/cpu_tests
-         pytest -s -x .
-     - name: Running trainer tests
-       run: |
-         cd tests/trainer
-         pytest -s -x .
-     - name: Running worker tests
-       run: |
-         cd tests/workers/reward_manager
-         pytest -s -x .
+         [ ! -d "$HOME/verl-data" ] && git clone --depth 1 https://github.com/eric-haibin-lin/verl-data ~/verl-data
+         python3 examples/data_preprocess/geo3k.py
+         echo '[pytest]' > pytest.ini
+         echo 'python_files = *_on_cpu.py' >> pytest.ini
+         pytest -s -x tests/
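The two `echo` lines above generate a `pytest.ini` whose `python_files = *_on_cpu.py` setting restricts collection to CPU-only tests by filename. As a sketch of the convention (the file path below is hypothetical; only the suffix matters), a test opts into the CPU suite purely through its name:

```python
# tests/trainer/test_config_on_cpu.py -- hypothetical path; the `_on_cpu.py`
# suffix is what makes cpu_unit_tests.yml collect this file, and what makes
# gpu_unit_tests.yml skip it via --ignore-glob='*on_cpu.py'.


def test_trivial_arithmetic_on_cpu():
    # Placeholder body; real CPU tests cover things like config parsing
    # and data-protocol logic that need no accelerator.
    assert 1 + 1 == 2
```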
.github/workflows/dataset.yml (deleted, 58 lines, vendored)

@@ -1,58 +0,0 @@
name: dataset

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
      - v0.*
  pull_request:
    branches:
      - main
    paths:
      - "verl/utils/**/*.py"
      - .github/workflows/dataset.yml

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions just read content.
permissions:
  contents: read

jobs:
  ray:
    runs-on: [L20x8]
    timeout-minutes: 10 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip install -e .[test]
          pip install --upgrade "ray>=2.40.0"
          pip install cupy-cuda12x
      - name: Running dataset tests
        run: |
          [ ! -d "$HOME/verl-data" ] && git clone --depth 1 https://github.com/eric-haibin-lin/verl-data ~/verl-data
          python3 examples/data_preprocess/geo3k.py
          pytest -s -x tests/utils/gpu_tests/dataset/test_rl_dataset.py
          pytest -s -x tests/utils/gpu_tests/dataset/test_sft_dataset.py
          # pytest -s -x tests/utils/gpu_tests/dataset/test_rm_dataset.py
      - name: Running ray test using cupy (move it to L20 when dockerfile ready)
        run: |
          cd tests/ray_gpu
          pytest -s -x test_rvdz.py
.github/workflows/disabled/e2e_prime.yml (4 lines changed, vendored)

@@ -25,7 +25,7 @@ on:
      # Entrypoints
      - ".github/workflows/e2e_prime.yml"
      - "examples/data_preprocess/gsm8k.py"
-     - "tests/e2e/run_prime.sh"
+     - "tests/special_e2e/run_prime.sh"

# Cancel jobs on the same ref if a new one is triggered
concurrency:

@@ -63,4 +63,4 @@ jobs:
      - name: Running GSM8K E2E with prime alg
        run: |
          ray stop --force
-         bash tests/e2e/run_prime.sh
+         bash tests/special_e2e/run_prime.sh
.github/workflows/doc.yml (32 lines changed, vendored)

@@ -1,3 +1,35 @@
# # Tests layout

# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...

# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments

# Accelerators for tests
# - By default, tests run with GPUs available, except for those under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.

# # Workflow layout

# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#    - `cpu_unit_tests.yml`: runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
#    - `gpu_unit_tests.yml`: runs pytest on all test scripts without the `on_cpu.py` suffix
#    - Since the CPU/GPU unit test workflows run all tests under `tests` by default, please make sure tests are manually excluded from them when:
#      - a new workflow yaml is added to `.github/workflows`
#      - new tests are added to a workflow mentioned in 2.

name: doc_test

on:
.github/workflows/e2e_ascend.yml (42 lines changed, vendored)

@@ -1,3 +1,35 @@
# # Tests layout

# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...

# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments

# Accelerators for tests
# - By default, tests run with GPUs available, except for those under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.

# # Workflow layout

# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#    - `cpu_unit_tests.yml`: runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
#    - `gpu_unit_tests.yml`: runs pytest on all test scripts without the `on_cpu.py` suffix
#    - Since the CPU/GPU unit test workflows run all tests under `tests` by default, please make sure tests are manually excluded from them when:
#      - a new workflow yaml is added to `.github/workflows`
#      - new tests are added to a workflow mentioned in 2.

name: e2e_ascend

on:

@@ -24,7 +56,7 @@ on:
      - ".github/workflows/e2e_ascend.yml"
      - "examples/data_preprocess/gsm8k.py"
      - "examples/data_preprocess/geo3k.py"
-     - "tests/e2e/ppo_trainer"
+     - "tests/special_e2e/ppo_trainer"
      - "verl/trainer/main_ppo.py"
      - "verl/trainer/config/ppo_trainer.yaml"

@@ -84,15 +116,15 @@ jobs:
      - name: Running gsm8k e2e training tests with LoRA on ASCEND NPU
        run: |
          ray stop --force
-         bash tests/e2e/sft/run_sft.sh
+         bash tests/special_e2e/sft/run_sft.sh
          rm -rf $HOME/ckpts
      - name: Running gsm8k e2e training tests with GRPO on ASCEND NPU
        run: |
          ray stop --force
-         bash tests/npu/run_qwen2_5_05b_grpo.sh
+         bash tests/special_npu/run_qwen2_5_05b_grpo.sh
          rm -rf $HOME/ckpts
      - name: Running gsm8k e2e training tests with DAPO on ASCEND NPU
        run: |
          ray stop --force
-         bash tests/npu/run_qwen2_5_05b_dapo.sh
-         rm -rf $HOME/ckpts
+         bash tests/special_npu/run_qwen2_5_05b_dapo.sh
+         rm -rf $HOME/ckpts
.github/workflows/e2e_dapo.yml (36 lines changed, vendored)

@@ -1,3 +1,35 @@
# # Tests layout

# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...

# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments

# Accelerators for tests
# - By default, tests run with GPUs available, except for those under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.

# # Workflow layout

# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#    - `cpu_unit_tests.yml`: runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
#    - `gpu_unit_tests.yml`: runs pytest on all test scripts without the `on_cpu.py` suffix
#    - Since the CPU/GPU unit test workflows run all tests under `tests` by default, please make sure tests are manually excluded from them when:
#      - a new workflow yaml is added to `.github/workflows`
#      - new tests are added to a workflow mentioned in 2.

name: e2e_dapo

on:

@@ -27,7 +59,7 @@ on:
      # Entrypoints
      - ".github/workflows/e2e_dapo.yml"
      - "examples/data_preprocess/gsm8k.py"
-     - "tests/e2e/run_dapo.sh"
+     - "tests/special_e2e/run_dapo.sh"

# Cancel jobs on the same ref if a new one is triggered
concurrency:

@@ -64,4 +96,4 @@ jobs:
      - name: Running the E2E test with the DAPO algorithm
        run: |
          ray stop --force
-         bash tests/e2e/run_dapo.sh
+         bash tests/special_e2e/run_dapo.sh
.github/workflows/e2e_eval_aime24.yml (36 lines changed, vendored)

@@ -1,3 +1,35 @@
# # Tests layout

# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...

# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments

# Accelerators for tests
# - By default, tests run with GPUs available, except for those under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.

# # Workflow layout

# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#    - `cpu_unit_tests.yml`: runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
#    - `gpu_unit_tests.yml`: runs pytest on all test scripts without the `on_cpu.py` suffix
#    - Since the CPU/GPU unit test workflows run all tests under `tests` by default, please make sure tests are manually excluded from them when:
#      - a new workflow yaml is added to `.github/workflows`
#      - new tests are added to a workflow mentioned in 2.

name: e2e_eval_aime24

on:

@@ -24,7 +56,7 @@ on:
      - "!recipe/**"
      # Entrypoints
      - ".github/workflows/e2e_eval_aime24.yml"
-     - "tests/e2e/run_r1_distill_qwen_aime24_eval.sh"
+     - "tests/special_e2e/run_r1_distill_qwen_aime24_eval.sh"
      - "verl/trainer/main_generation.py"
      - "verl/trainer/config/generation.yaml"

@@ -65,4 +97,4 @@ jobs:
      - name: Running generation and evaluation in AIME 2024
        run: |
          ray stop --force
-         bash tests/e2e/run_r1_distill_qwen_aime24_eval.sh
+         bash tests/special_e2e/run_r1_distill_qwen_aime24_eval.sh
.github/workflows/e2e_ppo_trainer.yml (56 lines changed, vendored)

@@ -26,7 +26,7 @@ on:
      - ".github/workflows/e2e_ppo_trainer.yml"
      - "examples/data_preprocess/gsm8k.py"
      - "examples/data_preprocess/geo3k.py"
-     - "tests/e2e/ppo_trainer"
+     - "tests/special_e2e/ppo_trainer"
      - "verl/trainer/main_ppo.py"
      - "verl/trainer/config/ppo_trainer.yaml"

@@ -86,11 +86,11 @@ jobs:
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with validation and saving (FSDP_SIZE=8)
        run: |
          ray stop --force
-         VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-fsdp8" bash tests/e2e/ppo_trainer/run_function_reward.sh
+         VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-fsdp8" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm after resuming
        run: |
          ray stop --force
-         RESUME_MODE=auto VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-fsdp8" bash tests/e2e/ppo_trainer/run_function_reward.sh
+         RESUME_MODE=auto VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-fsdp8" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Test merging FSDP checkpoints (Qwen Actor)
        run: |
          exp_name="qwen2.5-0.5b-function-reward-minimal-fsdp8"

@@ -98,7 +98,7 @@ jobs:
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with validation and saving (DDP_SIZE=2, FSDP_SIZE=4)
        run: |
          ray stop --force
-         VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True FSDP_SIZE=4 VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-ddp2-fsdp4" bash tests/e2e/ppo_trainer/run_function_reward.sh
+         VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True FSDP_SIZE=4 VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-ddp2-fsdp4" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Test merging DDP+FSDP checkpoints (Qwen Actor)
        run: |
          exp_name="qwen2.5-0.5b-function-reward-minimal-ddp2-fsdp4"

@@ -106,32 +106,32 @@ jobs:
      - name: Running GSM8K E2E without rmpad using function rm
        run: |
          ray stop --force
-         RM_PAD=False bash tests/e2e/ppo_trainer/run_function_reward.sh
+         RM_PAD=False bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm (GRPO)
        run: |
          ray stop --force
-         ADV_ESTIMATOR=grpo USE_KL=True bash tests/e2e/ppo_trainer/run_function_reward.sh
+         ADV_ESTIMATOR=grpo USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm (ReMax)
        run: |
          ray stop --force
-         ADV_ESTIMATOR=remax USE_KL=True bash tests/e2e/ppo_trainer/run_function_reward.sh
+         ADV_ESTIMATOR=remax USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using customized reward function
        run: |
          ray stop --force
-         CUSTOM_REWARD_FN=True bash tests/e2e/ppo_trainer/run_function_reward.sh
+         CUSTOM_REWARD_FN=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with in-reward kl and kl loss
        run: |
          ray stop --force
-         USE_KL=True bash tests/e2e/ppo_trainer/run_function_reward.sh
+         USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      # LoRA tests
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with grpo lora using function rm with use_shm
        run: |
          ray stop --force
-         ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors bash tests/e2e/ppo_trainer/run_function_reward.sh
+         ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with grpo lora using function rm with use_shm and layered_summon
        run: |
          ray stop --force
-         ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors LAYERED_SUMMON=True TOTAL_TRAIN_STEPS=1 SAVE_FREQ=1 FSDP_SIZE=4 VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal" bash tests/e2e/ppo_trainer/run_function_reward.sh
+         ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors LAYERED_SUMMON=True TOTAL_TRAIN_STEPS=1 SAVE_FREQ=1 FSDP_SIZE=4 VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Test GRPO LoRA checkpoints merging function
        run: |
          export EXP_NAME="qwen2.5-0.5b-function-reward-minimal"

@@ -141,40 +141,40 @@ jobs:
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with grpo lora using function rm with use_shm and layered_summon with fsdp2
        run: |
          ray stop --force
-         ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors LAYERED_SUMMON=True STRATEGY=fsdp2 bash tests/e2e/ppo_trainer/run_function_reward.sh
+         ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors LAYERED_SUMMON=True STRATEGY=fsdp2 bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      # Model RM
      - name: Running GRPO GSM8K E2E training tests with FSDP on 8 L20 GPUs (DeepSeek)
        run: |
          ray stop --force
-         MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/e2e/ppo_trainer/run_function_reward.sh
+         MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E with rmpad using model rm
        run: |
          ray stop --force
-         bash tests/e2e/ppo_trainer/run_model_reward.sh
+         bash tests/special_e2e/ppo_trainer/run_model_reward.sh
      - name: Running GSM8K E2E without rmpad using model rm
        run: |
          ray stop --force
-         RM_PAD=False bash tests/e2e/ppo_trainer/run_model_reward.sh
+         RM_PAD=False bash tests/special_e2e/ppo_trainer/run_model_reward.sh
      - name: Running GSM8K E2E with rmpad using model rm and ulysses sp=2
        run: |
          ray stop --force
-         SP_SIZE=2 bash tests/e2e/ppo_trainer/run_model_reward.sh
+         SP_SIZE=2 bash tests/special_e2e/ppo_trainer/run_model_reward.sh
      - name: Running GSM8K E2E with rmpad using model rm and dynamic batch size
        run: |
          ray stop --force
-         SEQ_BALANCE=True bash tests/e2e/ppo_trainer/run_model_reward.sh
+         SEQ_BALANCE=True bash tests/special_e2e/ppo_trainer/run_model_reward.sh
      - name: Running GSM8K E2E with rmpad using model rm with Liger Kernel enabled
        run: |
          ray stop --force
-         LIGER=True bash tests/e2e/ppo_trainer/run_model_reward.sh
+         LIGER=True bash tests/special_e2e/ppo_trainer/run_model_reward.sh
      - name: Running GSM8K E2E with rmpad using model rm with Fused Kernel enabled
        run: |
          ray stop --force
-         FUSED_KERNELS=True bash tests/e2e/ppo_trainer/run_model_reward.sh
+         FUSED_KERNELS=True bash tests/special_e2e/ppo_trainer/run_model_reward.sh
      - name: Running GSM8K E2E with rmpad using model rm with Fused Kernel enabled
        run: |
          ray stop --force
-         FUSED_KERNEL=True FUSED_KERNEL_BACKEND=triton bash tests/e2e/ppo_trainer/run_model_reward.sh
+         FUSED_KERNEL=True FUSED_KERNEL_BACKEND=triton bash tests/special_e2e/ppo_trainer/run_model_reward.sh

  e2e_ppo_trainer_vllm_vlm:
    runs-on: [L20x8]

@@ -209,7 +209,7 @@ jobs:
          MODEL_ID=Qwen/Qwen2-VL-2B-Instruct \
          ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
          SP_SIZE=2 \
-         bash tests/e2e/ppo_trainer/run_function_reward.sh
+         bash tests/special_e2e/ppo_trainer/run_function_reward.sh

      - name: Running Geo3k VLM PPO E2E training tests on 8 L20 GPUs with rmpad using function rm
        run: |

@@ -219,7 +219,7 @@ jobs:
          MODEL_ID=Qwen/Qwen2-VL-2B-Instruct \
          ADV_ESTIMATOR=gae RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
          SP_SIZE=2 \
-         bash tests/e2e/ppo_trainer/run_function_reward.sh
+         bash tests/special_e2e/ppo_trainer/run_function_reward.sh

  e2e_ppo_trainer_sglang:
    runs-on: [L20x8]

@@ -248,7 +248,7 @@ jobs:
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm and save ckpt
        run: |
          ray stop --force
-         ENGINE=sglang bash tests/e2e/ppo_trainer/run_function_reward.sh
+         ENGINE=sglang bash tests/special_e2e/ppo_trainer/run_function_reward.sh

  e2e_ppo_trainer_sglang_multiturn_with_tool:
    runs-on: [L20x8]

@@ -277,11 +277,11 @@ jobs:
      - name: Running GSM8K with tool E2E training tests on 8 L20 GPUs with rmpad using function rm and save ckpt with sglang
        run: |
          ray stop --force
-         bash tests/e2e/run_gsm8k_fsdp_sgl_multiturn_w_tool.sh
+         bash tests/special_e2e/run_gsm8k_fsdp_sgl_multiturn_w_tool.sh
      - name: Running GSM8K with tool E2E training tests with FSDP2
        run: |
          ray stop --force
-         FSDP_STRATEGY=fsdp2 bash tests/e2e/run_gsm8k_fsdp_sgl_multiturn_w_tool.sh
+         FSDP_STRATEGY=fsdp2 bash tests/special_e2e/run_gsm8k_fsdp_sgl_multiturn_w_tool.sh

  e2e_ppo_trainer_sglang_vlm:
    runs-on: [L20x8]

@@ -317,7 +317,7 @@ jobs:
          ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
          ENGINE=sglang GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
          ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
-         bash tests/e2e/ppo_trainer/run_function_reward.sh
+         bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running Geo3k VLM E2E with rmpad using torch fused kernel (Qwen2.5-VL)
        run: |
          ray stop --force

@@ -327,7 +327,7 @@ jobs:
          ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
          ENGINE=sglang GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
          ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
-         bash tests/e2e/ppo_trainer/run_function_reward.sh
+         bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running Geo3k VLM E2E with rmpad using triton fused kernel (Qwen2.5-VL)
        run: |
          ray stop --force

@@ -338,4 +338,4 @@ jobs:
          ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
          ENGINE=sglang GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
          ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
-         bash tests/e2e/ppo_trainer/run_function_reward.sh
+         bash tests/special_e2e/ppo_trainer/run_function_reward.sh
.github/workflows/e2e_ppo_trainer_megatron.yml (59 lines changed, vendored)

@@ -1,3 +1,34 @@
# # Tests layout

# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...

# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments

# Accelerators for tests
# - By default, tests run with GPUs available, except for those under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.

# # Workflow layout

# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#    - `cpu_unit_tests.yml`: runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
#    - `gpu_unit_tests.yml`: runs pytest on all test scripts without the `on_cpu.py` suffix
#    - Since the CPU/GPU unit test workflows run all tests under `tests` by default, please make sure tests are manually excluded from them when:
#      - a new workflow yaml is added to `.github/workflows`
#      - new tests are added to a workflow mentioned in 2.

name: e2e_ppo_trainer_megatron
# latest version: Megatron-LM core_r0.11.0 https://github.com/NVIDIA/Megatron-LM/tree/core_r0.11.0

@@ -27,7 +58,7 @@ on:
      - ".github/workflows/e2e_ppo_trainer_megatron.yml"
      - "examples/data_preprocess/gsm8k.py"
      - "examples/data_preprocess/geo3k.py"
-     - "tests/e2e/run_ppo_trainer_megatron.sh"
+     - "tests/special_e2e/run_ppo_trainer_megatron.sh"
      - "verl/trainer/main_ppo.py"
      - "verl/trainer/config/ppo_megatron_trainer.yaml"

@@ -66,11 +97,11 @@ jobs:
      - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek)
        run: |
          ray stop --force
-         ALL_OFFLOAD=True SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/e2e/run_ppo_trainer_megatron.sh
+         ALL_OFFLOAD=True SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/special_e2e/run_ppo_trainer_megatron.sh
      - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek)
        run: |
          ray stop --force
-         RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 bash tests/e2e/run_ppo_trainer_megatron.sh
+         RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 bash tests/special_e2e/run_ppo_trainer_megatron.sh
      - name: Test Megatron checkpoints merging function (DeepSeek Actor and Critic)
        run: |
          exp_name="deepseek-coder-1.3b-instruct-megatron-gsm8k-minimal"

@@ -79,7 +110,7 @@ jobs:
      - name: Running GRPO GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Deepseek)
        run: |
          ray stop --force
-         ADV_ESTIMATOR=grpo USE_DYNAMIC_BSZ=False MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/e2e/run_ppo_trainer_megatron.sh
+         ADV_ESTIMATOR=grpo USE_DYNAMIC_BSZ=False MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/special_e2e/run_ppo_trainer_megatron.sh
      - name: clean up
        run: |
          rm -rf checkpoints

@@ -108,11 +139,11 @@ jobs:
      - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen3) with validation and saving
        run: |
          ray stop --force
-         ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 MODEL_ID=Qwen/Qwen3-0.6B bash tests/e2e/run_ppo_trainer_megatron.sh
+         ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh
      - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen3) testing learning rate scheduler
        run: |
          ray stop --force
-         LR_WARMUP_STEPS=1 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/e2e/run_ppo_trainer_megatron.sh
+         LR_WARMUP_STEPS=1 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh

      - name: Test Megatron checkpoints merging function (Qwen3 Actor and Critic)
        run: |

@@ -147,11 +178,11 @@ jobs:
      - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with tie-embedding Megatron (Qwen) with train tp > infer tp
        run: |
          ray stop --force
-         VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=2 INFER_TP=1 MODEL_ID=Qwen/Qwen2.5-1.5B bash tests/e2e/run_ppo_trainer_megatron.sh
+         VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=2 INFER_TP=1 MODEL_ID=Qwen/Qwen2.5-1.5B bash tests/special_e2e/run_ppo_trainer_megatron.sh
      - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) with train tp < infer tp
        run: |
          ray stop --force
-         VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=1 INFER_TP=2 MODEL_ID=Qwen/Qwen2.5-1.5B bash tests/e2e/run_ppo_trainer_megatron.sh
+         VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=1 INFER_TP=2 MODEL_ID=Qwen/Qwen2.5-1.5B bash tests/special_e2e/run_ppo_trainer_megatron.sh
      - name: clean up
        run: |
          rm -rf checkpoints

@@ -183,9 +214,9 @@ jobs:
      - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen)
        run: |
          ray stop --force
-         SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 SKIP_SAVE_HF_MODEL=1 bash tests/e2e/run_ppo_trainer_megatron.sh +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_first_pipeline_stage=8 +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=4 actor_rollout_ref.actor.megatron.use_dist_checkpointing=true actor_rollout_ref.actor.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron actor_rollout_ref.ref.megatron.use_dist_checkpointing=true actor_rollout_ref.ref.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron critic.megatron.use_dist_checkpointing=true critic.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron reward_model.megatron.use_dist_checkpointing=true reward_model.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron
+         SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 SKIP_SAVE_HF_MODEL=1 bash tests/special_e2e/run_ppo_trainer_megatron.sh +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_first_pipeline_stage=8 +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=4 actor_rollout_ref.actor.megatron.use_dist_checkpointing=true actor_rollout_ref.actor.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron actor_rollout_ref.ref.megatron.use_dist_checkpointing=true actor_rollout_ref.ref.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron critic.megatron.use_dist_checkpointing=true critic.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron reward_model.megatron.use_dist_checkpointing=true reward_model.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron
          cp -r checkpoints checkpoints-dut
-         SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 bash tests/e2e/run_ppo_trainer_megatron.sh
+         SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 bash tests/special_e2e/run_ppo_trainer_megatron.sh
      - name: Test Megatron checkpoints merging function (Qwen Actor and Critic)
        run: |
          exp_name="qwen2.5-0.5b-megatron-gsm8k-minimal"

@@ -219,7 +250,7 @@ jobs:
      - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek)
        run: |
          ray stop --force
-         SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct COMMON_PP=2 COMMON_VPP=null bash tests/e2e/run_ppo_trainer_megatron.sh +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=true +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=true
+         SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct COMMON_PP=2 COMMON_VPP=null bash tests/special_e2e/run_ppo_trainer_megatron.sh +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=true +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=true
      - name: Test Megatron checkpoints merging function (DeepSeek Actor and Critic)
        run: |
          exp_name="deepseek-coder-1.3b-instruct-megatron-gsm8k-minimal"

@@ -253,12 +284,12 @@ jobs:
      - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek)
        run: |
          ray stop --force
-         ADV_ESTIMATOR=grpo USE_DUMMY_MODEL=True DUMMY_MODEL_CONFIG_PATH=tests/e2e/ppo_trainer/expert_parallel/qwen2moe_minimal.json \
+         ADV_ESTIMATOR=grpo USE_DUMMY_MODEL=True DUMMY_MODEL_CONFIG_PATH=tests/special_e2e/ppo_trainer/expert_parallel/qwen2moe_minimal.json \
          PPO_MAX_TOKEN_LEN=512 FWD_MAX_TOKEN_LEN=512 \
          MAX_PROMPT_LENGTH=256 MAX_RESPONSE_LENGTH=256 \
          MODEL_ID=Qwen/Qwen1.5-MoE-A2.7B-Chat \
          COMMON_PP=2 COMMON_VPP=null COMMON_CP=1 COMMON_TP=4 COMMON_EP=4 COMMON_ETP=1 INFER_TP=8 \
-         USE_DIST_CKPT=True ALL_OFFLOAD=True SKIP_SAVE_HF_MODEL=1 bash tests/e2e/run_ppo_trainer_megatron.sh
+         USE_DIST_CKPT=True ALL_OFFLOAD=True SKIP_SAVE_HF_MODEL=1 bash tests/special_e2e/run_ppo_trainer_megatron.sh
      - name: clean up
        run: |
          rm -rf checkpoints

@@ -290,7 +321,7 @@ jobs:
      - name: Running Geo3k E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen)
        run: |
          ray stop --force
-         TRAIN_FILES=${HOME}/data/geo3k/train.parquet VAL_FILES=${HOME}/data/geo3k/test.parquet MAX_PROMPT_LENGTH=1024 MAX_RESPONSE_LENGTH=2048 MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct ADV_ESTIMATOR=grpo USE_DYNAMIC_BSZ=False SKIP_SAVE_HF_MODEL=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 COMMON_TP=2 USE_DIST_CKPT=true DIST_CKPT_PATH=checkpoints/verl-test/qwen2.5-vl-3b-megatron bash tests/e2e/run_ppo_trainer_megatron.sh
+         TRAIN_FILES=${HOME}/data/geo3k/train.parquet VAL_FILES=${HOME}/data/geo3k/test.parquet MAX_PROMPT_LENGTH=1024 MAX_RESPONSE_LENGTH=2048 MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct ADV_ESTIMATOR=grpo USE_DYNAMIC_BSZ=False SKIP_SAVE_HF_MODEL=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 COMMON_TP=2 USE_DIST_CKPT=true DIST_CKPT_PATH=checkpoints/verl-test/qwen2.5-vl-3b-megatron bash tests/special_e2e/run_ppo_trainer_megatron.sh
      - name: clean up
        run: |
          rm -rf checkpoints
.github/workflows/e2e_sft.yml (45 lines changed, vendored)

@@ -1,3 +1,34 @@
# # Tests layout

# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...

# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments

# Accelerators for tests
# - By default, tests run with GPUs available, except for those under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.

# # Workflow layout

# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#    - `cpu_unit_tests.yml`: runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
#    - `gpu_unit_tests.yml`: runs pytest on all test scripts without the `on_cpu.py` suffix
#    - Since the CPU/GPU unit test workflows run all tests under `tests` by default, please make sure tests are manually excluded from them when:
#      - a new workflow yaml is added to `.github/workflows`
#      - new tests are added to a workflow mentioned in 2.

name: e2e_sft

on:

@@ -25,7 +56,7 @@ on:
      # Entrypoints
      - ".github/workflows/e2e_sft.yml"
      - "examples/data_preprocess/gsm8k.py"
-     - "tests/e2e/sft"
+     - "tests/special_e2e/sft"
      - "verl/trainer/fsdp_sft_trainer.py"
      - "verl/trainer/config/sft_trainer.yaml"

@@ -66,25 +97,25 @@ jobs:
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm
        run: |
          ray stop --force
-         bash tests/e2e/sft/run_sft.sh
+         bash tests/special_e2e/sft/run_sft.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs w/o rmpad using function rm
        run: |
          ray stop --force
-         RM_PAD=False bash tests/e2e/sft/run_sft.sh
+         RM_PAD=False bash tests/special_e2e/sft/run_sft.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with sequence parallelism
        run: |
          ray stop --force
-         SP_SIZE=2 bash tests/e2e/sft/run_sft.sh
+         SP_SIZE=2 bash tests/special_e2e/sft/run_sft.sh
      - name: Check loss difference between sequence parallel vs. default implementation
        run: |
          ray stop --force
-         ENTRYPOINT="tests/e2e/sft/test_sp_loss_match.py" SP_SIZE=2 bash tests/e2e/sft/run_sft.sh
+         ENTRYPOINT="tests/special_e2e/sft/test_sp_loss_match.py" SP_SIZE=2 bash tests/special_e2e/sft/run_sft.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with sequence parallelism and liger
        run: |
          ray stop --force
-         SP_SIZE=2 LIGER=True bash tests/e2e/sft/run_sft.sh
+         SP_SIZE=2 LIGER=True bash tests/special_e2e/sft/run_sft.sh
      - name: Running GSM8K E2E training tests with LoRA
        run: |
          ray stop --force
-         LORA_RANK=32 bash tests/e2e/sft/run_sft.sh
+         LORA_RANK=32 bash tests/special_e2e/sft/run_sft.sh
      # TODO: multiturn
.github/workflows/e2e_spin.yml (9 lines changed, vendored)

@@ -27,13 +27,18 @@ on:
      # Entrypoints
      - ".github/workflows/e2e_spin.yml"
      - "examples/data_preprocess/gsm8k.py"
-     - "tests/e2e/run_spin.sh"
+     - "tests/special_e2e/run_spin.sh"
      - "!examples"

# Declare permissions just read content.
permissions:
  contents: read

+# Cancel jobs on the same ref if a new one is triggered
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
  e2e_spin:
    runs-on: [L20x8]

@@ -63,4 +68,4 @@ jobs:
      - name: Running the E2E test with the spin algorithm
        run: |
          ray stop --force
-         bash tests/e2e/run_spin.sh
+         bash tests/special_e2e/run_spin.sh
.github/workflows/e2e_sppo.yml (9 lines changed, vendored)

@@ -27,12 +27,17 @@ on:
      # Entrypoints
      - ".github/workflows/e2e_sppo.yml"
      - "examples/data_preprocess/gsm8k.py"
-     - "tests/e2e/run_sppo.sh"
+     - "tests/special_e2e/run_sppo.sh"

# Declare permissions just read content.
permissions:
  contents: read

+# Cancel jobs on the same ref if a new one is triggered
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
  e2e_sppo:
    runs-on: [L20x8]

@@ -62,4 +67,4 @@ jobs:
      - name: Running the E2E test with the SPPO algorithm
        run: |
          ray stop --force
-         bash tests/e2e/run_sppo.sh
+         bash tests/special_e2e/run_sppo.sh
.github/workflows/gpu_unit_tests.yml (new file, 97 lines, vendored)

@@ -0,0 +1,97 @@
# # Tests layout

# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...

# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments

# Accelerators for tests
# - By default, tests run with GPUs available, except for those under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.

# # Workflow layout

# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#    - `cpu_unit_tests.yml`: runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
#    - `gpu_unit_tests.yml`: runs pytest on all test scripts without the `on_cpu.py` suffix
#    - Since the CPU/GPU unit test workflows run all tests under `tests` by default, please make sure tests are manually excluded from them when:
#      - a new workflow yaml is added to `.github/workflows`
#      - new tests are added to a workflow mentioned in 2.

name: GPU unit tests

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
      - v0.4.x
    paths:
      - "**/*.py"
      - .github/workflows/gpu_unit_tests.yml
  pull_request:
    branches:
      - main
      - v0.4.x
    paths:
      - "**/*.py"
      # Other entrypoints
      - "!examples/**"
      - "!tests/**"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      # Recipes
      - "!recipe/**"
      # Entrypoints
      - .github/workflows/gpu_unit_tests.yml
      - "tests/*"

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions just read content.
permissions:
  contents: read

jobs:
  gpu_unit_tests:
    runs-on: [L20x8]
    timeout-minutes: 40 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: 1
    container:
      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install hf_transfer
          pip3 install --no-deps -e .[test]
          pip3 install --upgrade "ray>=2.40.0"
          pip3 install cupy-cuda12x
      - name: Run all GPU unit tests
        run: |
          pytest -s -x --ignore-glob="*test_linear_cross_entropy_tp.py" --ignore-glob='*on_cpu.py' --ignore-glob="*test_vllm*" --ignore-glob="*_sglang*" --ignore-glob="*_hf_rollout*" --ignore-glob="tests/models/" --ignore-glob='tests/special*' tests/
      - name: Testing LinearCrossEntropyTP Correctness, Computation Time and Memory Consumption
        run: |
          LOW_MEMORY=True torchrun --standalone --nnodes=1 --nproc-per-node=8 tests/utils/test_linear_cross_entropy_tp.py
.github/workflows/kernels.yml
vendored
65
.github/workflows/kernels.yml
vendored
@ -1,65 +0,0 @@
|
||||
name: kernels
|
||||
# latest version: Megatron-LM core_r0.11.0 https://github.com/NVIDIA/Megatron-LM/tree/core_r0.11.0
|
||||
|
||||
on:
|
||||
# Trigger the workflow on push or pull request,
|
||||
# but only for the main branch
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- v0.2.x
|
||||
paths:
|
||||
- "**/*.py"
|
||||
- .github/workflows/kernels.yml
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
- v0.2.x
|
||||
paths:
|
||||
- "**/*.py"
|
||||
# Other entrypoints
|
||||
- "!examples/**"
|
||||
- "!tests/**"
|
||||
- "!verl/trainer/main_*.py"
|
||||
- "!verl/trainer/fsdp_sft_trainer.py"
|
||||
# Recipes
|
||||
- "!recipe/**"
|
||||
# Entrypoints
|
||||
- .github/workflows/kernels.yml
|
||||
- "tests/kernels/*"
|
||||
|
||||
# Cancel jobs on the same ref if a new one is triggered
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
|
||||
# Declare permissions just read content.
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
kernels:
|
||||
runs-on: [L20x8]
|
||||
timeout-minutes: 40 # Increase this timeout value as needed
|
||||
env:
|
||||
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
|
||||
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
|
||||
NO_PROXY: "localhost,127.0.0.1"
|
||||
HF_HUB_ENABLE_HF_TRANSFER: 1
|
||||
container:
|
||||
image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
|
||||
options: --gpus all --shm-size=10g
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Install the current repository
|
||||
run: |
|
||||
pip3 install hf_transfer
|
||||
pip3 install --no-deps -e .[test]
|
||||
- name: Testing LinearCrossEntropy Correction, Computation Time and Memory Consumption
|
||||
run: |
|
||||
python3 tests/kernels/test_linear_cross_entropy.py
|
||||
- name: Testing LinearCrossEntropyTP Correction, Computation Time and Memory Consumption
|
||||
run: |
|
||||
LOW_MEMORY=True torchrun --standalone --nnodes=1 --nproc-per-node=8 tests/kernels/test_linear_cross_entropy_tp.py
|
49 .github/workflows/model.yml vendored
@@ -1,3 +1,35 @@
# # Tests layout

# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...

# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments

# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.

# # Workflow layout

# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#    - `cpu_unit_tests.yml`, which runs pytest on all scripts matching `tests/**/test_*_on_cpu.py`
#    - `gpu_unit_tests.yml`, which runs pytest on all test scripts without the `on_cpu.py` suffix
#    - Since the cpu/gpu unit test workflows by default run everything under `tests`, please make sure new tests are manually excluded from them when
#      - a new workflow yaml is added to `.github/workflows`, or
#      - new tests are added to one of the workflows mentioned in 2.
# name: Check PR Title

name: model_rmpad

on:
@@ -15,14 +47,19 @@ on:
      - "verl/**/*.py"
      # Entrypoints
      - ".github/workflows/model.yml"
      - "tests/utils/gpu_tests/checkpoint/test_fsdp_ckpt.py"
      - "tests/models/test_transformers_ulysses.py"
      - "tests/distributed/run_all.sh"
      - "tests/special_distributed/test_fsdp_ckpt.py"
      - "tests/models/**"
      - "tests/special_distributed/run_all.sh"

# Declare permissions just read content.
permissions:
  contents: read

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
  model_rmpad:
    runs-on: [L20x8]
@@ -53,7 +90,7 @@ jobs:
          pytest -s tests/models/test_transformer.py
      - name: Running FSDP rmpad model tests on 8 L20 GPUs + latest flash_attn
        run: |
          STRATEGY=fsdp torchrun --nproc_per_node=8 tests/utils/gpu_tests/checkpoint/test_fsdp_ckpt.py
          STRATEGY=fsdp torchrun --nproc_per_node=8 tests/special_distributed/test_fsdp_ckpt.py
      - name: Running transformers ulysses tests on 8 L20 GPUs + latest transformers
        run: |
          torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
@@ -79,7 +116,7 @@ jobs:
          torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
      - name: Run distributed test
        run: |
          bash tests/distributed/run_all.sh
          bash tests/special_distributed/run_all.sh

  # TODO: Move this back to model_rmpad once FSDP2 is stable.
  # NOTE: List as an independent job to make rerun easier.
@@ -106,4 +143,4 @@ jobs:
      - name: Running FSDP2 rmpad model tests on 8 L20 GPUs + latest flash_attn
        run: |
          pip3 install --upgrade flash_attn --no-build-isolation
          STRATEGY=fsdp2 torchrun --nproc_per_node=8 tests/utils/gpu_tests/checkpoint/test_fsdp_ckpt.py
          STRATEGY=fsdp2 torchrun --nproc_per_node=8 tests/special_distributed/test_fsdp_ckpt.py
48 .github/workflows/ray_cpu_test.yml vendored
@@ -1,48 +0,0 @@
name: ray_cpu

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
      - v0.*
  pull_request:
    branches:
      - main
      - v0.*
    paths:
      - "verl/single_controller/*.py"
      - .github/workflows/ray_cpu_test.yml
      - "!recipe/**/*.py"

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions just read content.
permissions:
  contents: read

jobs:
  ray_cpu:
    runs-on: ubuntu-latest
    timeout-minutes: 10 # Increase this timeout value as needed
    strategy:
      matrix:
        python-version: ["3.10"]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install the current repository
        run: |
          pip install -e .[test]
          pip install --upgrade "ray>=2.40.0"
      - name: Running ray tests that can be tested on CPU machines
        run: |
          cd tests/ray_cpu
          pytest -s -x --ignore=test_check_worker_alive.py .
51 .github/workflows/sandbox.yml vendored
@@ -1,51 +0,0 @@
name: sandbox

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
      - v0.*
  pull_request:
    branches:
      - main
      - v0.*
    paths:
      - "**/*.py"
      - .github/workflows/sandbox.yml

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions just read content.
permissions:
  contents: read

jobs:
  sandbox:
    runs-on: [L20x8]
    timeout-minutes: 10 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -e .[test,prime]
          pip3 install vllm==0.5.4
      - name: Running sandbox tests on 8 L20 GPUs
        run: |
          cd tests/utils/reward_score
          pytest -s -x .
37 .github/workflows/sanity.yml vendored
@@ -1,3 +1,35 @@
# # Tests layout

# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...

# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments

# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.

# # Workflow layout

# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#    - `cpu_unit_tests.yml`, which runs pytest on all scripts matching `tests/**/test_*_on_cpu.py`
#    - `gpu_unit_tests.yml`, which runs pytest on all test scripts without the `on_cpu.py` suffix
#    - Since the cpu/gpu unit test workflows by default run everything under `tests`, please make sure new tests are manually excluded from them when
#      - a new workflow yaml is added to `.github/workflows`, or
#      - new tests are added to one of the workflows mentioned in 2.
# name: Check PR Title

name: sanity

on:
@@ -14,6 +46,7 @@ on:
    paths:
      - "**/*.py"
      - .github/workflows/sanity.yml
      - "tests/special_sanity/**"

# Cancel jobs on the same ref if a new one is triggered
concurrency:
@@ -42,10 +75,10 @@ jobs:
          pip install -e .[test]
      - name: Run sanity test
        run: |
          pytest -s -x tests/sanity
          pytest -s -x tests/special_sanity
      - name: Run license test
        run: |
          python3 tests/sanity/check_license.py --directory .
          python3 tests/special_sanity/check_license.py --directory .
      - name: Assert naming convention
        run: |
          if grep -rIn --exclude-dir=.git --exclude-dir=.github --exclude-dir=venv --exclude-dir=__pycache__ 'veRL' .; then
34 .github/workflows/sgl.yml vendored
@@ -1,3 +1,34 @@
# # Tests layout

# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...

# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments

# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.

# # Workflow layout

# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#    - `cpu_unit_tests.yml`, which runs pytest on all scripts matching `tests/**/test_*_on_cpu.py`
#    - `gpu_unit_tests.yml`, which runs pytest on all test scripts without the `on_cpu.py` suffix
#    - Since the cpu/gpu unit test workflows by default run everything under `tests`, please make sure new tests are manually excluded from them when
#      - a new workflow yaml is added to `.github/workflows`, or
#      - new tests are added to one of the workflows mentioned in 2.

name: sgl

on:
@@ -85,4 +116,5 @@ jobs:
      - name: Test the latest SGLang Rollout async with search tool
        run: |
          cd tests/workers/rollout
          pytest -s test_sglang_async_rollout_search_tools.py
          pytest -s test_sglang_async_rollout_search_tools.py
      # Note(haibin.lin): for any new test, please update gpu_unit_tests.yaml to avoid repeated tests
@@ -1,52 +0,0 @@
name: single_controller_unit_tests

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
      - v0.*
  pull_request:
    branches:
      - main
      - v0.*
    paths:
      - "verl/single_controller/*.py"
      - .github/workflows/ray_gpu_test.yml
      - "!recipe/**/*.py"

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions just read content.
permissions:
  contents: read

jobs:
  single_controller_unit_tests:
    runs-on: [L20x8]
    timeout-minutes: 10 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip install -e .[test]
          pip install --upgrade "ray>=2.40.0"
      - name: Running ray tests that need 8 GPUs
        run: |
          cd tests/ray_gpu
          pytest -s -x --ignore=test_rvdz.py .
47 .github/workflows/utils_gpu_test.yml vendored
@@ -1,47 +0,0 @@
name: utils_gpu_test

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
      - v0.*
  pull_request:
    branches:
      - main
      - v0.*
    paths:
      - "**/*.py"
      - .github/workflows/utils_gpu_test.yml
      - "!recipe/**/*.py"

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions just read content.
permissions:
  contents: read

jobs:
  utils_gpu_test:
    runs-on: [L20x8]
    timeout-minutes: 20 # Increase this timeout value as needed
    container:
      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install the current repository
        run: |
          pip install -e .[test]
      - name: Running utils gpu tests
        run: |
          cd tests/utils/gpu_tests
          pytest -s -x --ignore=dataset/ --ignore=checkpoint/ .
41 .github/workflows/vllm.yml vendored
@@ -1,3 +1,35 @@
# # Tests layout

# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...

# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments

# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.

# # Workflow layout

# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#    - `cpu_unit_tests.yml`, which runs pytest on all scripts matching `tests/**/test_*_on_cpu.py`
#    - `gpu_unit_tests.yml`, which runs pytest on all test scripts without the `on_cpu.py` suffix
#    - Since the cpu/gpu unit test workflows by default run everything under `tests`, please make sure new tests are manually excluded from them when
#      - a new workflow yaml is added to `.github/workflows`, or
#      - new tests are added to one of the workflows mentioned in 2.

name: vllm

on:
@@ -28,8 +60,8 @@ on:
      - "!**/*sglang*"
      # Entrypoints
      - ".github/workflows/vllm.yml"
      - "tests/e2e/generation"
      - "tests/rollout"
      - "tests/special_e2e/generation"
      - "tests/workers/rollout"
      - "verl/trainer/main_generation.py"
      - "verl/trainer/config/generation.yaml"

@@ -82,13 +114,13 @@ jobs:
          torchrun --standalone --nnodes=1 --nproc_per_node=4 $(which pytest) -s test_vllm_spmd.py
      - name: Run Qwen 0.5B generation test
        run: |
          cd tests/e2e/generation
          cd tests/special_e2e/generation
          export OUTPUT_PATH="${HOME}/data/gen/qwen_05_gen_test.parquet"
          MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct NGPUS_PER_NODE=4 GEN_TP=2 bash ./run_gen_qwen05.sh
          rm -rf "${OUTPUT_PATH}"
      - name: Run Qwen 0.5B generation test when world_size == 1
        run: |
          cd tests/e2e/generation
          cd tests/special_e2e/generation
          export OUTPUT_PATH="${HOME}/data/gen/qwen_05_gen_test.parquet"
          MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct NGPUS_PER_NODE=1 GEN_TP=1 bash ./run_gen_qwen05.sh
          rm -rf "${OUTPUT_PATH}"
@@ -96,3 +128,4 @@ jobs:
        run: |
          pip3 install --upgrade vllm==0.8.3 tensordict==0.7.2
          pytest -svvv tests/workers/rollout/test_vllm_chat_scheduler.py
      # Note(haibin.lin): for any new test, please update gpu_unit_tests.yaml to avoid repeated tests
@@ -208,7 +208,7 @@ Unit Tests

e2e Tests
----------
we provide e2e test scripts in the `tests/e2e` folder, named `tests/e2e/run_gsm8k_fsdp_sgl_multiturn_sf_tool.sh`
we provide e2e test scripts in the `tests/special_e2e` folder, named `tests/special_e2e/run_gsm8k_fsdp_sgl_multiturn_sf_tool.sh`

By setting 'trainer.rollout_data_dir' you can dump the rollout data to local disk. Here is a sample taken from the rollout data:
@@ -1,7 +1,7 @@
set -x

data_path=$HOME/data/rlhf/gsm8k/test.parquet
save_path=$HOME/data/rlhf/math/deepseek_v2_lite_gen_test.parquet
data_path=$HOME/data/gsm8k/test.parquet
save_path=$HOME/data/gsm8k/deepseek_v2_lite_gen_test.parquet
model_path=deepseek-ai/deepseek-llm-7b-chat

python3 -m verl.trainer.main_generation \
2 setup.py
@@ -45,7 +45,7 @@ install_requires = [

TEST_REQUIRES = ["pytest", "pre-commit", "py-spy"]
PRIME_REQUIRES = ["pyext"]
GEO_REQUIRES = ["mathruler"]
GEO_REQUIRES = ["mathruler", "torchvision", "qwen_vl_utils"]
GPU_REQUIRES = ["liger-kernel", "flash-attn"]
MATH_REQUIRES = ["math-verify"]  # Add math-verify as an optional dependency
VLLM_REQUIRES = ["tensordict<=0.6.2", "vllm<=0.8.5"]
30 tests/README.md Normal file
@@ -0,0 +1,30 @@
# Tests layout

Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
- `tests/trainer` for testing functionality related to `verl/trainer`
- `tests/models` for testing functionality related to `verl/models`
- ...

There are a few folders with the `special_` prefix, created for special purposes:
- `special_distributed`: unit tests that must run with multiple GPUs
- `special_e2e`: end-to-end tests with training/generation scripts
- `special_npu`: tests for NPUs
- `special_sanity`: a suite of quick sanity tests
- `special_standalone`: a set of tests designed to run in dedicated environments

Accelerators for tests
- By default tests are run with GPU available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
- Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment, as shown in the sketch after this list.

# Workflow layout

All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
3. End-to-end tests: `e2e_*.yml`
4. Unit tests
   - `cpu_unit_tests.yml`, which runs pytest on all scripts matching `tests/**/test_*_on_cpu.py`
   - `gpu_unit_tests.yml`, which runs pytest on all test scripts without the `on_cpu.py` suffix
   - Since the cpu/gpu unit test workflows by default run everything under `tests`, please make sure new tests are manually excluded from them when
     - a new workflow yaml is added to `.github/workflows`, or
     - new tests are added to one of the workflows mentioned in 2.
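To make the naming rule concrete, here is a minimal sketch of a CPU-only unit test (the file path and its contents are hypothetical, for illustration only). Because the file name ends in `_on_cpu.py`, it is collected by `cpu_unit_tests.yml` and excluded from the GPU suite:

```python
# tests/utils/test_left_pad_on_cpu.py -- hypothetical path, for illustration only.
# The `_on_cpu.py` suffix routes this file to cpu_unit_tests.yml;
# gpu_unit_tests.yml skips it via --ignore-glob='*on_cpu.py'.


def left_pad(ids: list[int], length: int, pad_id: int) -> list[int]:
    """Toy helper under test: left-pad a token id list to a fixed length."""
    return [pad_id] * (length - len(ids)) + ids


def test_left_pad_on_cpu():
    # Pure-Python assertions only, so the test runs fine on a CPU-only runner.
    assert left_pad([1, 2, 3], 5, 0) == [0, 0, 1, 2, 3]
    assert left_pad([1, 2, 3], 3, 0) == [1, 2, 3]
```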
@@ -1,46 +0,0 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from torch.utils import data

from tests.e2e.envs.digit_completion import DigitCompletion

if __name__ == "__main__":
    simple_task = DigitCompletion(max_number=9, max_diff=9, max_num_in_response=9)
    all_prompts = simple_task.get_all_prompts()

    # 21 * 6 * 4
    train_data, test_data = data.random_split(all_prompts, lengths=[0.8, 0.2])
    train_data = list(train_data)
    test_data = list(test_data)

    train_data = [[{"role": "user", "content": str(item)}] for item in train_data]
    test_data = [[{"role": "user", "content": str(item)}] for item in test_data]

    print(f"Size of train: {len(train_data)}, size of test: {len(test_data)}")

    train_data = {"prompt": train_data}
    test_data = {"prompt": test_data}

    model_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)))

    import pandas as pd

    train_data_frame = pd.DataFrame(train_data)
    test_data_frame = pd.DataFrame(test_data)

    train_data_frame.to_parquet(os.path.join(model_folder, "train.parquet"))
    test_data_frame.to_parquet(os.path.join(model_folder, "test.parquet"))
@@ -1,29 +0,0 @@
{
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": null,
  "eos_token_id": 1,
  "hidden_act": "silu",
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 344,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "num_key_value_heads": 4,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.3",
  "use_cache": true,
  "vocab_size": 16
}
@@ -1,58 +0,0 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Create a random model and tokenizer for PPO training
"""

import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaConfig

from tests.e2e.envs.digit_completion import CharTokenizer

tokenizer = CharTokenizer(
    characters=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ",", ":"],
    model_max_length=2048,
    chat_template="{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set role = message['role'] %}{{ message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ sep_token }}{% endif %}",  # noqa: E501
)

config = LlamaConfig(
    vocab_size=(tokenizer.vocab_size + 16 - 1) // 16 * 16,
    hidden_size=128,
    intermediate_size=344,
    num_hidden_layers=4,
    num_attention_heads=4,
    num_key_value_heads=4,
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)

model_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)))
os.makedirs(model_folder, exist_ok=True)

model.save_pretrained(model_folder)

tokenizer_folder = model_folder
tokenizer.save_pretrained(tokenizer_folder)

load_tokenizer = AutoTokenizer.from_pretrained(tokenizer_folder)

chat = [{"role": "user", "content": "1,0:2,3"}]

load_tokenizer.padding_side = "left"
print(load_tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=True, max_length=10, padding="max_length"))
@@ -1,6 +0,0 @@
{
  "_from_model_config": true,
  "eos_token_id": 1,
  "pad_token_id": 2,
  "transformers_version": "4.43.3"
}
@@ -1,18 +0,0 @@
{
  "char_ords": [
    48,
    49,
    50,
    51,
    52,
    53,
    54,
    55,
    56,
    57,
    44,
    58
  ],
  "model_max_length": 2048,
  "chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set role = message['role'] %}{{ message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ sep_token }}{% endif %}"
}
@@ -1,30 +0,0 @@
# Digit completion

This is an example of solving a digit completion problem. The problem is defined as below:

The prompt is a sequence of numbers with a fixed difference. The agent's goal is to complete the next N numbers.
If the max number is exceeded, the value wraps around (i.e. it is taken modulo max_number + 1).

For example,
- prompt = [1, 2, 3]
- N = 5
- max_number = 6

The response should be [4, 5, 6, 7 mod 7, 8 mod 7] = [4, 5, 6, 0, 1].

# Environment definition

The core definition of the task is defined in tests/e2e/envs/digit_completion/task.py

It is highly recommended to take a look at it for better understanding.

# Run experiments

An example of running the task is provided in `tests/e2e/run_ray_trainer.sh`.

```bash
bash tests/e2e/run_ray_trainer.sh
```
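As a sanity check on the wrap-around rule above, here is a small self-contained sketch (`complete_sequence` is a hypothetical helper written for this illustration, not part of the deleted environment code) that reproduces the worked example:

```python
def complete_sequence(prompt: list[int], n: int, max_number: int) -> list[int]:
    """Continue an arithmetic sequence for n steps, wrapping modulo (max_number + 1)."""
    diff = prompt[1] - prompt[0]
    last = prompt[-1]
    return [(last + (i + 1) * diff) % (max_number + 1) for i in range(n)]


# Reproduces the README example: prints [4, 5, 6, 0, 1]
print(complete_sequence([1, 2, 3], n=5, max_number=6))
```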
@@ -1,161 +0,0 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Using FSDPTrainer
"""

import os

import hydra
import ray
import torch
from transformers import AutoTokenizer

from verl import DataProto
from verl.trainer.ppo.ray_trainer import RayPPOTrainer
from verl.utils.fs import copy_to_local


def make_reward_function(tokenizer, num_examine):
    def arithmetic_sequence_reward_function(data: DataProto, return_dict: bool = False):
        from tests.e2e.envs.digit_completion.task import compute_reward

        reward_tensor = torch.zeros_like(data.batch["responses"], dtype=torch.float32)

        for i in range(data.batch.batch_size[0]):
            data_item = data[i]  # DataProtoItem

            prompt_ids = data_item.batch["prompts"]

            prompt_length = prompt_ids.shape[-1]

            # extract raw prompt
            valid_prompt_length = data_item.batch["attention_mask"][:prompt_length].sum()
            valid_prompt_ids = prompt_ids[-valid_prompt_length:]

            # extract response
            response_ids = data_item.batch["responses"]
            response_length = response_ids.shape[-1]
            response_mask = data.batch["attention_mask"][i][-response_length:]
            valid_response_length = data_item.batch["attention_mask"][prompt_length:].sum()
            valid_response_ids = response_ids[:valid_response_length]

            # decode
            prompt = tokenizer.decode(valid_prompt_ids)
            response = tokenizer.decode(valid_response_ids)
            # remove bos and eos
            prompt = prompt.replace(tokenizer.sep_token, "")
            response = response.replace(tokenizer.eos_token, "")
            if i < num_examine:
                print(prompt, response)

            reward_output = compute_reward(prompt, response)
            dense_reward = reward_output[0].tolist()
            ground_truth_response = reward_output[1]["ground_truth_response"]
            last_reward = dense_reward[-1] if len(dense_reward) > 0 else 1 if len(ground_truth_response) == 0 else 0

            # pad to response_length
            for _ in range(reward_tensor.shape[-1] - len(dense_reward)):
                dense_reward.append(last_reward)

            dense_reward = torch.as_tensor(dense_reward, dtype=torch.float32, device=reward_tensor.device)
            reward_tensor[i] = dense_reward * response_mask

        if return_dict:
            return {"reward_tensor": reward_tensor}
        else:
            return reward_tensor

    return arithmetic_sequence_reward_function


@hydra.main(config_path="../../../../verl/trainer/config", config_name="ppo_trainer", version_base=None)
def main(config):
    ray.init(
        runtime_env={
            "env_vars": {
                "MEGATRON_USE_CUDA_TIMER": "0",
                "MEGATRON_START_PROCESS_TIMER": "False",
                "TOKENIZERS_PARALLELISM": "true",
                "NCCL_DEBUG": "WARN",
            }
        },
        num_cpus=config.ray_init.num_cpus,
    )

    # print initial config
    from pprint import pprint

    from omegaconf import OmegaConf

    pprint(OmegaConf.to_container(config, resolve=True))  # resolve=True will eval symbol values

    # print the config after batch_size normalization
    print("Config after normalizing batch_size")
    pprint(OmegaConf.to_container(config, resolve=True))  # resolve=True will eval symbol values

    # download the checkpoint from hdfs
    local_path = copy_to_local(config.actor_rollout_ref.model.path)
    local_path = os.path.expanduser(local_path)
    # instantiate tokenizer
    from transformers import LlamaConfig

    from tests.e2e.envs.digit_completion import CharTokenizer

    AutoTokenizer.register(LlamaConfig, CharTokenizer, exist_ok=True)
    tokenizer = AutoTokenizer.from_pretrained(local_path)
    print(f"Tokenizer vocab_size: {tokenizer.vocab_size}")

    # define worker classes
    from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
    from verl.workers.fsdp_workers import ActorRolloutRefWorker, CriticWorker

    role_worker_mapping = {
        Role.ActorRollout: ray.remote(ActorRolloutRefWorker),
        Role.Critic: ray.remote(CriticWorker),
    }

    global_pool_id = "global_pool"
    resource_pool_spec = {
        global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
    }
    mapping = {
        Role.ActorRollout: global_pool_id,
        Role.Critic: global_pool_id,
    }

    # add the reference policy only when a KL term is actually used
    if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
        role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
        mapping[Role.RefPolicy] = global_pool_id

    reward_fn = make_reward_function(tokenizer=tokenizer, num_examine=1)

    resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)

    trainer = RayPPOTrainer(
        config=config,
        tokenizer=tokenizer,
        role_worker_mapping=role_worker_mapping,
        resource_pool_manager=resource_pool_manager,
        reward_fn=reward_fn,
        val_reward_fn=reward_fn,
    )
    trainer.init_workers()
    trainer.fit()


if __name__ == "__main__":
    main()
@@ -1,40 +0,0 @@
#!/usr/bin/env bash

set -e -x

OUTPUT_FILE="/tmp/output_ray_trainer.txt"

export PATH=$PATH:~/.local/bin

rm -rf $OUTPUT_FILE
python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \
    algorithm.adv_estimator=gae \
    data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \
    data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \
    data.train_batch_size=800 \
    data.max_prompt_length=16 \
    data.max_response_length=32 \
    data.return_raw_input_ids=True \
    actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \
    actor_rollout_ref.model.external_lib=tests.e2e.envs.digit_completion \
    actor_rollout_ref.model.use_fused_kernels=True \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=128 \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.optim.lr=1e-4 \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=200 \
    actor_rollout_ref.rollout.name=hf \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    critic.ppo_micro_batch_size_per_gpu=128 \
    critic.model.path=tests/e2e/arithmetic_sequence/model \
    critic.optim.lr=1e-3 \
    algorithm.use_kl_in_reward=False \
    trainer.total_epochs=200 \
    trainer.experiment_name=arithmetic_sequences \
    trainer.logger=['console'] \
    trainer.n_gpus_per_node=1 \
    trainer.test_freq=1 \
    trainer.save_freq=110 | tee $OUTPUT_FILE;

python3 tests/e2e/check_results.py --output_file=$OUTPUT_FILE
rm -rf $OUTPUT_FILE
@@ -1,41 +0,0 @@
#!/usr/bin/env bash

set -e -x

OUTPUT_FILE="/tmp/output_ray_trainer.txt"

export PATH=$PATH:~/.local/bin

rm -rf $OUTPUT_FILE
python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \
    algorithm.adv_estimator=gae \
    data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \
    data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \
    data.train_batch_size=800 \
    data.val_batch_size=200 \
    data.max_prompt_length=16 \
    data.max_response_length=32 \
    data.return_raw_input_ids=True \
    actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \
    actor_rollout_ref.model.external_lib=tests.e2e.envs.digit_completion \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=128 \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.optim.lr=1e-4 \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=200 \
    actor_rollout_ref.rollout.name=hf \
    actor_rollout_ref.rollout.use_fire_sampling=True \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    critic.ppo_micro_batch_size_per_gpu=128 \
    critic.model.path=tests/e2e/arithmetic_sequence/model \
    critic.optim.lr=1e-3 \
    algorithm.use_kl_in_reward=False \
    trainer.total_epochs=200 \
    trainer.experiment_name=arithmetic_sequences \
    trainer.logger=['console'] \
    trainer.n_gpus_per_node=1 \
    trainer.test_freq=1 \
    trainer.save_freq=110 | tee $OUTPUT_FILE;

python3 tests/e2e/check_results.py --output_file=$OUTPUT_FILE --target 0.19
rm -rf $OUTPUT_FILE
@@ -1,20 +0,0 @@
#!/usr/bin/env bash

set -e -x

huggingface-cli download Qwen/Qwen2.5-0.5B --local-dir $HOME/models/Qwen/Qwen2.5-0.5B

python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \
    algorithm.adv_estimator=gae \
    data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \
    data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \
    actor_rollout_ref.model.use_fused_kernels=True \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.model.tokenizer_path=tests/e2e/arithmetic_sequence/model \
    critic.model.path=Qwen/Qwen2.5-0.5B \
    critic.model.use_remove_padding=True \
    algorithm.use_kl_in_reward=False \
    trainer.total_epochs=1
@@ -1,45 +0,0 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def test_flash_attn_cross_entropy():
    import torch
    from flash_attn.ops.triton.cross_entropy import cross_entropy_loss
    from torch import nn

    from verl.utils.debug import log_gpu_memory_usage
    from verl.utils.torch_functional import logprobs_from_logits_naive

    log_gpu_memory_usage("At start")

    hidden_states = torch.randn(size=(2048, 5120), device="cuda", requires_grad=True, dtype=torch.bfloat16)

    linear = nn.Linear(in_features=5120, out_features=155136, bias=False, device="cuda", dtype=torch.bfloat16)

    logits = linear(hidden_states)

    # logits = logits.float()
    labels = torch.randint(low=0, high=155136, size=(2048,), device="cuda")

    log_gpu_memory_usage("before computation")
    # output = checkpoint.checkpoint(logprobs_from_logits, logits, labels, use_reentrant=True)
    output = -cross_entropy_loss(logits, labels)[0]
    # output = logprobs_from_logits(logits, labels)
    log_gpu_memory_usage("After forward")
    output.sum().backward()
    log_gpu_memory_usage("After backward")

    groundtruth = logprobs_from_logits_naive(logits.float(), labels)

    torch.testing.assert_close(output, groundtruth)
@@ -1,96 +0,0 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
import torch
from flash_attn.bert_padding import unpad_input

from verl.utils.model import create_random_mask


def test_log_probs_from_logits_response_rmpad():
    from verl.utils.torch_functional import log_probs_from_logits_response, log_probs_from_logits_response_rmpad

    vocab_size = 32000
    batch_size = 2
    prompt_length = 256
    response_length = 256

    input_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, prompt_length + response_length), device="cuda")
    attention_mask = create_random_mask(input_ids=input_ids, max_ratio_of_left_padding=0.2, max_ratio_of_valid_token=0.8, min_ratio_of_valid_token=0.6)

    response_mask = attention_mask[:, -response_length:]

    assert torch.all(response_mask[:, 0] == 1)

    logits = torch.randn(batch_size, prompt_length + response_length, vocab_size, device="cuda")
    logits_rmpad = unpad_input(logits, attention_mask)[0]

    expected_output = log_probs_from_logits_response(input_ids=input_ids, logits=logits, response_length=response_length)
    actual_output = log_probs_from_logits_response_rmpad(input_ids=input_ids, attention_mask=attention_mask, logits_rmpad=logits_rmpad, response_length=response_length)

    # This should align bitwise, as the operation only contains gather operators
    assert torch.all(torch.eq(actual_output * response_mask, expected_output * response_mask))


@pytest.mark.parametrize("dtype", [torch.float64, torch.float32, torch.float16, torch.bfloat16])
def test_logprobs_from_logits_v2(dtype):
    from verl.utils.torch_functional import logprobs_from_logits_naive, logprobs_from_logits_v2

    vocab_size = 32000
    batch_size = 2
    seq_len = 512

    labels = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len), device="cuda")
    logits = torch.randn(batch_size, seq_len, vocab_size, device="cuda", dtype=dtype)

    expected_output = logprobs_from_logits_naive(labels=labels, logits=logits)
    actual_output = logprobs_from_logits_v2(labels=labels, logits=logits)

    if dtype in [torch.float16, torch.bfloat16]:  # half precision falls back to an exactly equivalent method
        assert torch.equal(actual_output, expected_output)
    else:  # small numerical difference when using the gather / logsumexp approach
        torch.testing.assert_close(actual_output, expected_output, rtol=1e-5, atol=1e-5)


def test_lr_scheduler():
    from torch import nn

    model = nn.Linear(10, 10)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    from verl.utils.torch_functional import get_constant_schedule_with_warmup

    constant_lr = get_constant_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=2)

    lr_lst = []

    for _ in range(5):
        lr_lst.append(constant_lr.get_last_lr()[0])
        constant_lr.step()

    torch.testing.assert_close(lr_lst, [0.0, 0.0005, 0.001, 0.001, 0.001])

    from verl.utils.torch_functional import get_cosine_schedule_with_warmup

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    cosine_lr = get_cosine_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=2, num_training_steps=5, min_lr_ratio=0.1)

    lr_lst = []

    for _ in range(5):
        lr_lst.append(cosine_lr.get_last_lr()[0])
        cosine_lr.step()

    torch.testing.assert_close(lr_lst, [0.0, 0.0005, 0.001, 0.0007750000000000002, 0.0003250000000000002])
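The asserted schedule values above pin down the scheduler's shape: two steps of linear warmup, then a cosine decay toward `min_lr_ratio * base_lr`. Here is a standalone sketch of that formula (inferred from the asserted numbers, not copied from `verl.utils.torch_functional`) which reproduces the list:

```python
import math


def warmup_cosine_lr(step: int, base_lr: float = 1e-3, warmup: int = 2, total: int = 5, min_lr_ratio: float = 0.1) -> float:
    if step < warmup:
        # Linear warmup from 0 up to base_lr.
        return base_lr * step / warmup
    # Cosine decay from base_lr down to base_lr * min_lr_ratio.
    progress = (step - warmup) / (total - warmup)
    coef = 0.5 * (1.0 + math.cos(math.pi * progress))
    return base_lr * (min_lr_ratio + (1.0 - min_lr_ratio) * coef)


# Prints [0.0, 0.0005, 0.001, 0.000775, 0.000325], matching the assert above.
print([round(warmup_cosine_lr(s), 6) for s in range(5)])
```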
@@ -1,47 +0,0 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import subprocess
import time


def test():
    wait_time = 10

    my_env = os.environ.copy()
    my_env["WAIT_TIME"] = str(wait_time)

    p = subprocess.Popen(["python3", "-u", "./check_worker_alive/main.py"], env=my_env, stdout=subprocess.PIPE)

    count = 0
    while b"foo started" not in p.stdout.read():
        time.sleep(1)
        count += 1
        if count > 40:
            raise RuntimeError("timeout waiting for foo to start in check_worker_alive/main.py")

    print(
        time.time(),
        f"wait 1.5x wait_time ({wait_time * 1.5}s) to let the signal reach the process, while staying below the process wait time",
    )
    time.sleep(wait_time * 1.5)
    print(time.time(), "start checking")
    assert p.poll() is not None, f"process {p} still alive, expecting signal raised abort"
    assert p.returncode != 0, f"process {p} exit with code 0, expecting non-zero exit code"
    print("test passed")


if __name__ == "__main__":
    test()
13 tests/single_controller/__init__.py Normal file
@@ -0,0 +1,13 @@
# Copyright 2025 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
1 tests/special_distributed/README.md Normal file
@@ -0,0 +1 @@
This folder is reserved for unit tests (instead of end-to-end tests) that require multiple GPUs.
@@ -15,4 +15,4 @@
#!/usr/bin/env bash

set -e -x
torchrun --nproc-per-node=4 --standalone tests/distributed/test_tensor_dict.py
torchrun --nproc-per-node=4 --standalone tests/special_distributed/test_tensor_dict.py
1 tests/special_e2e/README.md Normal file
@@ -0,0 +1 @@
This folder is reserved for end-to-end tests that typically require multiple GPUs.
@@ -138,7 +138,7 @@ python3 -m verl.trainer.main_ppo \
    | tee "${output_file}"

if [ "${CUSTOM_REWARD_FN}" = "True" ]; then
    python3 tests/e2e/check_custom_rwd_fn.py --output_file="${output_file}"
    python3 tests/special_e2e/check_custom_rwd_fn.py --output_file="${output_file}"
    check_exit_code=$?
    rm -rf "${reward_fn_file_path}"
    rm -rf "${output_file}"
@@ -33,14 +33,14 @@ if not re_modules:
    print(f"❌ Invalid PR title: '{pr_title}'")
    print("Expected format: [module] type: description")
    print(f"Allowed modules: {', '.join(allowed_modules)}")
    sys.exit(1)
    raise Exception("Invalid PR title")
else:
    modules = re.findall(r"[a-z]+", re_modules.group(1).lower())
    if not all(module in allowed_modules for module in modules):
        invalid_modules = [module for module in modules if module not in allowed_modules]
        print(f"❌ Invalid modules: {', '.join(invalid_modules)}")
        print(f"Allowed modules: {', '.join(allowed_modules)}")
        sys.exit(1)
        raise Exception("Invalid PR title")

types_pattern = "|".join(re.escape(t) for t in allowed_types)
re_types_pattern = re.compile(rf"^\[[a-z_,\s]+\]\s+({types_pattern}):\s+.+$", re.IGNORECASE)
@@ -50,7 +50,7 @@ if not match:
    print(f"❌ Invalid PR title: '{pr_title}'")
    print("Expected format: [module] type: description")
    print(f"Allowed types: {', '.join(allowed_types)}")
    sys.exit(1)
    raise Exception("Invalid PR title")

change_type = match.group(1).lower()
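For reference, the two-stage check above validates titles of the form `[module] type: description`. A condensed, self-contained sketch of the same logic (the `allowed_modules` and `allowed_types` lists here are illustrative subsets, not the repo's full lists):

```python
import re

# Illustrative subsets; the real lists live in tests/special_sanity/check_pr_title.py.
allowed_modules = ["ci", "trainer", "rollout", "misc"]
allowed_types = ["feat", "fix", "refactor", "chore", "doc"]


def is_valid_title(pr_title: str) -> bool:
    # Stage 1: a leading [module] (or [mod1, mod2]) tag containing known modules.
    re_modules = re.match(r"^\[([a-z_,\s]+)\]", pr_title.lower())
    if not re_modules:
        return False
    modules = re.findall(r"[a-z]+", re_modules.group(1))
    if not all(m in allowed_modules for m in modules):
        return False
    # Stage 2: a known change type followed by ": description".
    types_pattern = "|".join(re.escape(t) for t in allowed_types)
    return re.match(rf"^\[[a-z_,\s]+\]\s+({types_pattern}):\s+.+$", pr_title, re.IGNORECASE) is not None


print(is_valid_title("[ci] refactor: setup testing guidance"))  # True
print(is_valid_title("update readme"))                          # False
```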
@@ -71,5 +71,6 @@ def test_trainer_config_doc():
        print("Please read the top block of `verl/trainer/config/ppo_trainer.yaml` to see format rules:\n")
        for err in validation_errors:
            print(" -", err)
        raise Exception("Please fix documentation format.")
    else:
        print("YAML format check passed ✅")
1 tests/special_standalone/README.md Normal file
@@ -0,0 +1 @@
The standalone test folder is reserved for tests that require a dedicated environment (e.g. memory stress tests).
Some files were not shown because too many files have changed in this diff.