[ci] refactor: setup testing guidance (#1958)

This commit is contained in:
H
2025-06-12 06:16:58 -07:00
committed by GitHub
parent a0673f0c89
commit 5fa911b3ce
125 changed files with 680 additions and 1102 deletions

View File

@ -1,5 +1,34 @@
# .github/workflows/check-pr-title.yml
name: Check PR Title
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments
# # Accelerators for tests
# - By default, tests run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, which runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, which runs pytest on all test scripts without the `on_cpu.py` suffix
# - Since the CPU/GPU unit test workflows run all tests under `tests/` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to the workflows mentioned in 2.
on:
pull_request:
@ -18,6 +47,6 @@ jobs:
python-version: '3.11'
- name: Run PR title checker
run: python3 tests/sanity/check_pr_title.py
run: python3 tests/special_sanity/check_pr_title.py
env:
PR_TITLE: ${{ github.event.pull_request.title }}

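The step above passes the PR title to a checker script through the `PR_TITLE` environment variable. As a rough illustration only, here is a minimal sketch of how such a checker could work, assuming a `[module] type: subject` convention like this commit's own title; the actual rules live in `tests/special_sanity/check_pr_title.py`:

```python
# Hypothetical sketch; the real checker is tests/special_sanity/check_pr_title.py.
import os
import re
import sys

# Assumed convention: "[module] type: subject", e.g. "[ci] refactor: setup testing guidance"
PATTERN = re.compile(r"^\[[^\]]+\] \w+: .+")

def main() -> None:
    title = os.environ.get("PR_TITLE", "")
    if not PATTERN.match(title):
        sys.exit(f"Invalid PR title: {title!r}")
    print("PR title OK")

if __name__ == "__main__":
    main()
```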
View File

@ -1,3 +1,36 @@
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments
# # Accelerators for tests
# - By default, tests run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, which runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, which runs pytest on all test scripts without the `on_cpu.py` suffix
# - Since the CPU/GPU unit test workflows run all tests under `tests/` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to the workflows mentioned in 2.
name: checkpoint_converter
# latest version: Megatron-LM core_r0.11.0 https://github.com/NVIDIA/Megatron-LM/tree/core_r0.11.0
@ -27,7 +60,7 @@ on:
- ".github/workflows/checkpoint_converter.yml"
- ".github/workflows/e2e_ppo_trainer_megatron.yml"
- "examples/data_preprocess/gsm8k.py"
- "tests/e2e/run_ppo_trainer_megatron.sh"
- "tests/special_e2e/run_ppo_trainer_megatron.sh"
- "verl/trainer/main_ppo.py"
- "verl/trainer/config/ppo_megatron_trainer.yaml"

View File

@ -1,3 +1,35 @@
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments
# # Accelerators for tests
# - By default, tests run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, which runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, which runs pytest on all test scripts without the `on_cpu.py` suffix
# - Since the CPU/GPU unit test workflows run all tests under `tests/` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to the workflows mentioned in 2.
name: cpu_unit_tests
on:
@ -40,20 +72,12 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install the current repository
run: |
pip install -e .[test]
- name: Running data proto test
pip install -e .[test,prime,geo]
pip install --upgrade "ray>=2.40.0" pillow
- name: Running CPU unit tests
run: |
cd tests
pytest -s -x test_protocol.py
- name: running utils cpu tests
run: |
cd tests/utils/cpu_tests
pytest -s -x .
- name: Running trainer tests
run: |
cd tests/trainer
pytest -s -x .
- name: Running worker tests
run: |
cd tests/workers/reward_manager
pytest -s -x .
[ ! -d "$HOME/verl-data" ] && git clone --depth 1 https://github.com/eric-haibin-lin/verl-data ~/verl-data
python3 examples/data_preprocess/geo3k.py
echo '[pytest]' > pytest.ini
echo 'python_files = *_on_cpu.py' >> pytest.ini
pytest -s -x tests/

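The rewritten step above generates a `pytest.ini` that overrides `python_files`, so only scripts named `*_on_cpu.py` are collected from `tests/`. The same selection can be reproduced locally without writing a file, since pytest's `-o` flag overrides any ini option in-process; a minimal sketch:

```python
# Local equivalent of the CI step above: pytest's -o flag overrides the
# python_files collection pattern, so no pytest.ini needs to be written.
import sys

import pytest

if __name__ == "__main__":
    sys.exit(pytest.main(["-s", "-x", "-o", "python_files=*_on_cpu.py", "tests/"]))
```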
View File

@ -1,58 +0,0 @@
name: dataset
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
paths:
- "verl/utils/**/*.py"
- .github/workflows/dataset.yml
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
jobs:
ray:
runs-on: [L20x8]
timeout-minutes: 10 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container:
image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
options: --gpus all --shm-size=10g
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip install -e .[test]
pip install --upgrade "ray>=2.40.0"
pip install cupy-cuda12x
- name: Running dataset tests
run: |
[ ! -d "$HOME/verl-data" ] && git clone --depth 1 https://github.com/eric-haibin-lin/verl-data ~/verl-data
python3 examples/data_preprocess/geo3k.py
pytest -s -x tests/utils/gpu_tests/dataset/test_rl_dataset.py
pytest -s -x tests/utils/gpu_tests/dataset/test_sft_dataset.py
# pytest -s -x tests/utils/gpu_tests/dataset/test_rm_dataset.py
- name: Running ray test using cupy (move it to L20 when dockerfile ready)
run: |
cd tests/ray_gpu
pytest -s -x test_rvdz.py

View File

@ -25,7 +25,7 @@ on:
# Entrypoints
- ".github/workflows/e2e_prime.yml"
- "examples/data_preprocess/gsm8k.py"
- "tests/e2e/run_prime.sh"
- "tests/special_e2e/run_prime.sh"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
@ -63,4 +63,4 @@ jobs:
- name: Running GSM8K E2E with prime alg
run: |
ray stop --force
bash tests/e2e/run_prime.sh
bash tests/special_e2e/run_prime.sh

View File

@ -1,3 +1,35 @@
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments
# # Accelerators for tests
# - By default, tests run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, which runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, which runs pytest on all test scripts without the `on_cpu.py` suffix
# - Since the CPU/GPU unit test workflows run all tests under `tests/` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to the workflows mentioned in 2.
name: doc_test
on:

View File

@ -1,3 +1,35 @@
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments
# # Accelerators for tests
# - By default, tests run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, which runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, which runs pytest on all test scripts without the `on_cpu.py` suffix
# - Since the CPU/GPU unit test workflows run all tests under `tests/` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to the workflows mentioned in 2.
name: e2e_ascend
on:
@ -24,7 +56,7 @@ on:
- ".github/workflows/e2e_ascend.yml"
- "examples/data_preprocess/gsm8k.py"
- "examples/data_preprocess/geo3k.py"
- "tests/e2e/ppo_trainer"
- "tests/special_e2e/ppo_trainer"
- "verl/trainer/main_ppo.py"
- "verl/trainer/config/ppo_trainer.yaml"
@ -84,15 +116,15 @@ jobs:
- name: Running gsm8k e2e training tests with LoRA on ASCEND NPU
run: |
ray stop --force
bash tests/e2e/sft/run_sft.sh
bash tests/special_e2e/sft/run_sft.sh
rm -rf $HOME/ckpts
- name: Running gsm8k e2e training tests with GRPO on ASCEND NPU
run: |
ray stop --force
bash tests/npu/run_qwen2_5_05b_grpo.sh
bash tests/special_npu/run_qwen2_5_05b_grpo.sh
rm -rf $HOME/ckpts
- name: Running gsm8k e2e training tests with DAPO on ASCEND NPU
run: |
ray stop --force
bash tests/npu/run_qwen2_5_05b_dapo.sh
rm -rf $HOME/ckpts
bash tests/special_npu/run_qwen2_5_05b_dapo.sh
rm -rf $HOME/ckpts

View File

@ -1,3 +1,35 @@
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments
# # Accelerators for tests
# - By default, tests run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, which runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, which runs pytest on all test scripts without the `on_cpu.py` suffix
# - Since the CPU/GPU unit test workflows run all tests under `tests/` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to the workflows mentioned in 2.
name: e2e_dapo
on:
@ -27,7 +59,7 @@ on:
# Entrypoints
- ".github/workflows/e2e_dapo.yml"
- "examples/data_preprocess/gsm8k.py"
- "tests/e2e/run_dapo.sh"
- "tests/special_e2e/run_dapo.sh"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
@ -64,4 +96,4 @@ jobs:
- name: Running the E2E test with the DAPO algorithm
run: |
ray stop --force
bash tests/e2e/run_dapo.sh
bash tests/special_e2e/run_dapo.sh

View File

@ -1,3 +1,35 @@
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments
# # Accelerators for tests
# - By default, tests run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, which runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, which runs pytest on all test scripts without the `on_cpu.py` suffix
# - Since the CPU/GPU unit test workflows run all tests under `tests/` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to the workflows mentioned in 2.
name: e2e_eval_aime24
on:
@ -24,7 +56,7 @@ on:
- "!recipe/**"
# Entrypoints
- ".github/workflows/e2e_eval_aime24.yml"
- "tests/e2e/run_r1_distill_qwen_aime24_eval.sh"
- "tests/special_e2e/run_r1_distill_qwen_aime24_eval.sh"
- "verl/trainer/main_generation.py"
- "verl/trainer/config/generation.yaml"
@ -65,4 +97,4 @@ jobs:
- name: Running generation and evaluation in AIME 2024
run: |
ray stop --force
bash tests/e2e/run_r1_distill_qwen_aime24_eval.sh
bash tests/special_e2e/run_r1_distill_qwen_aime24_eval.sh

View File

@ -26,7 +26,7 @@ on:
- ".github/workflows/e2e_ppo_trainer.yml"
- "examples/data_preprocess/gsm8k.py"
- "examples/data_preprocess/geo3k.py"
- "tests/e2e/ppo_trainer"
- "tests/special_e2e/ppo_trainer"
- "verl/trainer/main_ppo.py"
- "verl/trainer/config/ppo_trainer.yaml"
@ -86,11 +86,11 @@ jobs:
- name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with validation and saving (FSDP_SIZE=8)
run: |
ray stop --force
VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-fsdp8" bash tests/e2e/ppo_trainer/run_function_reward.sh
VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-fsdp8" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm after resuming
run: |
ray stop --force
RESUME_MODE=auto VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-fsdp8" bash tests/e2e/ppo_trainer/run_function_reward.sh
RESUME_MODE=auto VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-fsdp8" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Test merging FSDP checkpoints (Qwen Actor)
run: |
exp_name="qwen2.5-0.5b-function-reward-minimal-fsdp8"
@ -98,7 +98,7 @@ jobs:
- name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with validation and saving (DDP_SIZE=2, FSDP_SIZE=4)
run: |
ray stop --force
VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True FSDP_SIZE=4 VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-ddp2-fsdp4" bash tests/e2e/ppo_trainer/run_function_reward.sh
VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True FSDP_SIZE=4 VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-ddp2-fsdp4" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Test merging DDP+FSDP checkpoints (Qwen Actor)
run: |
exp_name="qwen2.5-0.5b-function-reward-minimal-ddp2-fsdp4"
@ -106,32 +106,32 @@ jobs:
- name: Running GSM8K E2E without rmpad using function rm
run: |
ray stop --force
RM_PAD=False bash tests/e2e/ppo_trainer/run_function_reward.sh
RM_PAD=False bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm (GRPO)
run: |
ray stop --force
ADV_ESTIMATOR=grpo USE_KL=True bash tests/e2e/ppo_trainer/run_function_reward.sh
ADV_ESTIMATOR=grpo USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm (ReMax)
run: |
ray stop --force
ADV_ESTIMATOR=remax USE_KL=True bash tests/e2e/ppo_trainer/run_function_reward.sh
ADV_ESTIMATOR=remax USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using customized reward function
run: |
ray stop --force
CUSTOM_REWARD_FN=True bash tests/e2e/ppo_trainer/run_function_reward.sh
CUSTOM_REWARD_FN=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with in-reward kl and kl loss
run: |
ray stop --force
USE_KL=True bash tests/e2e/ppo_trainer/run_function_reward.sh
USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh
# LoRA tests
- name: Running GSM8K E2E training tests on 8 L20 GPUs with grpo lora using function rm with use_shm
run: |
ray stop --force
ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors bash tests/e2e/ppo_trainer/run_function_reward.sh
ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Running GSM8K E2E training tests on 8 L20 GPUs with grpo lora using function rm with use_shm and layered_summon
run: |
ray stop --force
ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors LAYERED_SUMMON=True TOTAL_TRAIN_STEPS=1 SAVE_FREQ=1 FSDP_SIZE=4 VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal" bash tests/e2e/ppo_trainer/run_function_reward.sh
ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors LAYERED_SUMMON=True TOTAL_TRAIN_STEPS=1 SAVE_FREQ=1 FSDP_SIZE=4 VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Test GRPO LoRA checkpoints merging function
run: |
export EXP_NAME="qwen2.5-0.5b-function-reward-minimal"
@ -141,40 +141,40 @@ jobs:
- name: Running GSM8K E2E training tests on 8 L20 GPUs with grpo lora using function rm with use_shm and layered_summon with fsdp2
run: |
ray stop --force
ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors LAYERED_SUMMON=True STRATEGY=fsdp2 bash tests/e2e/ppo_trainer/run_function_reward.sh
ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors LAYERED_SUMMON=True STRATEGY=fsdp2 bash tests/special_e2e/ppo_trainer/run_function_reward.sh
# Model RM
- name: Running GRPO GSM8K E2E training tests with FSDP on 8 L20 GPUs (DeepSeek)
run: |
ray stop --force
MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/e2e/ppo_trainer/run_function_reward.sh
MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Running GSM8K E2E with rmpad using model rm
run: |
ray stop --force
bash tests/e2e/ppo_trainer/run_model_reward.sh
bash tests/special_e2e/ppo_trainer/run_model_reward.sh
- name: Running GSM8K E2E without rmpad using model rm
run: |
ray stop --force
RM_PAD=False bash tests/e2e/ppo_trainer/run_model_reward.sh
RM_PAD=False bash tests/special_e2e/ppo_trainer/run_model_reward.sh
- name: Running GSM8K E2E with rmpad using model rm and ulysses sp=2
run: |
ray stop --force
SP_SIZE=2 bash tests/e2e/ppo_trainer/run_model_reward.sh
SP_SIZE=2 bash tests/special_e2e/ppo_trainer/run_model_reward.sh
- name: Running GSM8K E2E with rmpad using model rm and dynamic batch size
run: |
ray stop --force
SEQ_BALANCE=True bash tests/e2e/ppo_trainer/run_model_reward.sh
SEQ_BALANCE=True bash tests/special_e2e/ppo_trainer/run_model_reward.sh
- name: Running GSM8K E2E with rmpad using model rm with Liger Kernel enabled
run: |
ray stop --force
LIGER=True bash tests/e2e/ppo_trainer/run_model_reward.sh
LIGER=True bash tests/special_e2e/ppo_trainer/run_model_reward.sh
- name: Running GSM8K E2E with rmpad using model rm with Fused Kernel enabled
run: |
ray stop --force
FUSED_KERNELS=True bash tests/e2e/ppo_trainer/run_model_reward.sh
FUSED_KERNELS=True bash tests/special_e2e/ppo_trainer/run_model_reward.sh
- name: Running GSM8K E2E with rmpad using model rm with Fused Kernel enabled
run: |
ray stop --force
FUSED_KERNEL=True FUSED_KERNEL_BACKEND=triton bash tests/e2e/ppo_trainer/run_model_reward.sh
FUSED_KERNEL=True FUSED_KERNEL_BACKEND=triton bash tests/special_e2e/ppo_trainer/run_model_reward.sh
e2e_ppo_trainer_vllm_vlm:
runs-on: [L20x8]
@ -209,7 +209,7 @@ jobs:
MODEL_ID=Qwen/Qwen2-VL-2B-Instruct \
ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
SP_SIZE=2 \
bash tests/e2e/ppo_trainer/run_function_reward.sh
bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Running Geo3k VLM PPO E2E training tests on 8 L20 GPUs with rmpad using function rm
run: |
@ -219,7 +219,7 @@ jobs:
MODEL_ID=Qwen/Qwen2-VL-2B-Instruct \
ADV_ESTIMATOR=gae RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
SP_SIZE=2 \
bash tests/e2e/ppo_trainer/run_function_reward.sh
bash tests/special_e2e/ppo_trainer/run_function_reward.sh
e2e_ppo_trainer_sglang:
runs-on: [L20x8]
@ -248,7 +248,7 @@ jobs:
- name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm and save ckpt
run: |
ray stop --force
ENGINE=sglang bash tests/e2e/ppo_trainer/run_function_reward.sh
ENGINE=sglang bash tests/special_e2e/ppo_trainer/run_function_reward.sh
e2e_ppo_trainer_sglang_multiturn_with_tool:
runs-on: [L20x8]
@ -277,11 +277,11 @@ jobs:
- name: Running GSM8K with tool E2E training tests on 8 L20 GPUs with rmpad using function rm and save ckpt with sglang
run: |
ray stop --force
bash tests/e2e/run_gsm8k_fsdp_sgl_multiturn_w_tool.sh
bash tests/special_e2e/run_gsm8k_fsdp_sgl_multiturn_w_tool.sh
- name: Running GSM8K with tool E2E training tests with FSDP2
run: |
ray stop --force
FSDP_STRATEGY=fsdp2 bash tests/e2e/run_gsm8k_fsdp_sgl_multiturn_w_tool.sh
FSDP_STRATEGY=fsdp2 bash tests/special_e2e/run_gsm8k_fsdp_sgl_multiturn_w_tool.sh
e2e_ppo_trainer_sglang_vlm:
runs-on: [L20x8]
@ -317,7 +317,7 @@ jobs:
ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
ENGINE=sglang GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
bash tests/e2e/ppo_trainer/run_function_reward.sh
bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Running Geo3k VLM E2E with rmpad using torch fused kernel (Qwen2.5-VL)
run: |
ray stop --force
@ -327,7 +327,7 @@ jobs:
ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
ENGINE=sglang GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
bash tests/e2e/ppo_trainer/run_function_reward.sh
bash tests/special_e2e/ppo_trainer/run_function_reward.sh
- name: Running Geo3k VLM E2E with rmpad using triton fused kernel (Qwen2.5-VL)
run: |
ray stop --force
@ -338,4 +338,4 @@ jobs:
ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
ENGINE=sglang GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
bash tests/e2e/ppo_trainer/run_function_reward.sh
bash tests/special_e2e/ppo_trainer/run_function_reward.sh

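The steps above drive a single entrypoint script through environment variables (ADV_ESTIMATOR, USE_KL, RM_PAD, ENGINE, and so on) instead of maintaining one YAML file per variant. The real parsing happens in the bash scripts under `tests/special_e2e/ppo_trainer/`; the following is an illustrative Python sketch of the same env-var-with-default pattern, with defaults assumed rather than taken from the repo:

```python
# Illustrative only: mirrors how the CI steps above select a test variant
# via environment variables with defaults; real entrypoints are bash scripts.
import os

config = {
    "adv_estimator": os.getenv("ADV_ESTIMATOR", "gae"),  # e.g. grpo, remax
    "use_kl": os.getenv("USE_KL", "False") == "True",
    "rm_pad": os.getenv("RM_PAD", "True") == "True",
    "engine": os.getenv("ENGINE", "vllm"),               # e.g. sglang
}
print("launching run_function_reward variant with", config)
```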
View File

@ -1,3 +1,34 @@
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments
# # Accelerators for tests
# - By default, tests run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, which runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, which runs pytest on all test scripts without the `on_cpu.py` suffix
# - Since the CPU/GPU unit test workflows run all tests under `tests/` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to the workflows mentioned in 2.
name: e2e_ppo_trainer_megatron
# latest version: Megatron-LM core_r0.11.0 https://github.com/NVIDIA/Megatron-LM/tree/core_r0.11.0
@ -27,7 +58,7 @@ on:
- ".github/workflows/e2e_ppo_trainer_megatron.yml"
- "examples/data_preprocess/gsm8k.py"
- "examples/data_preprocess/geo3k.py"
- "tests/e2e/run_ppo_trainer_megatron.sh"
- "tests/special_e2e/run_ppo_trainer_megatron.sh"
- "verl/trainer/main_ppo.py"
- "verl/trainer/config/ppo_megatron_trainer.yaml"
@ -66,11 +97,11 @@ jobs:
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek)
run: |
ray stop --force
ALL_OFFLOAD=True SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/e2e/run_ppo_trainer_megatron.sh
ALL_OFFLOAD=True SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek)
run: |
ray stop --force
RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 bash tests/e2e/run_ppo_trainer_megatron.sh
RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: Test Megatron checkpoints merging function (DeepSeek Actor and Critic)
run: |
exp_name="deepseek-coder-1.3b-instruct-megatron-gsm8k-minimal"
@ -79,7 +110,7 @@ jobs:
- name: Running GRPO GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Deepseek)
run: |
ray stop --force
ADV_ESTIMATOR=grpo USE_DYNAMIC_BSZ=False MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/e2e/run_ppo_trainer_megatron.sh
ADV_ESTIMATOR=grpo USE_DYNAMIC_BSZ=False MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: clean up
run: |
rm -rf checkpoints
@ -108,11 +139,11 @@ jobs:
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen3) with validation and saving
run: |
ray stop --force
ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 MODEL_ID=Qwen/Qwen3-0.6B bash tests/e2e/run_ppo_trainer_megatron.sh
ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen3) testing learning rate scheduler
run: |
ray stop --force
LR_WARMUP_STEPS=1 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/e2e/run_ppo_trainer_megatron.sh
LR_WARMUP_STEPS=1 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: Test Megatron checkpoints merging function (Qwen3 Actor and Critic)
run: |
@ -147,11 +178,11 @@ jobs:
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with tie-embedding Megatron (Qwen) with train tp > infer tp
run: |
ray stop --force
VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=2 INFER_TP=1 MODEL_ID=Qwen/Qwen2.5-1.5B bash tests/e2e/run_ppo_trainer_megatron.sh
VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=2 INFER_TP=1 MODEL_ID=Qwen/Qwen2.5-1.5B bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) with train tp < infer tp
run: |
ray stop --force
VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=1 INFER_TP=2 MODEL_ID=Qwen/Qwen2.5-1.5B bash tests/e2e/run_ppo_trainer_megatron.sh
VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=1 INFER_TP=2 MODEL_ID=Qwen/Qwen2.5-1.5B bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: clean up
run: |
rm -rf checkpoints
@ -183,9 +214,9 @@ jobs:
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen)
run: |
ray stop --force
SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 SKIP_SAVE_HF_MODEL=1 bash tests/e2e/run_ppo_trainer_megatron.sh +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_first_pipeline_stage=8 +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=4 actor_rollout_ref.actor.megatron.use_dist_checkpointing=true actor_rollout_ref.actor.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron actor_rollout_ref.ref.megatron.use_dist_checkpointing=true actor_rollout_ref.ref.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron critic.megatron.use_dist_checkpointing=true critic.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron reward_model.megatron.use_dist_checkpointing=true reward_model.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron
SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 SKIP_SAVE_HF_MODEL=1 bash tests/special_e2e/run_ppo_trainer_megatron.sh +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_first_pipeline_stage=8 +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=4 actor_rollout_ref.actor.megatron.use_dist_checkpointing=true actor_rollout_ref.actor.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron actor_rollout_ref.ref.megatron.use_dist_checkpointing=true actor_rollout_ref.ref.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron critic.megatron.use_dist_checkpointing=true critic.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron reward_model.megatron.use_dist_checkpointing=true reward_model.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron
cp -r checkpoints checkpoints-dut
SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 bash tests/e2e/run_ppo_trainer_megatron.sh
SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: Test Megatron checkpoints merging function (Qwen Actor and Critic)
run: |
exp_name="qwen2.5-0.5b-megatron-gsm8k-minimal"
@ -219,7 +250,7 @@ jobs:
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek)
run: |
ray stop --force
SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct COMMON_PP=2 COMMON_VPP=null bash tests/e2e/run_ppo_trainer_megatron.sh +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=true +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=true
SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct COMMON_PP=2 COMMON_VPP=null bash tests/special_e2e/run_ppo_trainer_megatron.sh +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=true +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=true
- name: Test Megatron checkpoints merging function (DeepSeek Actor and Critic)
run: |
exp_name="deepseek-coder-1.3b-instruct-megatron-gsm8k-minimal"
@ -253,12 +284,12 @@ jobs:
- name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek)
run: |
ray stop --force
ADV_ESTIMATOR=grpo USE_DUMMY_MODEL=True DUMMY_MODEL_CONFIG_PATH=tests/e2e/ppo_trainer/expert_parallel/qwen2moe_minimal.json \
ADV_ESTIMATOR=grpo USE_DUMMY_MODEL=True DUMMY_MODEL_CONFIG_PATH=tests/special_e2e/ppo_trainer/expert_parallel/qwen2moe_minimal.json \
PPO_MAX_TOKEN_LEN=512 FWD_MAX_TOKEN_LEN=512 \
MAX_PROMPT_LENGTH=256 MAX_RESPONSE_LENGTH=256 \
MODEL_ID=Qwen/Qwen1.5-MoE-A2.7B-Chat \
COMMON_PP=2 COMMON_VPP=null COMMON_CP=1 COMMON_TP=4 COMMON_EP=4 COMMON_ETP=1 INFER_TP=8 \
USE_DIST_CKPT=True ALL_OFFLOAD=True SKIP_SAVE_HF_MODEL=1 bash tests/e2e/run_ppo_trainer_megatron.sh
USE_DIST_CKPT=True ALL_OFFLOAD=True SKIP_SAVE_HF_MODEL=1 bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: clean up
run: |
rm -rf checkpoints
@ -290,7 +321,7 @@ jobs:
- name: Running Geo3k E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen)
run: |
ray stop --force
TRAIN_FILES=${HOME}/data/geo3k/train.parquet VAL_FILES=${HOME}/data/geo3k/test.parquet MAX_PROMPT_LENGTH=1024 MAX_RESPONSE_LENGTH=2048 MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct ADV_ESTIMATOR=grpo USE_DYNAMIC_BSZ=False SKIP_SAVE_HF_MODEL=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 COMMON_TP=2 USE_DIST_CKPT=true DIST_CKPT_PATH=checkpoints/verl-test/qwen2.5-vl-3b-megatron bash tests/e2e/run_ppo_trainer_megatron.sh
TRAIN_FILES=${HOME}/data/geo3k/train.parquet VAL_FILES=${HOME}/data/geo3k/test.parquet MAX_PROMPT_LENGTH=1024 MAX_RESPONSE_LENGTH=2048 MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct ADV_ESTIMATOR=grpo USE_DYNAMIC_BSZ=False SKIP_SAVE_HF_MODEL=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 COMMON_TP=2 USE_DIST_CKPT=true DIST_CKPT_PATH=checkpoints/verl-test/qwen2.5-vl-3b-megatron bash tests/special_e2e/run_ppo_trainer_megatron.sh
- name: clean up
run: |
rm -rf checkpoints

View File

@ -1,3 +1,34 @@
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments
# # Accelerators for tests
# - By default, tests run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, which runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, which runs pytest on all test scripts without the `on_cpu.py` suffix
# - Since the CPU/GPU unit test workflows run all tests under `tests/` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to the workflows mentioned in 2.
name: e2e_sft
on:
@ -25,7 +56,7 @@ on:
# Entrypoints
- ".github/workflows/e2e_sft.yml"
- "examples/data_preprocess/gsm8k.py"
- "tests/e2e/sft"
- "tests/special_e2e/sft"
- "verl/trainer/fsdp_sft_trainer.py"
- "verl/trainer/config/sft_trainer.yaml"
@ -66,25 +97,25 @@ jobs:
- name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm
run: |
ray stop --force
bash tests/e2e/sft/run_sft.sh
bash tests/special_e2e/sft/run_sft.sh
- name: Running GSM8K E2E training tests on 8 L20 GPUs w/o rmpad using function rm
run: |
ray stop --force
RM_PAD=False bash tests/e2e/sft/run_sft.sh
RM_PAD=False bash tests/special_e2e/sft/run_sft.sh
- name: Running GSM8K E2E training tests on 8 L20 GPUs with sequence parallelism
run: |
ray stop --force
SP_SIZE=2 bash tests/e2e/sft/run_sft.sh
SP_SIZE=2 bash tests/special_e2e/sft/run_sft.sh
- name: Check loss difference between sequence parallel vs. default implementation
run: |
ray stop --force
ENTRYPOINT="tests/e2e/sft/test_sp_loss_match.py" SP_SIZE=2 bash tests/e2e/sft/run_sft.sh
ENTRYPOINT="tests/special_e2e/sft/test_sp_loss_match.py" SP_SIZE=2 bash tests/special_e2e/sft/run_sft.sh
- name: Running GSM8K E2E training tests on 8 L20 GPUs with sequence parallelism and liger
run: |
ray stop --force
SP_SIZE=2 LIGER=True bash tests/e2e/sft/run_sft.sh
SP_SIZE=2 LIGER=True bash tests/special_e2e/sft/run_sft.sh
- name: Running GSM8K E2E training tests with LoRA
run: |
ray stop --force
LORA_RANK=32 bash tests/e2e/sft/run_sft.sh
LORA_RANK=32 bash tests/special_e2e/sft/run_sft.sh
# TODO: multiturn

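The sequence-parallel check above swaps in a dedicated entrypoint, `tests/special_e2e/sft/test_sp_loss_match.py`, instead of the default trainer. As a hedged sketch of the kind of assertion such a test could make (the helper below is hypothetical and not taken from the repo), assuming both runs expose per-step losses:

```python
# Hypothetical helper: assert per-step losses from a sequence-parallel run
# match the default implementation within a numerical tolerance.
import torch

def assert_losses_match(sp_losses, base_losses, rtol=1e-2, atol=1e-3):
    sp, base = torch.tensor(sp_losses), torch.tensor(base_losses)
    assert sp.shape == base.shape, "runs logged a different number of steps"
    max_diff = (sp - base).abs().max().item()
    assert torch.allclose(sp, base, rtol=rtol, atol=atol), f"max loss diff {max_diff:.4g}"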
View File

@ -27,13 +27,18 @@ on:
# Entrypoints
- ".github/workflows/e2e_spin.yml"
- "examples/data_preprocess/gsm8k.py"
- "tests/e2e/run_spin.sh"
- "tests/special_e2e/run_spin.sh"
- "!examples"
# Declare permissions just read content.
permissions:
contents: read
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
jobs:
e2e_spin:
runs-on: [L20x8]
@ -63,4 +68,4 @@ jobs:
- name: Running the E2E test with the spin algorithm
run: |
ray stop --force
bash tests/e2e/run_spin.sh
bash tests/special_e2e/run_spin.sh

View File

@ -27,12 +27,17 @@ on:
# Entrypoints
- ".github/workflows/e2e_sppo.yml"
- "examples/data_preprocess/gsm8k.py"
- "tests/e2e/run_sppo.sh"
- "tests/special_e2e/run_sppo.sh"
# Declare permissions just read content.
permissions:
contents: read
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
jobs:
e2e_sppo:
runs-on: [L20x8]
@ -62,4 +67,4 @@ jobs:
- name: Running the E2E test with the SPPO algorithm
run: |
ray stop --force
bash tests/e2e/run_sppo.sh
bash tests/special_e2e/run_sppo.sh

.github/workflows/gpu_unit_tests.yml (new file, 97 lines)
View File

@ -0,0 +1,97 @@
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments
# # Accelerators for tests
# - By default, tests run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, which runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, which runs pytest on all test scripts without the `on_cpu.py` suffix
# - Since the CPU/GPU unit test workflows run all tests under `tests/` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to the workflows mentioned in 2.
name: GPU unit tests
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
- v0.4.x
paths:
- "**/*.py"
- .github/workflows/gpu_unit_tests.yml
pull_request:
branches:
- main
- v0.4.x
paths:
- "**/*.py"
# Other entrypoints
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# Recipes
- "!recipe/**"
# Entrypoints
- .github/workflows/gpu_unit_tests.yml
- "tests/*"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
jobs:
gpu_unit_tests:
runs-on: [L20x8]
timeout-minutes: 40 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1"
HF_HUB_ENABLE_HF_TRANSFER: 1
container:
image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
options: --gpus all --shm-size=10g
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install hf_transfer
pip3 install --no-deps -e .[test]
pip3 install --upgrade "ray>=2.40.0"
pip3 install cupy-cuda12x
- name: Run all GPU unit tests
run: |
pytest -s -x --ignore-glob="*test_linear_cross_entropy_tp.py" --ignore-glob='*on_cpu.py' --ignore-glob="*test_vllm*" --ignore-glob="*_sglang*" --ignore-glob="*_hf_rollout*" --ignore-glob="tests/models/" --ignore-glob='tests/special*' tests/
- name: Testing LinearCrossEntropyTP Correctness, Computation Time and Memory Consumption
run: |
LOW_MEMORY=True torchrun --standalone --nnodes=1 --nproc-per-node=8 tests/utils/test_linear_cross_entropy_tp.py

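Because `gpu_unit_tests.yml` sweeps everything under `tests/` that is not excluded by the `--ignore-glob` patterns above, a new GPU unit test only needs to follow the default `test_*.py` naming (and avoid the `on_cpu.py` suffix) to be picked up. A minimal hypothetical example, with the file path invented for illustration:

```python
# tests/utils/test_example_gpu.py (hypothetical): collected by gpu_unit_tests.yml
# because it matches test_*.py and does not end with on_cpu.py.
import pytest
import torch

@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires a GPU")
def test_identity_matmul_on_gpu():
    a = torch.randn(8, 8, device="cuda")
    eye = torch.eye(8, device="cuda")
    assert torch.allclose(a @ eye, a)
```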
View File

@ -1,65 +0,0 @@
name: kernels
# latest version: Megatron-LM core_r0.11.0 https://github.com/NVIDIA/Megatron-LM/tree/core_r0.11.0
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
- v0.2.x
paths:
- "**/*.py"
- .github/workflows/kernels.yml
pull_request:
branches:
- main
- v0.2.x
paths:
- "**/*.py"
# Other entrypoints
- "!examples/**"
- "!tests/**"
- "!verl/trainer/main_*.py"
- "!verl/trainer/fsdp_sft_trainer.py"
# Recipes
- "!recipe/**"
# Entrypoints
- .github/workflows/kernels.yml
- "tests/kernels/*"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
jobs:
kernels:
runs-on: [L20x8]
timeout-minutes: 40 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1"
HF_HUB_ENABLE_HF_TRANSFER: 1
container:
image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
options: --gpus all --shm-size=10g
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install hf_transfer
pip3 install --no-deps -e .[test]
- name: Testing LinearCrossEntropy Correction, Computation Time and Memory Consumption
run: |
python3 tests/kernels/test_linear_cross_entropy.py
- name: Testing LinearCrossEntropyTP Correction, Computation Time and Memory Consumption
run: |
LOW_MEMORY=True torchrun --standalone --nnodes=1 --nproc-per-node=8 tests/kernels/test_linear_cross_entropy_tp.py

View File

@ -1,3 +1,35 @@
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments
# # Accelerators for tests
# - By default, tests run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, which runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, which runs pytest on all test scripts without the `on_cpu.py` suffix
# - Since the CPU/GPU unit test workflows run all tests under `tests/` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to the workflows mentioned in 2.
# name: Check PR Title
name: model_rmpad
on:
@ -15,14 +47,19 @@ on:
- "verl/**/*.py"
# Entrypoints
- ".github/workflows/model.yml"
- "tests/utils/gpu_tests/checkpoint/test_fsdp_ckpt.py"
- "tests/models/test_transformers_ulysses.py"
- "tests/distributed/run_all.sh"
- "tests/special_distributed/test_fsdp_ckpt.py"
- "tests/models/**"
- "tests/special_distributed/run_all.sh"
# Declare permissions just read content.
permissions:
contents: read
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
jobs:
model_rmpad:
runs-on: [L20x8]
@ -53,7 +90,7 @@ jobs:
pytest -s tests/models/test_transformer.py
- name: Running FSDP rmpad model tests on 8 L20 GPUs + latest flash_attn
run: |
STRATEGY=fsdp torchrun --nproc_per_node=8 tests/utils/gpu_tests/checkpoint/test_fsdp_ckpt.py
STRATEGY=fsdp torchrun --nproc_per_node=8 tests/special_distributed/test_fsdp_ckpt.py
- name: Running transformers ulysses tests on 8 L20 GPUs + latest transformers
run: |
torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
@ -79,7 +116,7 @@ jobs:
torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
- name: Run distributed test
run: |
bash tests/distributed/run_all.sh
bash tests/special_distributed/run_all.sh
# TODO: Move this back to model_rmpad once FSDP2 is stable.
# NOTE: List as an independent job to make rerun easier.
@ -106,4 +143,4 @@ jobs:
- name: Running FSDP2 rmpad model tests on 8 L20 GPUs + latest flash_attn
run: |
pip3 install --upgrade flash_attn --no-build-isolation
STRATEGY=fsdp2 torchrun --nproc_per_node=8 tests/utils/gpu_tests/checkpoint/test_fsdp_ckpt.py
STRATEGY=fsdp2 torchrun --nproc_per_node=8 tests/special_distributed/test_fsdp_ckpt.py

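Tests under `tests/special_distributed` are launched with `torchrun` rather than collected by pytest, as in the `test_fsdp_ckpt.py` steps above. A minimal self-contained sketch of such a test (illustrative, not from the repo; assumes NCCL and the rank/world-size environment variables `torchrun` provides):

```python
# Minimal torchrun-launched multi-GPU check. Example launch:
#   torchrun --standalone --nnodes=1 --nproc_per_node=8 this_file.py
import torch
import torch.distributed as dist

def main() -> None:
    dist.init_process_group(backend="nccl")  # rank/world size come from torchrun env vars
    rank, world = dist.get_rank(), dist.get_world_size()
    torch.cuda.set_device(rank % torch.cuda.device_count())
    t = torch.full((1,), float(rank), device="cuda")
    dist.all_reduce(t)  # sums ranks: expect 0 + 1 + ... + (world - 1)
    assert t.item() == world * (world - 1) / 2
    dist.destroy_process_group()

if __name__ == "__main__":
    main()
```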
View File

@ -1,48 +0,0 @@
name: ray_cpu
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
- v0.*
paths:
- "verl/single_controller/*.py"
- .github/workflows/ray_cpu_test.yml
- "!recipe/**/*.py"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
jobs:
ray_cpu:
runs-on: ubuntu-latest
timeout-minutes: 10 # Increase this timeout value as needed
strategy:
matrix:
python-version: ["3.10"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install the current repository
run: |
pip install -e .[test]
pip install --upgrade "ray>=2.40.0"
- name: Running ray tests that can be tested on CPU machines
run: |
cd tests/ray_cpu
pytest -s -x --ignore=test_check_worker_alive.py .

View File

@ -1,51 +0,0 @@
name: sandbox
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
- v0.*
paths:
- "**/*.py"
- .github/workflows/sandbox.yml
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions just read content.
permissions:
contents: read
jobs:
sandbox:
runs-on: [L20x8]
timeout-minutes: 10 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container:
image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
options: --gpus all --shm-size=10g
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -e .[test,prime]
pip3 install vllm==0.5.4
- name: Running sandbox tests on 8 L20 GPUs
run: |
cd tests/utils/reward_score
pytest -s -x .

View File

@ -1,3 +1,35 @@
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments
# # Accelerators for tests
# - By default, tests run with GPUs available, except for the ones under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, which runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, which runs pytest on all test scripts without the `on_cpu.py` suffix
# - Since the CPU/GPU unit test workflows run all tests under `tests/` by default, please make sure tests are manually excluded from them when
# - a new workflow yaml is added to `.github/workflows`
# - new tests are added to the workflows mentioned in 2.
# name: Check PR Title
name: sanity
on:
@ -14,6 +46,7 @@ on:
paths:
- "**/*.py"
- .github/workflows/sanity.yml
- "tests/special_sanity/**"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
@ -42,10 +75,10 @@ jobs:
pip install -e .[test]
- name: Run sanity test
run: |
pytest -s -x tests/sanity
pytest -s -x tests/special_sanity
- name: Run license test
run: |
python3 tests/sanity/check_license.py --directory .
python3 tests/special_sanity/check_license.py --directory .
- name: Assert naming convention
run: |
if grep -rIn --exclude-dir=.git --exclude-dir=.github --exclude-dir=venv --exclude-dir=__pycache__ 'veRL' .; then

View File

@ -1,3 +1,34 @@
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments
# Accelerators for tests
# - By default, tests are run with GPUs available, except for those under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#    - `cpu_unit_tests.yml`: runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
#    - `gpu_unit_tests.yml`: runs pytest on all test scripts without the `on_cpu.py` suffix
#    - Since the CPU/GPU unit test workflows by default run all tests under `tests`, please make sure tests are manually excluded from them when:
#      - a new workflow yaml is added to `.github/workflows`
#      - new tests are added to the workflows mentioned in 2.
name: sgl
on:
@ -85,4 +116,5 @@ jobs:
- name: Test the latest SGLang Rollout async with search tool
run: |
cd tests/workers/rollout
pytest -s test_sglang_async_rollout_search_tools.py
pytest -s test_sglang_async_rollout_search_tools.py
# Note(haibin.lin): for any new test, please update gpu_unit_tests.yml to avoid repeated tests

View File

@ -1,52 +0,0 @@
name: single_controller_unit_tests
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
- v0.*
paths:
- "verl/single_controller/*.py"
- .github/workflows/ray_gpu_test.yml
- "!recipe/**/*.py"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions: read contents only.
permissions:
contents: read
jobs:
single_controller_unit_tests:
runs-on: [L20x8]
timeout-minutes: 10 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container:
image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
options: --gpus all --shm-size=10g
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip install -e .[test]
pip install --upgrade "ray>=2.40.0"
- name: Running ray tests that need 8 GPUs
run: |
cd tests/ray_gpu
pytest -s -x --ignore=test_rvdz.py .

View File

@ -1,47 +0,0 @@
name: utils_gpu_test
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
- v0.*
pull_request:
branches:
- main
- v0.*
paths:
- "**/*.py"
- .github/workflows/utils_gpu_test.yml
- "!recipe/**/*.py"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Declare permissions: read contents only.
permissions:
contents: read
jobs:
utils_gpu_test:
runs-on: [L20x8]
timeout-minutes: 20 # Increase this timeout value as needed
container:
image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
options: --gpus all --shm-size=10g
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install the current repository
run: |
pip install -e .[test]
- name: Running utils gpu tests
run: |
cd tests/utils/gpu_tests
pytest -s -x --ignore=dataset/ --ignore=checkpoint/ .

View File

@ -1,3 +1,35 @@
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with the `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of tests designed to run in dedicated environments
# Accelerators for tests
# - By default, tests are run with GPUs available, except for those under `special_npu` and any test script whose name ends with `on_cpu.py`.
# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#    - `cpu_unit_tests.yml`: runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
#    - `gpu_unit_tests.yml`: runs pytest on all test scripts without the `on_cpu.py` suffix
#    - Since the CPU/GPU unit test workflows by default run all tests under `tests`, please make sure tests are manually excluded from them when:
#      - a new workflow yaml is added to `.github/workflows`
#      - new tests are added to the workflows mentioned in 2.
name: vllm
on:
@ -28,8 +60,8 @@ on:
- "!**/*sglang*"
# Entrypoints
- ".github/workflows/vllm.yml"
- "tests/e2e/generation"
- "tests/rollout"
- "tests/special_e2e/generation"
- "tests/workers/rollout"
- "verl/trainer/main_generation.py"
- "verl/trainer/config/generation.yaml"
@ -82,13 +114,13 @@ jobs:
torchrun --standalone --nnodes=1 --nproc_per_node=4 $(which pytest) -s test_vllm_spmd.py
- name: Run Qwen 0.5B generation test
run: |
cd tests/e2e/generation
cd tests/special_e2e/generation
export OUTPUT_PATH="${HOME}/data/gen/qwen_05_gen_test.parquet"
MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct NGPUS_PER_NODE=4 GEN_TP=2 bash ./run_gen_qwen05.sh
rm -rf "${OUTPUT_PATH}"
- name: Run Qwen 0.5B generation test when world_size == 1
run: |
cd tests/e2e/generation
cd tests/special_e2e/generation
export OUTPUT_PATH="${HOME}/data/gen/qwen_05_gen_test.parquet"
MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct NGPUS_PER_NODE=1 GEN_TP=1 bash ./run_gen_qwen05.sh
rm -rf "${OUTPUT_PATH}"
@ -96,3 +128,4 @@ jobs:
run: |
pip3 install --upgrade vllm==0.8.3 tensordict==0.7.2
pytest -svvv tests/workers/rollout/test_vllm_chat_scheduler.py
# Note(haibin.lin): for any new test, please update gpu_unit_tests.yml to avoid repeated tests

View File

@ -208,7 +208,7 @@ Unit Tests
e2e Tests
----------
we provide e2e test scripts in `tests/e2e` folder, named `tests/e2e/run_gsm8k_fsdp_sgl_multiturn_sf_tool.sh`
We provide e2e test scripts in the `tests/special_e2e` folder, e.g. `tests/special_e2e/run_gsm8k_fsdp_sgl_multiturn_sf_tool.sh`.
By setting 'trainer.rollout_data_dir', you can dump the rollout data to local disk. Here is a sample taken from the rollout data:

View File

@ -1,7 +1,7 @@
set -x
data_path=$HOME/data/rlhf/gsm8k/test.parquet
save_path=$HOME/data/rlhf/math/deepseek_v2_lite_gen_test.parquet
data_path=$HOME/data/gsm8k/test.parquet
save_path=$HOME/data/gsm8k/deepseek_v2_lite_gen_test.parquet
model_path=deepseek-ai/deepseek-llm-7b-chat
python3 -m verl.trainer.main_generation \

View File

@ -45,7 +45,7 @@ install_requires = [
TEST_REQUIRES = ["pytest", "pre-commit", "py-spy"]
PRIME_REQUIRES = ["pyext"]
GEO_REQUIRES = ["mathruler"]
GEO_REQUIRES = ["mathruler", "torchvision", "qwen_vl_utils"]
GPU_REQUIRES = ["liger-kernel", "flash-attn"]
MATH_REQUIRES = ["math-verify"] # Add math-verify as an optional dependency
VLLM_REQUIRES = ["tensordict<=0.6.2", "vllm<=0.8.5"]

tests/README.md Normal file
View File

@ -0,0 +1,30 @@
# Tests layout
Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
- `tests/trainer` for testing functionality related to `verl/trainer`
- `tests/models` for testing functionality related to `verl/models`
- ...
There are a few folders with the `special_` prefix, created for special purposes:
- `special_distributed`: unit tests that must run with multiple GPUs
- `special_e2e`: end-to-end tests with training/generation scripts
- `special_npu`: tests for NPUs
- `special_sanity`: a suite of quick sanity tests
- `special_standalone`: a set of tests designed to run in dedicated environments
## Accelerators for tests
- By default, tests are run with GPUs available, except for those under `special_npu` and any test script whose name ends with `on_cpu.py`.
- Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# Workflow layout
All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
3. End-to-end tests: `e2e_*.yml`
4. Unit tests
   - `cpu_unit_tests.yml`: runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py`
   - `gpu_unit_tests.yml`: runs pytest on all test scripts without the `on_cpu.py` suffix (see the selection sketch below)
   - Since the CPU/GPU unit test workflows by default run all tests under `tests`, please make sure tests are manually excluded from them when:
     - a new workflow yaml is added to `.github/workflows`
     - new tests are added to the workflows mentioned in 2.
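A minimal sketch of how this naming convention can drive test selection (illustrative only; the workflow yaml files define the authoritative commands, and this pytest invocation is an assumption):

```python
# Illustrative: split tests/ into CPU and GPU suites by file name suffix.
import pathlib
import subprocess

all_tests = sorted(pathlib.Path("tests").rglob("test_*.py"))
cpu_tests = [str(t) for t in all_tests if t.name.endswith("on_cpu.py")]
gpu_tests = [str(t) for t in all_tests if not t.name.endswith("on_cpu.py")]

# cpu_unit_tests.yml runs something equivalent to:
subprocess.run(["pytest", "-s", "-x", *cpu_tests], check=True)
```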

View File

@ -1,46 +0,0 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from torch.utils import data
from tests.e2e.envs.digit_completion import DigitCompletion
if __name__ == "__main__":
simple_task = DigitCompletion(max_number=9, max_diff=9, max_num_in_response=9)
all_prompts = simple_task.get_all_prompts()
# 21 * 6 * 4
train_data, test_data = data.random_split(all_prompts, lengths=[0.8, 0.2])
train_data = list(train_data)
test_data = list(test_data)
train_data = [[{"role": "user", "content": str(item)}] for item in train_data]
test_data = [[{"role": "user", "content": str(item)}] for item in test_data]
print(f"Size of train: {len(train_data)}, size of test: {len(test_data)}")
train_data = {"prompt": train_data}
test_data = {"prompt": test_data}
model_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)))
import pandas as pd
train_data_frame = pd.DataFrame(train_data)
test_data_frame = pd.DataFrame(test_data)
train_data_frame.to_parquet(os.path.join(model_folder, "train.parquet"))
test_data_frame.to_parquet(os.path.join(model_folder, "test.parquet"))

View File

@ -1,29 +0,0 @@
{
"architectures": [
"LlamaForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": null,
"eos_token_id": 1,
"hidden_act": "silu",
"hidden_size": 128,
"initializer_range": 0.02,
"intermediate_size": 344,
"max_position_embeddings": 2048,
"mlp_bias": false,
"model_type": "llama",
"num_attention_heads": 4,
"num_hidden_layers": 4,
"num_key_value_heads": 4,
"pad_token_id": 2,
"pretraining_tp": 1,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"rope_theta": 10000.0,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.43.3",
"use_cache": true,
"vocab_size": 16
}

View File

@ -1,58 +0,0 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Create a random model and tokenizer for PPO training
"""
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaConfig
from tests.e2e.envs.digit_completion import CharTokenizer
tokenizer = CharTokenizer(
characters=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ",", ":"],
model_max_length=2048,
chat_template="{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set role = message['role'] %}{{ message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ sep_token }}{% endif %}", # noqa: E501
)
config = LlamaConfig(
vocab_size=(tokenizer.vocab_size + 16 - 1) // 16 * 16,
hidden_size=128,
intermediate_size=344,
num_hidden_layers=4,
num_attention_heads=4,
num_key_value_heads=4,
pad_token_id=tokenizer.pad_token_id,
bos_token_id=tokenizer.bos_token_id,
eos_token_id=tokenizer.eos_token_id,
)
model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)
model_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)))
os.makedirs(model_folder, exist_ok=True)
model.save_pretrained(model_folder)
tokenizer_folder = model_folder
tokenizer.save_pretrained(tokenizer_folder)
load_tokenizer = AutoTokenizer.from_pretrained(tokenizer_folder)
chat = [{"role": "user", "content": "1,0:2,3"}]
load_tokenizer.padding_side = "left"
print(load_tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=True, max_length=10, padding="max_length"))

View File

@ -1,6 +0,0 @@
{
"_from_model_config": true,
"eos_token_id": 1,
"pad_token_id": 2,
"transformers_version": "4.43.3"
}

View File

@ -1,18 +0,0 @@
{
"char_ords": [
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
44,
58
],
"model_max_length": 2048,
"chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set role = message['role'] %}{{ message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ sep_token }}{% endif %}"
}

View File

@ -1,30 +0,0 @@
# Digit completion
This is an example of solving a digit completion problem. The problem is defined as below:
The prompt is a sequence of numbers with a fixed difference. The agent's goal is to complete the next N numbers.
If the max number is exceeded, the sequence wraps around modulo (max_number + 1).
For example,
- prompt = [1, 2, 3]
- N = 5
- max_number = 6
The response should be [4, 5, 6, 7 mod 7, 8 mod 7] = [4, 5, 6, 0, 1].
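The wrap-around rule can be written compactly. The sketch below is a hypothetical helper (not the `task.py` implementation), assuming wrap-around modulo `max_number + 1`, which matches the stated response:

```python
def complete(prompt, n, max_number):
    # Continue the fixed-difference sequence, wrapping modulo (max_number + 1).
    diff = prompt[1] - prompt[0]
    return [(prompt[-1] + (i + 1) * diff) % (max_number + 1) for i in range(n)]

assert complete([1, 2, 3], n=5, max_number=6) == [4, 5, 6, 0, 1]
```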
# Environment definition
The core logic of the task is defined in tests/e2e/envs/digit_completion/task.py.
It is highly recommended to take a look at it for a better understanding.
# Run experiments
An example of running the task is provided in `tests/e2e/run_ray_trainer.sh`.
```bash
bash tests/e2e/run_ray_trainer.sh
```

View File

@ -1,161 +0,0 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Using FSDPTrainer
"""
import os
import hydra
import ray
import torch
from transformers import AutoTokenizer
from verl import DataProto
from verl.trainer.ppo.ray_trainer import RayPPOTrainer
from verl.utils.fs import copy_to_local
def make_reward_function(tokenizer, num_examine):
def arithmetic_sequence_reward_function(data: DataProto, return_dict: bool = False):
from tests.e2e.envs.digit_completion.task import compute_reward
reward_tensor = torch.zeros_like(data.batch["responses"], dtype=torch.float32)
for i in range(data.batch.batch_size[0]):
data_item = data[i] # DataProtoItem
prompt_ids = data_item.batch["prompts"]
prompt_length = prompt_ids.shape[-1]
# extract raw prompt
valid_prompt_length = data_item.batch["attention_mask"][:prompt_length].sum()
valid_prompt_ids = prompt_ids[-valid_prompt_length:]
# extract response
response_ids = data_item.batch["responses"]
response_length = response_ids.shape[-1]
response_mask = data.batch["attention_mask"][i][-response_length:]
valid_response_length = data_item.batch["attention_mask"][prompt_length:].sum()
valid_response_ids = response_ids[:valid_response_length]
# decode
prompt = tokenizer.decode(valid_prompt_ids)
response = tokenizer.decode(valid_response_ids)
# remove bos and eos
prompt = prompt.replace(tokenizer.sep_token, "")
response = response.replace(tokenizer.eos_token, "")
if i < num_examine:
print(prompt, response)
reward_output = compute_reward(prompt, response)
dense_reward = reward_output[0].tolist()
ground_truth_response = reward_output[1]["ground_truth_response"]
last_reward = dense_reward[-1] if len(dense_reward) > 0 else 1 if len(ground_truth_response) == 0 else 0
# pad to response_length
for _ in range(reward_tensor.shape[-1] - len(dense_reward)):
dense_reward.append(last_reward)
dense_reward = torch.as_tensor(dense_reward, dtype=torch.float32, device=reward_tensor.device)
reward_tensor[i] = dense_reward * response_mask
if return_dict:
return {"reward_tensor": reward_tensor}
else:
return reward_tensor
return arithmetic_sequence_reward_function
@hydra.main(config_path="../../../../verl/trainer/config", config_name="ppo_trainer", version_base=None)
def main(config):
ray.init(
runtime_env={
"env_vars": {
"MEGATRON_USE_CUDA_TIMER": "0",
"MEGATRON_START_PROCESS_TIMER": "False",
"TOKENIZERS_PARALLELISM": "true",
"NCCL_DEBUG": "WARN",
}
},
num_cpus=config.ray_init.num_cpus,
)
# print initial config
from pprint import pprint
from omegaconf import OmegaConf
pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values
# print the config again after batch size normalization
print("Config after normalizing batch_size")
pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values
# download the checkpoint from hdfs
local_path = copy_to_local(config.actor_rollout_ref.model.path)
local_path = os.path.expanduser(local_path)
# instantiate tokenizer
from transformers import LlamaConfig
from tests.e2e.envs.digit_completion import CharTokenizer
AutoTokenizer.register(LlamaConfig, CharTokenizer, exist_ok=True)
tokenizer = AutoTokenizer.from_pretrained(local_path)
print(f"Tokenizer vocab_size: {tokenizer.vocab_size}")
# define worker classes
from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
from verl.workers.fsdp_workers import ActorRolloutRefWorker, CriticWorker
role_worker_mapping = {
Role.ActorRollout: ray.remote(ActorRolloutRefWorker),
Role.Critic: ray.remote(CriticWorker),
}
global_pool_id = "global_pool"
resource_pool_spec = {
global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
}
mapping = {
Role.ActorRollout: global_pool_id,
Role.Critic: global_pool_id,
}
# use reward model
if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
mapping[Role.RefPolicy] = global_pool_id
reward_fn = make_reward_function(tokenizer=tokenizer, num_examine=1)
resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
trainer = RayPPOTrainer(
config=config,
tokenizer=tokenizer,
role_worker_mapping=role_worker_mapping,
resource_pool_manager=resource_pool_manager,
reward_fn=reward_fn,
val_reward_fn=reward_fn,
)
trainer.init_workers()
trainer.fit()
if __name__ == "__main__":
main()

View File

@ -1,40 +0,0 @@
#!/usr/bin/env bash
set -e -x
OUTPUT_FILE="/tmp/output_ray_trainer.txt"
export PATH=$PATH:~/.local/bin
rm -rf $OUTPUT_FILE
python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \
algorithm.adv_estimator=gae \
data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \
data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \
data.train_batch_size=800 \
data.max_prompt_length=16 \
data.max_response_length=32 \
data.return_raw_input_ids=True \
actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \
actor_rollout_ref.model.external_lib=tests.e2e.envs.digit_completion \
actor_rollout_ref.model.use_fused_kernels=True \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=128 \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.optim.lr=1e-4 \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=200 \
actor_rollout_ref.rollout.name=hf \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
critic.ppo_micro_batch_size_per_gpu=128 \
critic.model.path=tests/e2e/arithmetic_sequence/model \
critic.optim.lr=1e-3 \
algorithm.use_kl_in_reward=False \
trainer.total_epochs=200 \
trainer.experiment_name=arithmetic_sequences \
trainer.logger=['console'] \
trainer.n_gpus_per_node=1 \
trainer.test_freq=1 \
trainer.save_freq=110 | tee $OUTPUT_FILE;
python3 tests/e2e/check_results.py --output_file=$OUTPUT_FILE
rm -rf $OUTPUT_FILE

View File

@ -1,41 +0,0 @@
#!/usr/bin/env bash
set -e -x
OUTPUT_FILE="/tmp/output_ray_trainer.txt"
export PATH=$PATH:~/.local/bin
rm -rf $OUTPUT_FILE
python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \
algorithm.adv_estimator=gae \
data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \
data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \
data.train_batch_size=800 \
data.val_batch_size=200 \
data.max_prompt_length=16 \
data.max_response_length=32 \
data.return_raw_input_ids=True \
actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \
actor_rollout_ref.model.external_lib=tests.e2e.envs.digit_completion \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=128 \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.optim.lr=1e-4 \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=200 \
actor_rollout_ref.rollout.name=hf \
actor_rollout_ref.rollout.use_fire_sampling=True \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
critic.ppo_micro_batch_size_per_gpu=128 \
critic.model.path=tests/e2e/arithmetic_sequence/model \
critic.optim.lr=1e-3 \
algorithm.use_kl_in_reward=False \
trainer.total_epochs=200 \
trainer.experiment_name=arithmetic_sequences \
trainer.logger=['console'] \
trainer.n_gpus_per_node=1 \
trainer.test_freq=1 \
trainer.save_freq=110 | tee $OUTPUT_FILE;
python3 tests/e2e/check_results.py --output_file=$OUTPUT_FILE --target 0.19
rm -rf $OUTPUT_FILE

View File

@ -1,20 +0,0 @@
#!/usr/bin/env bash
set -e -x
huggingface-cli download Qwen/Qwen2.5-0.5B --local-dir $HOME/models/Qwen/Qwen2.5-0.5B
python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \
algorithm.adv_estimator=gae \
data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \
data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \
actor_rollout_ref.model.use_fused_kernels=True \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.model.tokenizer_path=tests/e2e/arithmetic_sequence/model \
critic.model.path=Qwen/Qwen2.5-0.5B \
critic.model.use_remove_padding=True \
algorithm.use_kl_in_reward=False \
trainer.total_epochs=1

View File

@ -1,45 +0,0 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def test_flash_attn_cross_entropy():
import torch
from flash_attn.ops.triton.cross_entropy import cross_entropy_loss
from torch import nn
from verl.utils.debug import log_gpu_memory_usage
from verl.utils.torch_functional import logprobs_from_logits_naive
log_gpu_memory_usage("At start")
hidden_states = torch.randn(size=(2048, 5120), device="cuda", requires_grad=True, dtype=torch.bfloat16)
linear = nn.Linear(in_features=5120, out_features=155136, bias=False, device="cuda", dtype=torch.bfloat16)
logits = linear(hidden_states)
# logits = logits.float()
labels = torch.randint(low=0, high=155136, size=(2048,), device="cuda")
log_gpu_memory_usage("before computation")
# output = checkpoint.checkpoint(logprobs_from_logits, logits, labels, use_reentrant=True)
output = -cross_entropy_loss(logits, labels)[0]
# output = logprobs_from_logits(logits, labels)
log_gpu_memory_usage("After forward")
output.sum().backward()
log_gpu_memory_usage("After backward")
groundtruth = logprobs_from_logits_naive(logits.float(), labels)
torch.testing.assert_close(output, groundtruth)

View File

@ -1,96 +0,0 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import torch
from flash_attn.bert_padding import unpad_input
from verl.utils.model import create_random_mask
def test_log_probs_from_logits_response_rmpad():
from verl.utils.torch_functional import log_probs_from_logits_response, log_probs_from_logits_response_rmpad
vocab_size = 32000
batch_size = 2
prompt_length = 256
response_length = 256
input_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, prompt_length + response_length), device="cuda")
attention_mask = create_random_mask(input_ids=input_ids, max_ratio_of_left_padding=0.2, max_ratio_of_valid_token=0.8, min_ratio_of_valid_token=0.6)
response_mask = attention_mask[:, -response_length:]
assert torch.all(response_mask[:, 0] == 1)
logits = torch.randn(batch_size, prompt_length + response_length, vocab_size, device="cuda")
logits_rmpad = unpad_input(logits, attention_mask)[0]
expected_output = log_probs_from_logits_response(input_ids=input_ids, logits=logits, response_length=response_length)
actual_output = log_probs_from_logits_response_rmpad(input_ids=input_ids, attention_mask=attention_mask, logits_rmpad=logits_rmpad, response_length=response_length)
# This should bitwise-align, since this operation only contains gather operators
assert torch.all(torch.eq(actual_output * response_mask, expected_output * response_mask))
@pytest.mark.parametrize("dtype", [torch.float64, torch.float32, torch.float16, torch.bfloat16])
def test_logprobs_from_logits_v2(dtype):
from verl.utils.torch_functional import logprobs_from_logits_naive, logprobs_from_logits_v2
vocab_size = 32000
batch_size = 2
seq_len = 512
labels = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len), device="cuda")
logits = torch.randn(batch_size, seq_len, vocab_size, device="cuda", dtype=dtype)
expected_output = logprobs_from_logits_naive(labels=labels, logits=logits)
actual_output = logprobs_from_logits_v2(labels=labels, logits=logits)
if dtype in [torch.float16, torch.bfloat16]: # float16 falls back to an exactly equivalent method
assert torch.equal(actual_output, expected_output)
else: # small numerical difference when using gather / logsumexp approach
torch.testing.assert_close(actual_output, expected_output, rtol=1e-5, atol=1e-5)
def test_lr_scheduler():
from torch import nn
model = nn.Linear(10, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
from verl.utils.torch_functional import get_constant_schedule_with_warmup
constant_lr = get_constant_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=2)
lr_lst = []
for _ in range(5):
lr_lst.append(constant_lr.get_last_lr()[0])
constant_lr.step()
torch.testing.assert_close(lr_lst, [0.0, 0.0005, 0.001, 0.001, 0.001])
from verl.utils.torch_functional import get_cosine_schedule_with_warmup
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
cosine_lr = get_cosine_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=2, num_training_steps=5, min_lr_ratio=0.1)
lr_lst = []
for _ in range(5):
lr_lst.append(cosine_lr.get_last_lr()[0])
cosine_lr.step()
torch.testing.assert_close(lr_lst, [0.0, 0.0005, 0.001, 0.0007750000000000002, 0.0003250000000000002])

View File

@ -1,47 +0,0 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import subprocess
import time
def test():
wait_time = 10
my_env = os.environ.copy()
my_env["WAIT_TIME"] = str(wait_time)
p = subprocess.Popen(["python3", "-u", "./check_worker_alive/main.py"], env=my_env, stdout=subprocess.PIPE)
output = b""
count = 0
while b"foo started" not in output:
    # read incrementally; p.stdout.read() would block until the child's stdout closes
    output += p.stdout.readline()
    time.sleep(1)
    count += 1
    if count > 40:
        raise RuntimeError("timeout for start foo in check_worker_alive/main.py")
print(
time.time(),
f"wait 1.5 wait time {wait_time * 1.5} to let signal returned to process but still not exceed process wait time",
)
time.sleep(wait_time * 1.5)
print(time.time(), "start checking")
assert p.poll() is not None, f"process {p} still alive, expecting signal raised abort"
assert p.returncode != 0, f"process {p} exit with code 0, expecting not-zero exit code"
print("test passed")
if __name__ == "__main__":
test()

View File

@ -0,0 +1,13 @@
# Copyright 2025 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -0,0 +1 @@
This folder is reserved for unit tests (instead of end-to-end tests) that require multiple GPUs.
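For illustration, here is a minimal sketch of the kind of test that belongs here (hypothetical, not an existing verl test; assumes NCCL-capable GPUs and a `torchrun --nproc-per-node=4 --standalone <file>` launch):

```python
# Hypothetical multi-GPU unit test: verify all_reduce sums values across ranks.
import torch
import torch.distributed as dist

def test_all_reduce():
    dist.init_process_group("nccl")  # torchrun provides rank/world-size env vars
    rank = dist.get_rank()
    torch.cuda.set_device(rank % torch.cuda.device_count())
    x = torch.ones(1, device="cuda") * (rank + 1)
    dist.all_reduce(x)  # default op is SUM
    world = dist.get_world_size()
    assert x.item() == world * (world + 1) / 2
    dist.destroy_process_group()

if __name__ == "__main__":
    test_all_reduce()
```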

View File

@ -15,4 +15,4 @@
#!/usr/bin/env bash
set -e -x
torchrun --nproc-per-node=4 --standalone tests/distributed/test_tensor_dict.py
torchrun --nproc-per-node=4 --standalone tests/special_distributed/test_tensor_dict.py

View File

@ -0,0 +1 @@
This folder is reserved for end-to-end tests that typically require multiple GPUs.

View File

@ -138,7 +138,7 @@ python3 -m verl.trainer.main_ppo \
| tee "${output_file}"
if [ "${CUSTOM_REWARD_FN}" = "True" ]; then
python3 tests/e2e/check_custom_rwd_fn.py --output_file="${output_file}"
python3 tests/special_e2e/check_custom_rwd_fn.py --output_file="${output_file}"
check_exit_code=$?
rm -rf "${reward_fn_file_path}"
rm -rf "${output_file}"

View File

@ -33,14 +33,14 @@ if not re_modules:
print(f"❌ Invalid PR title: '{pr_title}'")
print("Expected format: [module] type: description")
print(f"Allowed modules: {', '.join(allowed_modules)}")
sys.exit(1)
raise Exception("Invalid PR title")
else:
modules = re.findall(r"[a-z]+", re_modules.group(1).lower())
if not all(module in allowed_modules for module in modules):
invalid_modules = [module for module in modules if module not in allowed_modules]
print(f"❌ Invalid modules: {', '.join(invalid_modules)}")
print(f"Allowed modules: {', '.join(allowed_modules)}")
sys.exit(1)
raise Exception("Invalid PR title")
types_pattern = "|".join(re.escape(t) for t in allowed_types)
re_types_pattern = re.compile(rf"^\[[a-z_,\s]+\]\s+({types_pattern}):\s+.+$", re.IGNORECASE)
@ -50,7 +50,7 @@ if not match:
print(f"❌ Invalid PR title: '{pr_title}'")
print("Expected format: [module] type: description")
print(f"Allowed types: {', '.join(allowed_types)}")
sys.exit(1)
raise Exception("Invalid PR title")
change_type = match.group(1).lower()

View File

@ -71,5 +71,6 @@ def test_trainer_config_doc():
print("Please read the top block of `verl/trainer/config/ppo_trainer.yaml` to see format rules:\n")
for err in validation_errors:
print(" -", err)
raise Exception("Please fix documentation format.")
else:
print("YAML format check passed ✅")

View File

@ -0,0 +1 @@
The standalone test folder is reserved for tests that require a dedicated environment (e.g. memory stress tests).

Some files were not shown because too many files have changed in this diff