From 6ff2b43d131fd3f6cace3aadf9587e957a8047bb Mon Sep 17 00:00:00 2001 From: Joel Date: Fri, 26 Sep 2025 09:25:53 +0800 Subject: [PATCH] [ci] feat: upgrade sglang to 0.5.2 (#3613) ### What does this PR do? Solve https://github.com/volcengine/verl/pull/3530#issuecomment-3332840437 --- .../workflows/.deprecate/e2e_ppo_trainer.yml | 4 +- .../e2e_ppo_trainer_megatron_sglang.yml | 2 +- .github/workflows/.deprecate/e2e_spin.yml | 2 +- .github/workflows/.deprecate/e2e_sppo.yml | 2 +- .github/workflows/checkpoint_converter.yml | 4 +- .../e2e_ppo_trainer_megatron_sglang.yml | 2 +- .../e2e_ppo_trainer_megatron_sglang_2.yml | 3 +- .github/workflows/e2e_sft.yml | 2 +- .github/workflows/gpu_unit_tests.yml | 2 +- .github/workflows/sgl.yml | 2 +- .../Dockerfile.app.sglang | 4 + .../Dockerfile.base | 108 ++++++++++++++++++ requirements_sglang.txt | 2 +- setup.py | 4 +- .../sglang_rollout/async_sglang_server.py | 2 +- 15 files changed, 128 insertions(+), 17 deletions(-) create mode 100644 docker/verl0.6-cu128-torch2.8.0-fa2.7.4/Dockerfile.app.sglang create mode 100644 docker/verl0.6-cu128-torch2.8.0-fa2.7.4/Dockerfile.base diff --git a/.github/workflows/.deprecate/e2e_ppo_trainer.yml b/.github/workflows/.deprecate/e2e_ppo_trainer.yml index 74dcc4c20..fa6fef0bd 100644 --- a/.github/workflows/.deprecate/e2e_ppo_trainer.yml +++ b/.github/workflows/.deprecate/e2e_ppo_trainer.yml @@ -77,7 +77,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2 + image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -110,7 +110,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: 
verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2 + image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/.deprecate/e2e_ppo_trainer_megatron_sglang.yml b/.github/workflows/.deprecate/e2e_ppo_trainer_megatron_sglang.yml index 2275249cd..30c22d994 100644 --- a/.github/workflows/.deprecate/e2e_ppo_trainer_megatron_sglang.yml +++ b/.github/workflows/.deprecate/e2e_ppo_trainer_megatron_sglang.yml @@ -75,7 +75,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: diff --git a/.github/workflows/.deprecate/e2e_spin.yml b/.github/workflows/.deprecate/e2e_spin.yml index 080d50d2d..b3c8a85e0 100644 --- a/.github/workflows/.deprecate/e2e_spin.yml +++ b/.github/workflows/.deprecate/e2e_spin.yml @@ -53,7 +53,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" # Cancel jobs on the same ref if a new one is triggered diff --git a/.github/workflows/.deprecate/e2e_sppo.yml b/.github/workflows/.deprecate/e2e_sppo.yml index 2dacbb2ca..0dddd849c 100644 --- a/.github/workflows/.deprecate/e2e_sppo.yml +++ b/.github/workflows/.deprecate/e2e_sppo.yml @@ -56,7 +56,7 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} env: - 
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" TRANSFORMERS_VERSION: "4.56.2" diff --git a/.github/workflows/checkpoint_converter.yml b/.github/workflows/checkpoint_converter.yml index 7ae9752e2..e716709b6 100644 --- a/.github/workflows/checkpoint_converter.yml +++ b/.github/workflows/checkpoint_converter.yml @@ -81,7 +81,7 @@ jobs: NO_PROXY: "localhost,127.0.0.1" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2 + image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -116,7 +116,7 @@ jobs: HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable HF_ENDPOINT: "https://hf-mirror.com" container: - image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2 + image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml b/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml index 7183dee40..cf7b2599d 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml @@ -86,7 +86,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2" 
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: diff --git a/.github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml b/.github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml index 8209124ce..7c4cba92c 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml @@ -86,7 +86,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: @@ -217,7 +217,6 @@ jobs: - name: Install the current repository run: | pip3 install -e .[test,geo,gpu,sglang] --no-deps - pip install "transformers[hf_xet]==4.54.0" # Geo3k - name: Prepare GEO3K dataset run: | diff --git a/.github/workflows/e2e_sft.yml b/.github/workflows/e2e_sft.yml index ff421836a..4a7305bef 100644 --- a/.github/workflows/e2e_sft.yml +++ b/.github/workflows/e2e_sft.yml @@ -70,7 +70,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: diff --git a/.github/workflows/gpu_unit_tests.yml b/.github/workflows/gpu_unit_tests.yml index 17f244bf6..828ce9b59 100644 --- a/.github/workflows/gpu_unit_tests.yml +++ b/.github/workflows/gpu_unit_tests.yml @@ -80,7 +80,7 @@ jobs: NO_PROXY: "localhost,127.0.0.1" HF_HUB_ENABLE_HF_TRANSFER: 1 container: - image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2 + image: 
verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/sgl.yml b/.github/workflows/sgl.yml index b43571187..1f490ddfb 100644 --- a/.github/workflows/sgl.yml +++ b/.github/workflows/sgl.yml @@ -77,7 +77,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: diff --git a/docker/verl0.6-cu128-torch2.8.0-fa2.7.4/Dockerfile.app.sglang b/docker/verl0.6-cu128-torch2.8.0-fa2.7.4/Dockerfile.app.sglang new file mode 100644 index 000000000..23dbea72c --- /dev/null +++ b/docker/verl0.6-cu128-torch2.8.0-fa2.7.4/Dockerfile.app.sglang @@ -0,0 +1,4 @@ +FROM verlai/verl:base-verl0.6-cu128-cudnn9.8-torch2.8.0-fa2.7.4 + +RUN pip install --no-cache-dir "sglang[all]==0.5.2" +RUN pip install --no-cache-dir "torch-memory-saver==0.0.9rc1" diff --git a/docker/verl0.6-cu128-torch2.8.0-fa2.7.4/Dockerfile.base b/docker/verl0.6-cu128-torch2.8.0-fa2.7.4/Dockerfile.base new file mode 100644 index 000000000..7bfd48169 --- /dev/null +++ b/docker/verl0.6-cu128-torch2.8.0-fa2.7.4/Dockerfile.base @@ -0,0 +1,108 @@ +# Start from the NVIDIA official image (ubuntu-24.04 + cuda-12.8 + python-3.12) +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-03.html +FROM nvcr.io/nvidia/pytorch:25.03-py3 + +# Define environments +ENV MAX_JOBS=32 +ENV VLLM_WORKER_MULTIPROC_METHOD=spawn +ENV DEBIAN_FRONTEND=noninteractive +ENV NODE_OPTIONS="" +ENV PIP_ROOT_USER_ACTION=ignore +ENV HF_HUB_ENABLE_HF_TRANSFER="1" +ENV PIP_CONSTRAINT="" + +ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + 
+# Change pip source +RUN pip config set global.index-url "${PIP_INDEX}" && \ + pip config set global.extra-index-url "${PIP_INDEX}" && \ + pip config set global.no-cache-dir "true" && \ + python -m pip install --upgrade pip + +# Install systemd (provides systemctl) +RUN apt-get update && \ + apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \ + apt-get clean + +# Install libxml2 and aria2 +RUN apt-get update && \ + apt-get install -y libxml2 aria2 && \ + apt-get clean + +# Uninstall nv-pytorch fork +RUN pip uninstall -y torch torchvision torchaudio \ + pytorch-quantization pytorch-triton torch-tensorrt \ + transformer_engine flash_attn apex megatron-core \ + xgboost opencv grpcio + +# Fix packages +RUN pip install --no-cache-dir tensordict torchdata "transformers[hf_xet]==4.55.4" accelerate datasets peft hf-transfer \ + "numpy<2.0.0" "pyarrow>=19.0.1" pandas \ + ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \ + pytest py-spy pre-commit ruff + +# Fix cv2 (glob the Python minor version: the base image is python-3.12, so a hard-coded python3.11 path removes nothing) +RUN rm -rf /usr/local/lib/python3*/dist-packages/cv2 + +# Install torch +RUN pip install --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128 + +# Install flash-attn +RUN pip install --no-cache-dir --no-build-isolation flash_attn==2.7.4.post1 + +# Install DeepEP +# the dependency of IBGDA +RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so + +# Clone and build deepep and deepep-nvshmem +RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \ + git clone https://github.com/deepseek-ai/DeepEP.git && \ + cd DeepEP && git checkout a84a248 + +# Prepare nvshmem +RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \ + tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \ + cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch + +## Build deepep-nvshmem +RUN apt-get update && apt-get install -y ninja-build cmake && apt-get clean + +ENV
CUDA_HOME=/usr/local/cuda +### Set MPI environment variables. Having errors when not set. +ENV CPATH=/usr/local/mpi/include:$CPATH +ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH +ENV GDRCOPY_HOME=/workspace/gdrcopy +ENV GDRCOPY_INCLUDE=/workspace/gdrcopy/include + +RUN cd deepep-nvshmem && \ + NVSHMEM_SHMEM_SUPPORT=0 \ + NVSHMEM_UCX_SUPPORT=0 \ + NVSHMEM_USE_NCCL=0 \ + NVSHMEM_MPI_SUPPORT=0 \ + NVSHMEM_IBGDA_SUPPORT=1 \ + NVSHMEM_PMIX_SUPPORT=0 \ + NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ + NVSHMEM_USE_GDRCOPY=1 \ + cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install + +ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install +ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH +ENV PATH=$NVSHMEM_DIR/bin:$PATH + +## Build deepep +RUN cd DeepEP && \ + python setup.py install + +# Install Apex +RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git + +# Install TransformerEngine +RUN export NVTE_FRAMEWORK=pytorch && pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1 + +# Install Megatron-LM +RUN git clone -b core_v0.13.0 https://github.com/NVIDIA/Megatron-LM.git && \ + cd Megatron-LM && pip3 install --no-deps -e . 
+ +# Install mbridge +RUN pip3 install --no-cache-dir git+https://github.com/ISEEKYAN/mbridge.git diff --git a/requirements_sglang.txt b/requirements_sglang.txt index 34e23f4cd..113bca0d3 100644 --- a/requirements_sglang.txt +++ b/requirements_sglang.txt @@ -17,5 +17,5 @@ torchdata torchvision transformers wandb -sglang[all]==0.4.10.post2 +sglang[all]==0.5.2 huggingface_hub diff --git a/setup.py b/setup.py index 780d622e1..4a86f035d 100644 --- a/setup.py +++ b/setup.py @@ -52,8 +52,8 @@ MATH_REQUIRES = ["math-verify"] # Add math-verify as an optional dependency VLLM_REQUIRES = ["tensordict>=0.8.0,<=0.10.0,!=0.9.0", "vllm>=0.7.3,<=0.9.1"] SGLANG_REQUIRES = [ "tensordict>=0.8.0,<=0.10.0,!=0.9.0", - "sglang[srt,openai]==0.4.10.post2", - "torch==2.7.1", + "sglang[srt,openai]==0.5.2", + "torch==2.8.0", ] TRL_REQUIRES = ["trl<=0.9.6"] MCORE_REQUIRES = ["mbridge"] diff --git a/verl/workers/rollout/sglang_rollout/async_sglang_server.py b/verl/workers/rollout/sglang_rollout/async_sglang_server.py index fb6828706..241896403 100644 --- a/verl/workers/rollout/sglang_rollout/async_sglang_server.py +++ b/verl/workers/rollout/sglang_rollout/async_sglang_server.py @@ -159,7 +159,7 @@ class SGLangHttpServer: scheduler_info=self.scheduler_info, ) ) - + app.is_single_tokenizer_mode = True self._server_port, self._server_task = await run_unvicorn(app, server_args) async def wake_up(self):