[ci] feat: upgrade sglang to 0.5.2 (#3613)

### What does this PR do?

Solve
https://github.com/volcengine/verl/pull/3530#issuecomment-3332840437
This commit is contained in:
Joel
2025-09-26 09:25:53 +08:00
committed by GitHub
parent 14c397f474
commit 6ff2b43d13
15 changed files with 128 additions and 17 deletions

View File

@ -77,7 +77,7 @@ jobs:
HF_ENDPOINT: "https://hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container: container:
image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2 image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
options: --gpus all --shm-size=10g options: --gpus all --shm-size=10g
steps: steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@ -110,7 +110,7 @@ jobs:
HF_ENDPOINT: "https://hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container: container:
image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2 image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
options: --gpus all --shm-size=10g options: --gpus all --shm-size=10g
steps: steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@ -75,7 +75,7 @@ permissions:
contents: read contents: read
env: env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2" IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs: jobs:

View File

@ -53,7 +53,7 @@ permissions:
contents: read contents: read
env: env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2" IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
# Cancel jobs on the same ref if a new one is triggered # Cancel jobs on the same ref if a new one is triggered

View File

@ -56,7 +56,7 @@ concurrency:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
env: env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2" IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
TRANSFORMERS_VERSION: "4.56.2" TRANSFORMERS_VERSION: "4.56.2"

View File

@ -81,7 +81,7 @@ jobs:
NO_PROXY: "localhost,127.0.0.1" NO_PROXY: "localhost,127.0.0.1"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container: container:
image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2 image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
options: --gpus all --shm-size=10g options: --gpus all --shm-size=10g
steps: steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@ -116,7 +116,7 @@ jobs:
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
HF_ENDPOINT: "https://hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com"
container: container:
image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2 image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
options: --gpus all --shm-size=10g options: --gpus all --shm-size=10g
steps: steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@ -86,7 +86,7 @@ permissions:
contents: read contents: read
env: env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2" IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs: jobs:

View File

@ -86,7 +86,7 @@ permissions:
contents: read contents: read
env: env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2" IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs: jobs:
@ -217,7 +217,6 @@ jobs:
- name: Install the current repository - name: Install the current repository
run: | run: |
pip3 install -e .[test,geo,gpu,sglang] --no-deps pip3 install -e .[test,geo,gpu,sglang] --no-deps
pip install "transformers[hf_xet]==4.54.0"
# Geo3k # Geo3k
- name: Prepare GEO3K dataset - name: Prepare GEO3K dataset
run: | run: |

View File

@ -70,7 +70,7 @@ permissions:
contents: read contents: read
env: env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2" IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs: jobs:

View File

@ -80,7 +80,7 @@ jobs:
NO_PROXY: "localhost,127.0.0.1" NO_PROXY: "localhost,127.0.0.1"
HF_HUB_ENABLE_HF_TRANSFER: 1 HF_HUB_ENABLE_HF_TRANSFER: 1
container: container:
image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2 image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
options: --gpus all --shm-size=10g options: --gpus all --shm-size=10g
steps: steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@ -77,7 +77,7 @@ permissions:
contents: read contents: read
env: env:
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2" IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
jobs: jobs:

View File

@ -0,0 +1,4 @@
# Application image: installs SGLang on top of the verl base image
# (the base tag names CUDA 12.8, cuDNN 9.8, torch 2.8.0 and flash-attn 2.7.4).
FROM verlai/verl:base-verl0.6-cu128-cudnn9.8-torch2.8.0-fa2.7.4
# Pin SGLang, including its "all" extra, to 0.5.2.
RUN pip install --no-cache-dir "sglang[all]==0.5.2"
# Pin torch-memory-saver to the 0.0.9rc1 pre-release.
RUN pip install --no-cache-dir "torch-memory-saver==0.0.9rc1"

View File

@ -0,0 +1,108 @@
# Start from the NVIDIA official image (ubuntu-24.04 + cuda-12.8 + python-3.12)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-03.html
FROM nvcr.io/nvidia/pytorch:25.03-py3
# Define environments
# NOTE(review): MAX_JOBS presumably caps parallel compile jobs for the source
# builds further down (flash-attn, apex, TransformerEngine) - confirm.
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
# Keep apt/dpkg from prompting during the build.
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
# Silence pip's warning about running as root.
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# NOTE(review): PIP_CONSTRAINT is blanked here, presumably to drop a constraint
# file inherited from the base image - confirm.
ENV PIP_CONSTRAINT=""
# Package index used by pip; override with --build-arg PIP_INDEX=<url>.
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Change pip source
# Points both index-url and extra-index-url at PIP_INDEX, disables the pip
# cache globally, then upgrades pip itself.
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
pip config set global.no-cache-dir "true" && \
python -m pip install --upgrade pip
# Install systemd (the package that provides systemctl).
# --force-confdef lets dpkg take the default action for configuration files
# instead of prompting.
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install libxml2 and aria2
RUN apt-get update && \
apt-get install -y libxml2 aria2 && \
apt-get clean
# Uninstall nv-pytorch fork
# Removes the torch stack and related packages; torch, flash_attn, apex,
# transformer_engine and Megatron are reinstalled at pinned versions later in
# this file.
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
transformer_engine flash_attn apex megatron-core \
xgboost opencv grpcio
# Fix packages
# Installs the Python dependencies in one layer. transformers is pinned to
# 4.55.4, numpy is held below 2.0.0 and pyarrow must be at least 19.0.1; the
# remaining packages are unpinned.
RUN pip install --no-cache-dir tensordict torchdata "transformers[hf_xet]==4.55.4" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pre-commit ruff
# Fix cv2
# Remove the leftover cv2 directory from dist-packages. The base image ships
# Python 3.12, so the previous hard-coded python3.11 path matched nothing and
# the step was a silent no-op. The glob keeps this step working across Python
# version bumps; "rm -rf" stays a no-op if no directory matches.
RUN rm -rf /usr/local/lib/python3*/dist-packages/cv2
# Install torch
# torch 2.8.0 is taken from the PyTorch CUDA 12.8 wheel index instead of the
# globally configured pip index.
RUN pip install --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128
# Install flash-attn
# --no-build-isolation makes pip build in the current environment, so the
# build can see the torch installed above.
RUN pip install --no-cache-dir --no-build-isolation flash_attn==2.7.4.post1
# Install DeepEP
# the dependency of IBGDA
# Expose libmlx5.so.1 under the unversioned name libmlx5.so.
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
# Clone and build deepep and deepep-nvshmem
# This layer only fetches sources: gdrcopy at tag v2.3.1 and DeepEP pinned to
# commit a84a248. gdrcopy is referenced later through GDRCOPY_HOME; no separate
# build step for it is visible in this file.
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem
# Download the nvshmem 3.2.5-1 source archive, unpack it, rename the tree to
# deepep-nvshmem and apply the nvshmem patch shipped in the DeepEP checkout.
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
## Build deepep-nvshmem
# Install the build tools used to compile nvshmem (Ninja generator + CMake).
# The package index is refreshed in the same layer so that a cached earlier
# layer with stale lists cannot make the install fail, and the apt cache is
# cleaned like the other apt layers in this file.
RUN apt-get update && \
    apt-get install -y ninja-build cmake && \
    apt-get clean
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables. Having errors when not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
# NOTE(review): the libmlx5 symlink earlier in this file lives under
# /usr/lib/x86_64-linux-gnu; confirm /usr/local/x86_64-linux-gnu is the
# intended directory here.
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
# The /workspace/... paths below assume the clones above landed in /workspace,
# i.e. that /workspace is the image's working directory - TODO confirm.
ENV GDRCOPY_HOME=/workspace/gdrcopy
ENV GDRCOPY_INCLUDE=/workspace/gdrcopy/include
# Configure and build nvshmem with IBGDA and GDRCopy enabled and with SHMEM,
# UCX, NCCL, MPI and PMIx support disabled. The NVSHMEM_* assignments prefix
# the first cmake invocation only (the configure step). The result is
# installed into /workspace/deepep-nvshmem/install.
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
# Expose the freshly installed nvshmem to later build steps and at runtime.
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
# Installs DeepEP from the checkout made earlier in this file.
# NOTE(review): presumably the build picks up NVSHMEM_DIR set above - confirm.
RUN cd DeepEP && \
python setup.py install
# Install Apex
# Built from the repository's default branch (no tag or commit pinned) with the
# C++ and CUDA extensions enabled.
RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
# Install TransformerEngine
# Pinned to tag v2.2.1. NVTE_FRAMEWORK=pytorch is exported for the build, and
# --no-deps leaves the already installed dependencies untouched.
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
# Install Megatron-LM
# Cloned at branch core_v0.13.0 and installed in editable mode without
# dependencies, so the clone directory must remain in the image.
RUN git clone -b core_v0.13.0 https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && pip3 install --no-deps -e .
# Install mbridge
# Installed from the repository's default branch (unpinned).
RUN pip3 install --no-cache-dir git+https://github.com/ISEEKYAN/mbridge.git

View File

@ -17,5 +17,5 @@ torchdata
torchvision torchvision
transformers transformers
wandb wandb
sglang[all]==0.4.10.post2 sglang[all]==0.5.2
huggingface_hub huggingface_hub

View File

@ -52,8 +52,8 @@ MATH_REQUIRES = ["math-verify"] # Add math-verify as an optional dependency
VLLM_REQUIRES = ["tensordict>=0.8.0,<=0.10.0,!=0.9.0", "vllm>=0.7.3,<=0.9.1"] VLLM_REQUIRES = ["tensordict>=0.8.0,<=0.10.0,!=0.9.0", "vllm>=0.7.3,<=0.9.1"]
SGLANG_REQUIRES = [ SGLANG_REQUIRES = [
"tensordict>=0.8.0,<=0.10.0,!=0.9.0", "tensordict>=0.8.0,<=0.10.0,!=0.9.0",
"sglang[srt,openai]==0.4.10.post2", "sglang[srt,openai]==0.5.2",
"torch==2.7.1", "torch==2.8.0",
] ]
TRL_REQUIRES = ["trl<=0.9.6"] TRL_REQUIRES = ["trl<=0.9.6"]
MCORE_REQUIRES = ["mbridge"] MCORE_REQUIRES = ["mbridge"]

View File

@ -159,7 +159,7 @@ class SGLangHttpServer:
scheduler_info=self.scheduler_info, scheduler_info=self.scheduler_info,
) )
) )
app.is_single_tokenizer_mode = True
self._server_port, self._server_task = await run_unvicorn(app, server_args) self._server_port, self._server_task = await run_unvicorn(app, server_args)
async def wake_up(self): async def wake_up(self):