Mirror of https://github.com/volcengine/verl.git (synced 2025-10-20 13:43:50 +08:00)
[ci] feat: upgrade sglang to 0.5.2 (#3613)
### What does this PR do?

Upgrades sglang to 0.5.2, addressing https://github.com/volcengine/verl/pull/3530#issuecomment-3332840437.
@@ -77,7 +77,7 @@ jobs:
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
     container:
-      image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2
+      image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -110,7 +110,7 @@ jobs:
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
     container:
-      image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2
+      image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -75,7 +75,7 @@ permissions:
   contents: read
 
 env:
-  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2"
+  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
   DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
 
 jobs:
.github/workflows/.deprecate/e2e_spin.yml (vendored, 2 changed lines)
@@ -53,7 +53,7 @@ permissions:
   contents: read
 
 env:
-  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2"
+  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
   DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
 
 # Cancel jobs on the same ref if a new one is triggered
.github/workflows/.deprecate/e2e_sppo.yml (vendored, 2 changed lines)
@@ -56,7 +56,7 @@ concurrency:
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 
 env:
-  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2"
+  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
   DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
   TRANSFORMERS_VERSION: "4.56.2"
 
.github/workflows/checkpoint_converter.yml (vendored, 4 changed lines)
@@ -81,7 +81,7 @@ jobs:
       NO_PROXY: "localhost,127.0.0.1"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
     container:
-      image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2
+      image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -116,7 +116,7 @@ jobs:
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
       HF_ENDPOINT: "https://hf-mirror.com"
     container:
-      image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2
+      image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -86,7 +86,7 @@ permissions:
   contents: read
 
 env:
-  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2"
+  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
   DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
 
 jobs:
@@ -86,7 +86,7 @@ permissions:
   contents: read
 
 env:
-  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2"
+  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
   DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
 
 jobs:
@@ -217,7 +217,6 @@ jobs:
       - name: Install the current repository
         run: |
           pip3 install -e .[test,geo,gpu,sglang] --no-deps
-          pip install "transformers[hf_xet]==4.54.0"
       # Geo3k
       - name: Prepare GEO3K dataset
         run: |
.github/workflows/e2e_sft.yml (vendored, 2 changed lines)
@@ -70,7 +70,7 @@ permissions:
   contents: read
 
 env:
-  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2"
+  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
   DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
 
 jobs:
.github/workflows/gpu_unit_tests.yml (vendored, 2 changed lines)
@@ -80,7 +80,7 @@ jobs:
       NO_PROXY: "localhost,127.0.0.1"
       HF_HUB_ENABLE_HF_TRANSFER: 1
     container:
-      image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2
+      image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
.github/workflows/sgl.yml (vendored, 2 changed lines)
@@ -77,7 +77,7 @@ permissions:
   contents: read
 
 env:
-  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2"
+  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
   DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
 
 jobs:
@@ -0,0 +1,4 @@
+FROM verlai/verl:base-verl0.6-cu128-cudnn9.8-torch2.8.0-fa2.7.4
+
+RUN pip install --no-cache-dir "sglang[all]==0.5.2"
+RUN pip install --no-cache-dir "torch-memory-saver==0.0.9rc1"
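As a quick sanity check (not part of this commit), one could confirm the new pins inside the resulting app image. The tag below is the one the updated workflows reference; whether this Dockerfile is what produces that exact tag is an assumption for illustration.

# Hedged sketch: verify the sglang and torch-memory-saver pins in the app image.
# The image tag is assumed to match the CI workflows above; substitute your local tag.
docker run --rm verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2 \
    pip show sglang torch-memory-saver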
docker/verl0.6-cu128-torch2.8.0-fa2.7.4/Dockerfile.base (new file, 108 lines)
@@ -0,0 +1,108 @@
+# Start from the NVIDIA official image (ubuntu-24.04 + cuda-12.8 + python-3.12)
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-03.html
+FROM nvcr.io/nvidia/pytorch:25.03-py3
+
+# Define environments
+ENV MAX_JOBS=32
+ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
+ENV DEBIAN_FRONTEND=noninteractive
+ENV NODE_OPTIONS=""
+ENV PIP_ROOT_USER_ACTION=ignore
+ENV HF_HUB_ENABLE_HF_TRANSFER="1"
+ENV PIP_CONSTRAINT=""
+
+ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+# Change pip source
+RUN pip config set global.index-url "${PIP_INDEX}" && \
+    pip config set global.extra-index-url "${PIP_INDEX}" && \
+    pip config set global.no-cache-dir "true" && \
+    python -m pip install --upgrade pip
+
+# Install systemctl
+RUN apt-get update && \
+    apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
+    apt-get clean
+
+# Install libxml2
+RUN apt-get update && \
+    apt-get install -y libxml2 aria2 && \
+    apt-get clean
+
+# Uninstall nv-pytorch fork
+RUN pip uninstall -y torch torchvision torchaudio \
+    pytorch-quantization pytorch-triton torch-tensorrt \
+    transformer_engine flash_attn apex megatron-core \
+    xgboost opencv grpcio
+
+# Fix packages
+RUN pip install --no-cache-dir tensordict torchdata "transformers[hf_xet]==4.55.4" accelerate datasets peft hf-transfer \
+    "numpy<2.0.0" "pyarrow>=19.0.1" pandas \
+    ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
+    pytest py-spy pre-commit ruff
+
+# Fix cv2
+RUN rm -rf /usr/local/lib/python3.11/dist-packages/cv2
+
+# Install torch
+RUN pip install --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128
+
+# Install flash-attn
+RUN pip install --no-cache-dir --no-build-isolation flash_attn==2.7.4.post1
+
+# Install DeepEP
+# the dependency of IBGDA
+RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
+
+# Clone and build deepep and deepep-nvshmem
+RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
+    git clone https://github.com/deepseek-ai/DeepEP.git && \
+    cd DeepEP && git checkout a84a248
+
+# Prepare nvshmem
+RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
+    tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
+    cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
+
+## Build deepep-nvshmem
+RUN apt-get install -y ninja-build cmake
+
+ENV CUDA_HOME=/usr/local/cuda
+### Set MPI environment variables. Having errors when not set.
+ENV CPATH=/usr/local/mpi/include:$CPATH
+ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
+ENV GDRCOPY_HOME=/workspace/gdrcopy
+ENV GDRCOPY_INCLUDE=/workspace/gdrcopy/include
+
+RUN cd deepep-nvshmem && \
+    NVSHMEM_SHMEM_SUPPORT=0 \
+    NVSHMEM_UCX_SUPPORT=0 \
+    NVSHMEM_USE_NCCL=0 \
+    NVSHMEM_MPI_SUPPORT=0 \
+    NVSHMEM_IBGDA_SUPPORT=1 \
+    NVSHMEM_PMIX_SUPPORT=0 \
+    NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
+    NVSHMEM_USE_GDRCOPY=1 \
+    cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
+
+ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
+ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
+ENV PATH=$NVSHMEM_DIR/bin:$PATH
+
+## Build deepep
+RUN cd DeepEP && \
+    python setup.py install
+
+# Install Apex
+RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
+
+# Install TransformerEngine
+RUN export NVTE_FRAMEWORK=pytorch && pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
+
+# Install Megatron-LM
+RUN git clone -b core_v0.13.0 https://github.com/NVIDIA/Megatron-LM.git && \
+    cd Megatron-LM && pip3 install --no-deps -e .
+
+# Install mbridge
+RUN pip3 install --no-cache-dir git+https://github.com/ISEEKYAN/mbridge.git
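For local reproduction (again, not part of the commit), building the base image could look roughly like the sketch below. The Dockerfile path comes from this diff; the output tag is borrowed from the FROM line of the app-layer Dockerfile above and is an assumption about how the image is tagged.

# Hedged sketch: build the new base image from the repository root.
# The -t tag mirrors the app layer's FROM line; adjust if your naming differs.
docker build \
    -f docker/verl0.6-cu128-torch2.8.0-fa2.7.4/Dockerfile.base \
    -t verlai/verl:base-verl0.6-cu128-cudnn9.8-torch2.8.0-fa2.7.4 \
    .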
@@ -17,5 +17,5 @@ torchdata
 torchvision
 transformers
 wandb
-sglang[all]==0.4.10.post2
+sglang[all]==0.5.2
 huggingface_hub
setup.py (4 changed lines)
@@ -52,8 +52,8 @@ MATH_REQUIRES = ["math-verify"]  # Add math-verify as an optional dependency
 VLLM_REQUIRES = ["tensordict>=0.8.0,<=0.10.0,!=0.9.0", "vllm>=0.7.3,<=0.9.1"]
 SGLANG_REQUIRES = [
     "tensordict>=0.8.0,<=0.10.0,!=0.9.0",
-    "sglang[srt,openai]==0.4.10.post2",
-    "torch==2.7.1",
+    "sglang[srt,openai]==0.5.2",
+    "torch==2.8.0",
 ]
 TRL_REQUIRES = ["trl<=0.9.6"]
 MCORE_REQUIRES = ["mbridge"]
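With the updated extras, an editable install that exercises the new pins might look like this minimal sketch, assuming a local checkout; the sglang extra is the same one used by the CI install step above.

# Hedged sketch: install verl with the sglang extra, which now resolves
# sglang[srt,openai]==0.5.2 and torch==2.8.0, then print the resolved versions.
pip install -e .[sglang]
python -c "from importlib.metadata import version; print(version('sglang'), version('torch'))"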
@@ -159,7 +159,7 @@ class SGLangHttpServer:
                 scheduler_info=self.scheduler_info,
             )
         )
-
+        app.is_single_tokenizer_mode = True
         self._server_port, self._server_task = await run_unvicorn(app, server_args)
 
     async def wake_up(self):