mirror of
https://github.com/volcengine/verl.git
synced 2025-10-20 13:43:50 +08:00
Compare commits
1 Commits
7ddb9b29f0
...
v0.5.x
Author | SHA1 | Date | |
---|---|---|---|
3392f3b6c6 |
141
docker/Dockerfile.rocm7
Normal file
141
docker/Dockerfile.rocm7
Normal file
@ -0,0 +1,141 @@
|
||||
# Build arguments declared ahead of the first FROM.
#
# BASE_IMAGE     - default base image; the `base` stage is built FROM it.
# REMOTE_VLLM    - suffix that selects the fetch stage through `FROM fetch_vllm_${REMOTE_VLLM}`:
#                  1 picks fetch_vllm_1 (git clone), 0 picks fetch_vllm_0 (copy of the build context).
# COMMON_WORKDIR - directory used as WORKDIR of the `base` stage; stages that need the value
#                  re-declare it with a bare `ARG COMMON_WORKDIR`.
ARG BASE_IMAGE="rocm/vllm-dev:base_rocm7_0930_rc1_20250916_tuned_20250917"
ARG REMOTE_VLLM=1
ARG COMMON_WORKDIR="/app"
|
||||
|
||||
# Base stage: the common parent of the fetch, test and final stages.
FROM ${BASE_IMAGE} AS base

# Optional GPU architecture override. When ARG_PYTORCH_ROCM_ARCH is unset or empty the
# expression falls back to the PYTORCH_ROCM_ARCH value inherited from the base image.
ARG ARG_PYTORCH_ROCM_ARCH
ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}

# Install some basic utilities.
# The apt package lists are deleted in the same layer that downloads them: removing them in a
# later layer (or a later stage) hides the files but does not make the image any smaller.
RUN apt-get update -q -y \
    && apt-get install -q -y \
        apt-transport-https \
        ca-certificates \
        curl \
        libfmt-dev \
        libmsgpack-dev \
        libsqlite3-dev \
        libsuitesparse-dev \
        sqlite3 \
        wget \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN python3 -m pip install --no-cache-dir --upgrade pip

# Remove sccache.
# The three commands are joined with ';' on purpose: each one runs regardless of whether the
# previous one failed, and only the exit status of the final `rm -f` decides the step's result.
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"

# Re-declare the global build arg so it is visible inside this stage.
ARG COMMON_WORKDIR
WORKDIR ${COMMON_WORKDIR}
|
||||
|
||||
|
||||
# -----------------------
# vLLM fetch stages
#
# Two alternative ways of obtaining the vLLM source tree. Each stage only registers an ONBUILD
# trigger, so the copy/clone itself runs in the stage that is built FROM it, i.e. in
# `fetch_vllm`, which picks one of the two via the REMOTE_VLLM build arg.

# REMOTE_VLLM=0: take the vLLM sources from the build context.
FROM base AS fetch_vllm_0
ONBUILD COPY ./ vllm/

# REMOTE_VLLM=1: clone the vLLM sources from a git remote and check out the requested ref.
FROM base AS fetch_vllm_1
# Alternative source (ROCm repository, main branch), currently commented out:
#ARG VLLM_REPO="https://github.com/ROCm/vllm.git"
#ARG VLLM_BRANCH="main"
ARG VLLM_REPO="https://github.com/HollowMan6/vllm.git"
ARG VLLM_BRANCH=sleep_amd
ONBUILD RUN git clone ${VLLM_REPO} \
    && git -C vllm checkout ${VLLM_BRANCH}

# Selector stage: resolves to fetch_vllm_0 or fetch_vllm_1 and fires that stage's ONBUILD trigger.
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
|
||||
|
||||
# -----------------------
# vLLM build stages
FROM fetch_vllm AS build_vllm

# Version string passed to the build as SETUPTOOLS_SCM_PRETEND_VERSION. The default is the value
# that used to be hard-coded in the build command; override it with
# `--build-arg VLLM_WHEEL_VERSION=...` when VLLM_REPO / VLLM_BRANCH point at a different vLLM version.
ARG VLLM_WHEEL_VERSION=0.8.4.dev

# Build vLLM
# Steps, all executed inside the fetched `vllm` directory:
#   1. install the ROCm requirement set,
#   2. clean any previous setup.py build output,
#   3. symlink libamdhip64.so from /opt/rocm/lib into /usr/lib
#      (NOTE(review): presumably so the build can locate the HIP runtime - confirm),
#   4. build the wheel into ./dist with the ROCm/HIP target selected through environment variables.
RUN cd vllm \
    && python3 -m pip install -r requirements/rocm.txt \
    && python3 setup.py clean --all \
    && ln -sf /opt/rocm/lib/libamdhip64.so /usr/lib/libamdhip64.so \
    && VLLM_TARGET_DEVICE=rocm \
       ROCM_PATH=/opt/rocm/ \
       VLLM_GPU_LANG=HIP \
       SETUPTOOLS_SCM_PRETEND_VERSION="${VLLM_WHEEL_VERSION}" \
       python3 setup.py bdist_wheel --dist-dir=dist
# Wheel build without the environment overrides, kept commented out for reference:
#&& python3 setup.py bdist_wheel --dist-dir=dist
|
||||
# Export stage: starts from an empty image and holds only the artifacts taken out of
# `build_vllm`. The test and final stages read from it via bind mount or COPY --from.
FROM scratch AS export_vllm
ARG COMMON_WORKDIR

# Built wheel(s) go to the image root.
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /

# Directories taken over from the vLLM source tree.
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements /requirements
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
|
||||
|
||||
# -----------------------
# Test vLLM image
FROM base AS test

RUN python3 -m pip install --no-cache-dir --upgrade pip && rm -rf /var/lib/apt/lists/*

# Install vLLM
# The export stage is bind-mounted at /install for this step only, so the wheel and requirement
# files are not copied into a layer. Every pip call goes through `python3 -m pip` so that vLLM
# lands in the same interpreter as the test utilities installed further down, and
# --no-cache-dir keeps pip's download cache out of the image.
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
    cd /install \
    && python3 -m pip install --no-cache-dir -U -r requirements/rocm.txt \
    && python3 -m pip install --no-cache-dir -U -r requirements/rocm-test.txt \
    && python3 -m pip uninstall -y vllm \
    && python3 -m pip install --no-cache-dir ./*.whl

WORKDIR /vllm-workspace
ARG COMMON_WORKDIR
# Full vLLM tree from the build stage (sources, tests, examples, ...).
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace

# install development dependencies (for testing)
# `rm -rf vllm` deletes the vllm/ directory of the copied tree
# (NOTE(review): presumably so imports resolve to the installed wheel - confirm).
# The lm-eval requirement is quoted because an unquoted `[api]` is a shell glob pattern.
RUN cd /vllm-workspace \
    && rm -rf vllm \
    && python3 -m pip install --no-cache-dir -e tests/vllm_test_utils \
    && python3 -m pip install --no-cache-dir "lm-eval[api]==0.4.4" \
    && python3 -m pip install --no-cache-dir pytest-shard
|
||||
|
||||
# -----------------------
# Final vLLM image
FROM base AS final

RUN python3 -m pip install --no-cache-dir --upgrade pip && rm -rf /var/lib/apt/lists/*

# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually remove it so that later steps of numpy upgrade can continue.
# The removal only happens when python3 resolves to the /opt/conda/envs/py_3.9 environment;
# for any other interpreter the step is a no-op.
RUN case "$(which python3)" in \
        *"/opt/conda/envs/py_3.9"*) \
            rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
        *) ;; \
    esac

# The requirement is quoted because an unquoted `[cli]` is a shell glob pattern.
RUN python3 -m pip install --no-cache-dir --upgrade "huggingface-hub[cli]"

# Install vLLM
# The export stage is bind-mounted at /install for this step only, so the wheel and requirement
# files are not copied into a layer; --no-cache-dir keeps pip's download cache out of the image.
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
    cd /install \
    && pip install --no-cache-dir -U -r requirements/rocm.txt \
    && pip uninstall -y vllm \
    && pip install --no-cache-dir ./*.whl

ARG COMMON_WORKDIR

# Copy over the benchmark scripts as well
COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples

# Runtime environment baked into the image.
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
ENV TOKENIZERS_PARALLELISM=false

# ENV that can improve safe tensor loading, and end-to-end time
ENV SAFETENSORS_FAST_GPU=1

# Performance environment variable.
ENV HIP_FORCE_DEV_KERNARG=1
|
||||
|
||||
# -----------------------
# Install verl
# tensordict is pinned and installed with --no-deps, i.e. without pulling in its own declared
# dependencies (NOTE(review): presumably to leave the torch build of the base image untouched -
# confirm). The remaining packages are installed normally. --no-cache-dir keeps pip's download
# cache out of the image layers.
RUN pip install --no-cache-dir --no-deps "tensordict==0.6.2" \
    && pip install --no-cache-dir \
        accelerate \
        codetiming \
        datasets \
        dill \
        hydra-core \
        liger-kernel \
        numpy \
        orjson \
        pandas \
        peft \
        "pyarrow>=15.0.0" \
        pybind11 \
        pylatexenc \
        torchdata \
        wandb

# Clone the v0.5.x branch of verl into /workspace/verl and install it in editable mode, so the
# checkout in /workspace/verl is the code that gets imported.
WORKDIR /workspace/
RUN git clone -b v0.5.x https://github.com/volcengine/verl.git \
    && cd verl \
    && pip install --no-cache-dir -e .

# Default to an interactive shell.
CMD ["/bin/bash"]
|
||||
|
Reference in New Issue
Block a user