mirror of
https://github.com/volcengine/verl.git
synced 2025-10-20 21:53:50 +08:00
[megatron] chore: add a docker image with mcore0.15 and TE2.7 (#3540)
This commit is contained in:
@ -0,0 +1,39 @@
|
||||
# Start from the verl base image
|
||||
# Dockerfile.base
|
||||
# NOTE(review): the tag suggests CUDA 12.6, cuDNN 9.8, torch 2.7.1 and flash-attn 2.7.4 built for H100 — confirm against the base image.
FROM iseekyan/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4-h100
|
||||
|
||||
# Define environments
|
||||
# Environment variables persisted into the image; they apply to the build
# steps that follow and to containers started from this image.
# NOTE(review): DEBIAN_FRONTEND is kept in the runtime environment here — confirm that is intended.
ENV MAX_JOBS=32 \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    DEBIAN_FRONTEND=noninteractive \
    NODE_OPTIONS="" \
    PIP_ROOT_USER_ACTION=ignore \
    HF_HUB_ENABLE_HF_TRANSFER="1"
|
||||
|
||||
# Install torch-2.7.1+cu126 + vllm-0.10.0
|
||||
# vLLM is pinned to 0.10.0; pip may resume an interrupted download up to 999 times.
RUN pip install \
        --resume-retries 999 \
        --no-cache-dir \
        "vllm==0.10.0"
|
||||
|
||||
# Fix packages
|
||||
# transformers 4.54.0 is still not supported; a newer version (>=4.55.4) is installed below
|
||||
# Install the pinned and supporting Python packages in a single pip invocation.
# Every requirement that carries an extra (e.g. ray[default]) is quoted so the
# shell cannot treat the square brackets as a glob pattern.
RUN pip install --no-cache-dir \
        "tensordict==0.6.2" "transformers[hf_xet]>=4.55.4" accelerate datasets peft hf-transfer \
        "numpy<2.0.0" "pyarrow>=19.0.1" pandas \
        "ray[default]" codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
        pytest py-spy pyext pre-commit ruff
|
||||
|
||||
# Remove pynvml / nvidia-ml-py, then install nvidia-ml-py again and upgrade the
# listed packages to at least the stated minimum versions.
RUN pip uninstall -y pynvml nvidia-ml-py \
    && pip install --resume-retries 999 --no-cache-dir --upgrade \
        "nvidia-ml-py>=12.560.30" \
        "fastapi[standard]>=0.115.0" \
        "optree>=0.13.0" \
        "pydantic>=2.9" \
        "grpcio>=1.62.1"
|
||||
|
||||
# Pin the cuDNN wheel for CUDA 12 to 9.8.0.87.
RUN pip install \
        --resume-retries 999 \
        --no-cache-dir \
        "nvidia-cudnn-cu12==9.8.0.87"
|
||||
|
||||
# Install TransformerEngine
|
||||
# Install TransformerEngine from the release_v2.7 git ref with NVTE_FRAMEWORK=pytorch
# set in the environment of the pip process. --no-deps skips dependency installation
# and --no-build-isolation builds against the packages already in the image.
RUN NVTE_FRAMEWORK=pytorch pip3 install \
        --resume-retries 999 \
        --no-deps \
        --no-cache-dir \
        --no-build-isolation \
        "git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.7"
|
||||
# Install onnxscript without keeping pip's download cache in the image layer,
# matching the other pip invocations in this file.
RUN pip install --no-cache-dir onnxscript
|
||||
|
||||
# Install Megatron-LM
|
||||
# Install Megatron-LM from the core_v0.15.0rc4 git ref; --no-deps skips
# dependency installation and --no-build-isolation reuses the installed build tools.
RUN pip3 install \
        --no-deps \
        --no-cache-dir \
        --no-build-isolation \
        "git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.15.0rc4"
|
||||
|
||||
# Install mbridge
|
||||
# mbridge is pinned to v0.15.0.
RUN pip3 install \
        --no-cache-dir \
        "mbridge==v0.15.0"
|
||||
|
||||
# Fix qwen vl
|
||||
# trl is installed on its own with --no-deps, so this step installs no dependencies.
RUN pip3 install \
        --no-cache-dir \
        --no-deps \
        trl
|
@ -24,3 +24,4 @@ megatron.core==core_r0.13.0
|
||||
- App image:
|
||||
- `verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2`
|
||||
- `verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2`
|
||||
- `iseekyan/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.15.0-te2.7`
|
||||
|
@ -79,6 +79,8 @@ For latest vLLM with FSDP, please refer to `hiyouga/verl <https://hub.docker.com
|
||||
|
||||
For latest SGLang with FSDP, please refer to `hebiaobuaa/verl <https://hub.docker.com/r/hebiaobuaa/verl>`_ repository and the latest version is ``hebiaobuaa/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2`` which is provided by SGLang RL Group.
|
||||
|
||||
For latest vLLM with Megatron, please refer to ``iseekyan/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.15.0-te2.7``.
|
||||
|
||||
See files under ``docker/`` for NGC-based image or if you want to build your own.
|
||||
|
||||
Note that for AWS instances with EFA net interface (SageMaker AI Pod),
|
||||
|
Reference in New Issue
Block a user