mirror of
https://github.com/volcengine/verl.git
synced 2025-10-20 21:53:50 +08:00
### What does this PR do?

> This PR adds tensorboard as a dependency in the requirements.txt file, several Dockerfiles (Dockerfile.ngc.vllm, Dockerfile.ngc.vllm0.8, Dockerfile.ngc.vllm0.8.sagemaker), a setup script (install_vllm_sglang_mcore.sh), and the main setup.py file. This change ensures that the tensorboard package is consistently installed, enabling visualization of training metrics for various configurations and deployment environments. This is a maintenance task that enhances the project's observability without altering core functionality.

### Test

> This change is a dependency update and doesn't require specific testing beyond confirming the installation is successful.

### API and Usage Example

> No API changes are introduced. The usage of TensorBoard would be initiated by the user after installing the requirements.

```python
# No code snippet is applicable for this change
```
47 lines
1.7 KiB
Docker
47 lines
1.7 KiB
Docker
# Base image: a pre-built AWS Deep Learning Container (Hugging Face PyTorch training image).
# Per its tag it provides Python 3.10 (py310) and CUDA 12.1 (cu121) on Ubuntu 20.04.
# NOTE(review): the tag is not pinned by digest, so the image content can change upstream.
|
|
FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:2.1.0-transformers4.36.0-gpu-py310-cu121-ubuntu20.04
|
|
|
|
# Remove the PyTorch stack and related GPU/ML packages that may be preinstalled in the
# base image. torch, torchvision and flash-attn are reinstalled at pinned versions by
# later steps in this file.
RUN pip3 uninstall -y \
        pytorch-quantization \
        pytorch-triton \
        torch \
        torch-tensorrt \
        torchvision \
        xgboost \
        transformer_engine \
        flash_attn \
        apex \
        megatron-core
|
|
|
|
# Environment variables baked into the image; they are visible to the remaining build
# steps and to containers started from the image.
# NOTE(review): HF_HUB_ENABLE_HF_TRANSFER presumably relies on the hf-transfer package
# installed by a later pip step - confirm.
ENV MAX_JOBS=32 \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    DEBIAN_FRONTEND=noninteractive \
    NODE_OPTIONS="" \
    HF_HUB_ENABLE_HF_TRANSFER="1"
|
|
|
|
# Install OS packages in a single layer:
#   - systemd (provides systemctl)
#   - tini
# One `apt-get update` serves both packages instead of one per layer, and the package
# index under /var/lib/apt/lists is removed in the same layer so it does not persist in
# the image (`apt-get clean` alone only clears the downloaded .deb archives).
# --force-confdef makes dpkg take the default action for changed config files without
# prompting, which keeps the build non-interactive.
RUN apt-get update && \
    apt-get install -y -o Dpkg::Options::="--force-confdef" \
        systemd \
        tini && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
|
|
|
|
# Install torch-2.6.0 + vllm-0.8.2 together with the remaining Python dependencies
# (training libraries, logging/visualization such as wandb and tensorboard, dev tooling).
#
# Requirement specifiers containing ">" or "[" MUST be quoted. RUN executes through the
# shell, which otherwise parses `transformers>=4.49.0` as the bare package `transformers`
# plus a redirection of pip's stdout into a file named "=4.49.0" (silently dropping the
# version constraint), and treats `ray[default]` as a glob pattern.
RUN pip install --no-cache-dir \
        vllm==0.8.2 torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata==0.11.0 \
        "transformers>=4.49.0" accelerate datasets peft hf-transfer \
        "ray[default]" codetiming hydra-core pandas "pyarrow>=15.0.0" pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler \
        pytest pre-commit py-spy pyext ruff tensorboard
|
|
|
|
# Install flash_attn-2.7.4.post1.
# Any existing transformer-engine / flash-attn installation is removed first so that only
# the pinned flash-attn version remains.
# --no-build-isolation: build in the current environment instead of an isolated one
#   (presumably so the build sees the torch version installed above - confirm).
# --no-cache-dir: keep pip's cache out of the image layer, consistent with the other
#   pip install steps in this file.
RUN pip uninstall -y transformer-engine flash-attn && \
    pip install --no-cache-dir --no-build-isolation flash-attn==2.7.4.post1
|
|
|
|
# Fix cv2: replace pynvml / nvidia-ml-py with a recent nvidia-ml-py, pin the headless
# OpenCV build and fastapi, then upgrade optree.
#
# The ">=" specifiers are quoted on purpose: unquoted, the shell interprets
# `nvidia-ml-py>=12.560.30` and `optree>=0.13.0` as stdout redirections to files named
# "=12.560.30" / "=0.13.0", so the minimum-version constraints would never reach pip.
RUN pip uninstall -y pynvml nvidia-ml-py && \
    pip install --no-cache-dir "nvidia-ml-py>=12.560.30" opencv-python-headless==4.8.0.74 fastapi==0.115.6 && \
    pip install --no-cache-dir --upgrade "optree>=0.13.0"
|
|
|
|
# Install verl with its vllm extra, upgrading to the newest available release (-U).
# The requirement is quoted so the shell cannot treat "[vllm]" as a glob pattern.
# NOTE(review): the verl version is unpinned, so rebuilds may pick up a different release.
RUN pip install --no-cache-dir -U "verl[vllm]"
|
|
|
|
# Reset pip config: drop any custom package index settings so the final image uses
# pip's defaults.
# Nothing in this Dockerfile sets these keys, and `pip config unset` exits non-zero when
# a key is absent, so each unset is best-effort (`|| true`) and runs independently of the
# other instead of aborting the build.
RUN pip config unset global.index-url || true; \
    pip config unset global.extra-index-url || true
|