mirror of
https://github.com/volcengine/verl.git
synced 2025-10-20 13:43:50 +08:00
### What does this PR do? > This PR adds tensorboard as a dependency in the requirements.txt file, several Dockerfiles (Dockerfile.ngc.vllm, Dockerfile.ngc.vllm0.8, Dockerfile.ngc.vllm0.8.sagemaker), a setup script (install_vllm_sglang_mcore.sh), and the main setup.py file. This change ensures that the tensorboard package is consistently installed, enabling visualization of training metrics across the various configurations and deployment environments. This is a maintenance task that enhances the project's observability without altering core functionality. ### Test > This change is a dependency update and doesn't require specific testing beyond confirming that the installation succeeds. ### API and Usage Example > No API changes are introduced. Users can start TensorBoard themselves after installing the requirements. ```python # No code snippet is applicable for this change ```
49 lines
1.8 KiB
Docker
49 lines
1.8 KiB
Docker
# docker buildx build --platform linux/x86_64 -t "verlai/verl:ngc-th2.4.0-cu124-vllm0.6.3-ray2.4-te1.7-v0.0.6" -f docker/Dockerfile.ngc.vllm . --builder cloud-verlai-verl-builder --progress=plain --push
|
|
# Base image: NVIDIA's PyTorch container from nvcr.io, tag 24.05-py3.
FROM nvcr.io/nvidia/pytorch:24.05-py3
|
|
|
|
# Remove NVIDIA's bundled PyTorch fork and the other preinstalled packages
# listed here; pinned replacements are installed in later steps.
RUN pip3 uninstall -y \
    pytorch-quantization \
    pytorch-triton \
    torch \
    torch-tensorrt \
    torchvision \
    xgboost \
    transformer_engine \
    flash_attn \
    apex \
    megatron-core
|
|
|
|
# Install pinned PyTorch 2.4.0 / torchvision 0.19.0 / torchaudio 2.4.0 wheels
# from the PyTorch cu124 wheel index.
# --no-cache-dir keeps pip's download cache out of the image layer, as the
# other --no-cache-dir install steps in this file already do.
RUN pip3 install --no-cache-dir \
    torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 \
    --index-url https://download.pytorch.org/whl/cu124
|
|
|
|
# =============== Megatron dependencies (optional) =================
# Build NVIDIA apex from its git repository with the C++ and CUDA extensions
# enabled. MAX_JOBS caps parallel compile jobs to avoid out-of-memory failures.
RUN MAX_JOBS=4 pip3 install \
    --verbose \
    --disable-pip-version-check \
    --no-cache-dir \
    --no-build-isolation \
    --config-settings "--build-option=--cpp_ext" \
    --config-settings "--build-option=--cuda_ext" \
    git+https://github.com/NVIDIA/apex
# =============== End of Megatron dependencies (optional) =================
|
|
|
|
# Python packages installed into the image. Only specifiers containing shell
# metacharacters (<, >, =) are quoted; plain package names are left bare.
RUN pip3 install --no-cache-dir \
    accelerate \
    codetiming \
    datasets \
    dill \
    hydra-core \
    numpy \
    pandas \
    peft \
    "pyarrow>=15.0.0" \
    pybind11 \
    pylatexenc \
    "ray>=2.10" \
    "tensordict<0.6" \
    transformers \
    "vllm==0.6.3.post1" \
    wandb \
    tensorboard
|
|
|
|
# full dependencies: test, lint and profiling tools plus liger-kernel.
# --no-cache-dir keeps pip's download cache out of the image layer, as the
# other --no-cache-dir install steps in this file already do.
RUN pip3 install --no-cache-dir \
    pytest \
    pre-commit \
    py-spy \
    pyext \
    liger-kernel
|
|
|
|
# =============== Megatron dependencies (optional) =================
# Transformer Engine requires flash-attn 2.5.8, so that is installed first.
# The two builds are kept in separate RUN steps so Docker can cache each layer.
# MAX_JOBS / NINJA_FLAGS limit build parallelism.
RUN MAX_JOBS=4 NINJA_FLAGS="-j4" pip3 install --no-cache-dir --no-build-isolation \
    flash-attn==2.5.8
# Transformer Engine v1.7.0 from the eric-haibin-lin fork, built with a single job.
# NOTE(review): TE_BUILD_WITH_NINJA=0 presumably disables the ninja backend - confirm.
# --no-cache-dir keeps pip's cache out of the layer, matching the flash-attn step.
RUN MAX_JOBS=1 NINJA_FLAGS="-j1" TE_BUILD_WITH_NINJA=0 pip3 install --no-cache-dir \
    git+https://github.com/eric-haibin-lin/TransformerEngine.git@v1.7.0
# =============== End of Megatron dependencies (optional) =================
|