mirror of
https://github.com/volcengine/verl.git
synced 2025-10-20 13:43:50 +08:00
[docker] aws efa driver dockerfile (#1631)
### Checklist Before Starting

- [x] Search for similar PR(s).

### What does this PR do?

Add a sample Dockerfile that installs the AWS EFA driver. Without it, NCCL raises a system error on AWS instances that use EFA network interfaces (such as SageMaker AI Pods).
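
For reference, a quick way to check that an instance actually exposes EFA devices before digging into NCCL errors (this assumes the EFA userspace tools from the AWS installer are present on the host, as they are on SageMaker/DLAMI images):

```bash
ls /sys/class/infiniband/                 # EFA NICs are registered as RDMA devices
/opt/amazon/efa/bin/fi_info -p efa        # libfabric should report an "efa" provider
```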
docker/Dockerfile.awsefa (new file, 53 lines added)
@@ -0,0 +1,53 @@
FROM whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6-mcore0.12.0-te2.3

# For AWS instances with an EFA network interface (SageMaker AI Pod),
# install the EFA driver:
######## AWS EFA ############
ENV NCCL_VERSION=2.25.1-1
ENV DEBIAN_FRONTEND=noninteractive
ENV EFA_INSTALLER_VERSION=1.40.0
ENV AWS_OFI_NCCL_VERSION=1.14.2
ENV FI_EFA_SET_CUDA_SYNC_MEMOPS=0
ENV FI_PROVIDER=efa

RUN apt update && apt install -y linux-image-generic libhwloc-dev

RUN cd /tmp && \
    curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    cd aws-efa-installer && \
    ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify && \
    ldconfig && \
    rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/*
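
# --skip-kmod installs only the userspace libfabric/EFA stack; inside a container
# the EFA kernel module has to come from the host (e.g. the SageMaker/EKS node AMI).
# An optional sanity check (path assumes the installer's default prefix) could be:
# RUN /opt/amazon/efa/bin/fi_info --version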

# NCCL EFA Plugin
RUN cd /tmp && \
    curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    tar -xzf /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    rm /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} aws-ofi-nccl && \
    cd /tmp/aws-ofi-nccl && \
    ./autogen.sh && \
    ./configure --prefix=/opt/amazon/efa \
        --with-libfabric=/opt/amazon/efa \
        --with-cuda=/usr/local/cuda \
        --enable-platform-aws \
        --with-mpi=/opt/amazon/openmpi && \
    make -j$(nproc) install && \
    rm -rf /tmp/aws-ofi-nccl
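
# Given --prefix=/opt/amazon/efa above, the plugin's libnccl-net.so should land in
# /opt/amazon/efa/lib. An optional build-time check could be:
# RUN ls /opt/amazon/efa/lib | grep -i nccl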

# NCCL
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf && \
    echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \
    ldconfig

ENV OMPI_MCA_pml=^cm,ucx \
    OMPI_MCA_btl=tcp,self \
    OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent \
    OPAL_PREFIX=/opt/amazon/openmpi \
    NCCL_SOCKET_IFNAME=^docker,lo,veth_def_agent \
    FI_EFA_USE_HUGE_PAGE=0
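
# The "^" prefix in NCCL_SOCKET_IFNAME and OMPI_MCA_pml means "exclude"; together with
# the *_if_exclude list this keeps NCCL and OpenMPI traffic off loopback, the docker
# bridge and veth devices (veth_def_agent is an interface name seen on some
# SageMaker/EKS nodes). If your hosts use different interface names, override them at
# run time, e.g.:
# docker run -e NCCL_SOCKET_IFNAME=^lo,docker -e OMPI_MCA_btl_tcp_if_exclude=lo,docker0 ... whatcanyousee/verl:awsefa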

# docker build -t whatcanyousee/verl:awsefa --label "commit=$(git rev-parse --short HEAD)" .
# on aws:
# docker run --ipc=host --privileged --name verldev --gpus all --network=host --shm-size=1800gb -itd whatcanyousee/verl:awsefa
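
After starting the container, one might confirm that the EFA stack and the NCCL plugin are actually visible (the container name "verldev" matches the run command above; paths follow the EFA installer defaults):

# run on the AWS host after the `docker run` above
docker exec verldev /opt/amazon/efa/bin/fi_info -p efa   # libfabric sees the EFA NICs
docker exec verldev ls /opt/amazon/efa/lib               # libnccl-net.so from aws-ofi-nccl
# With NCCL_DEBUG=INFO set on a training job, the NCCL log then shows whether the
# OFI/EFA transport was selected or NCCL fell back to plain sockets.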
|
@@ -69,7 +69,7 @@ See files under ``docker/`` for NGC-based image or if you want to build your own
    # pip3 install verl[sglang]

.. note::

    The Docker image ``whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6-mcore0.12.0-te2.3`` is built with the following configurations:

    - **PyTorch**: 2.6.0+cu124
@@ -84,6 +84,10 @@ See files under ``docker/`` for NGC-based image or if you want to build your own
    - **TransformerEngine**: 2.3
    - **Ray**: 2.44.1

.. note::

    For AWS instances with an EFA network interface (SageMaker AI Pod),
    you need to install the EFA driver as shown in ``docker/Dockerfile.awsefa``.
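
    For example (the image tag follows the comment at the end of ``docker/Dockerfile.awsefa``; adjust it to your registry):

    .. code-block:: bash

        docker build -f docker/Dockerfile.awsefa -t whatcanyousee/verl:awsefa docker/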

Install from custom environment
---------------------------------------------