[docker] aws efa driver dockerfile (#1631)

### Checklist Before Starting

- [x] Search for similar PR(s).

### What does this PR do?

Add a sample Dockerfile that installs the AWS EFA driver. Without it, NCCL
raises a system error on AWS instances that use EFA (such as SageMaker AI pods).
This commit is contained in:
spacegoing
2025-05-22 12:53:21 +08:00
committed by GitHub
parent c07013ea39
commit be215d7b08
2 changed files with 58 additions and 1 deletions

53
docker/Dockerfile.awsefa Normal file
View File

@ -0,0 +1,53 @@
FROM whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6-mcore0.12.0-te2.3

# Image for AWS instances with an EFA network interface (e.g. SageMaker AI pods).
# The steps below add the EFA userspace stack on top of the base verl image.

######## AWS EFA ############
# Build-time only: keeps apt/dpkg non-interactive inside RUN steps without
# baking the setting into the environment of containers started from the image.
ARG DEBIAN_FRONTEND=noninteractive

# Version pins. EFA_INSTALLER_VERSION and AWS_OFI_NCCL_VERSION select the
# archives downloaded by the RUN steps of this file; NCCL_VERSION is only
# recorded in the environment and is not referenced by the build steps here.
ENV NCCL_VERSION=2.25.1-1 \
    EFA_INSTALLER_VERSION=1.40.0 \
    AWS_OFI_NCCL_VERSION=1.14.2

# libfabric runtime settings: select the EFA provider and set its
# CUDA sync-memops option to 0.
ENV FI_EFA_SET_CUDA_SYNC_MEMOPS=0 \
    FI_PROVIDER=efa
# Install the EFA userspace stack with the official AWS installer.
#
# Everything runs in ONE layer so that the apt package index and the installer
# archive are removed in the same layer that created them; deleting them in a
# later layer would not shrink the image.
#
# Installer flags: -y answers prompts automatically, -g enables GPUDirect RDMA
# support, --skip-kmod skips the kernel module (containers use the host
# kernel's module), --skip-limit-conf skips the limits configuration and
# --no-verify skips the post-install check. NOTE(review): flag meanings are
# taken from the EFA installer documentation; re-check when bumping the version.
#
# `cd` is used instead of WORKDIR on purpose so the working directory inherited
# from the base image is left untouched.
RUN apt-get update && \
    apt-get install -y \
        libhwloc-dev \
        linux-image-generic && \
    cd /tmp && \
    # -f: fail on HTTP errors instead of saving an error page as the tarball.
    curl -fsSLO https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    cd aws-efa-installer && \
    ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify && \
    ldconfig && \
    cd /tmp && \
    rm -rf /tmp/aws-efa-installer \
           /tmp/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
           /var/lib/apt/lists/*
# NCCL EFA plugin: build aws-ofi-nccl from the tagged source release and
# install it under /opt/amazon/efa, against the libfabric and Open MPI trees in
# /opt/amazon and the CUDA toolkit in /usr/local/cuda.
# The source tree is deleted in the same layer so no build files remain in the
# image.
RUN cd /tmp && \
    # -f: fail on HTTP errors instead of saving an error page as the tarball.
    curl -fsSLO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    tar -xzf /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    rm /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    # Drop the version suffix so the remaining steps use a fixed path.
    mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} aws-ofi-nccl && \
    cd /tmp/aws-ofi-nccl && \
    ./autogen.sh && \
    ./configure --prefix=/opt/amazon/efa \
        --with-libfabric=/opt/amazon/efa \
        --with-cuda=/usr/local/cuda \
        --enable-platform-aws \
        --with-mpi=/opt/amazon/openmpi && \
    make -j"$(nproc)" install && \
    # Leave the build directory before deleting it.
    cd /tmp && \
    rm -rf /tmp/aws-ofi-nccl
# Dynamic linker search path: register /usr/local/lib and the Open MPI
# libraries under /opt/amazon/openmpi/lib, then rebuild the linker cache.
RUN printf '%s\n' "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf \
 && printf '%s\n' "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf \
 && ldconfig
# Open MPI runtime settings:
#   - OPAL_PREFIX points Open MPI at its install tree under /opt/amazon.
#   - pml "^cm,ucx" excludes the cm and ucx components (leading ^ = exclude).
#   - btl is limited to tcp and self, and the TCP BTL ignores the loopback,
#     docker0 and veth_def_agent interfaces.
ENV OPAL_PREFIX=/opt/amazon/openmpi \
    OMPI_MCA_pml=^cm,ucx \
    OMPI_MCA_btl=tcp,self \
    OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent

# NCCL / libfabric runtime settings:
#   - NCCL_SOCKET_IFNAME uses the same leading-^ exclude form for the docker,
#     lo and veth_def_agent interface names.
#   - FI_EFA_USE_HUGE_PAGE=0 turns off huge-page use in the EFA provider.
ENV NCCL_SOCKET_IFNAME=^docker,lo,veth_def_agent \
    FI_EFA_USE_HUGE_PAGE=0

# Build the image:
#   docker build -t whatcanyousee/verl:awsefa --label "commit=$(git rev-parse --short HEAD)" .
# Run it on an AWS instance:
#   docker run --ipc=host --privileged --name verldev --gpus all --network=host --shm-size=1800gb -itd whatcanyousee/verl:awsefa

View File

@ -69,7 +69,7 @@ See files under ``docker/`` for NGC-based image or if you want to build your own
# pip3 install verl[sglang]
.. note::
The Docker image ``whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6-mcore0.12.0-te2.3`` is built with the following configurations:
- **PyTorch**: 2.6.0+cu124
@ -84,6 +84,10 @@ See files under ``docker/`` for NGC-based image or if you want to build your own
- **TransformerEngine**: 2.3
- **Ray**: 2.44.1
.. note::
For AWS instances with an EFA network interface (e.g. SageMaker AI pods),
you need to install the EFA driver as shown in ``docker/Dockerfile.awsefa``.
Install from custom environment
---------------------------------------------