mirror of
https://github.com/deepspeedai/DeepSpeed.git
synced 2025-10-20 23:46:02 +08:00
@ -166,7 +166,8 @@ overview](./docs/features.md) for descriptions and usage.
|
||||
|
||||
|
||||
### Installation
|
||||
**TODO**
|
||||
|
||||
Please see our [Azure tutorial](azure/README.md) to get started with DeepSpeed on Azure!
|
||||
|
||||
|
||||
### Writing DeepSpeed Models
|
||||
|
112
azure/Dockerfile
Normal file
112
azure/Dockerfile
Normal file
@ -0,0 +1,112 @@
|
||||
# Base image: NVIDIA's CUDA 10.0 "devel" image on Ubuntu 18.04.
FROM nvidia/cuda:10.0-devel-ubuntu18.04
|
||||
|
||||
##############################################################################
|
||||
# Installation/Basic Utilities
|
||||
##############################################################################
|
||||
# Install the basic tooling used by the rest of this file and at runtime:
# add-apt-repository (software-properties-common), SSH client/server, pdsh,
# sudo, and a few download/debugging utilities.
# DEBIAN_FRONTEND is set inline so a package configuration step cannot block
# the build on an interactive prompt. The apt package lists are removed in the
# same layer so they do not bloat the image; the next layer that needs them
# runs its own `apt-get update`.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        curl \
        iputils-ping \
        net-tools \
        openssh-client \
        openssh-server \
        pdsh \
        software-properties-common \
        sudo \
        vim \
        wget && \
    rm -rf /var/lib/apt/lists/*
|
||||
|
||||
##############################################################################
|
||||
# Installation Latest Git
|
||||
##############################################################################
|
||||
# Pull git from the git-core PPA instead of the distribution archive, then
# print the installed version into the build log. `set -e` aborts the layer on
# the first failing command.
RUN set -e; \
    add-apt-repository ppa:git-core/ppa -y; \
    apt-get update; \
    apt-get install -y git; \
    git --version
|
||||
|
||||
##############################################################################
|
||||
# Python
|
||||
##############################################################################
|
||||
# Keep apt non-interactive for this and the following package installs.
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHON_VERSION=3
# Install Python 3, make `python` resolve to it, bootstrap pip, and print the
# python and pip versions into the build log.
# - `apt-get update` runs in this layer so the install never depends on a
#   package index cached by an earlier, possibly stale, layer.
# - Ubuntu 18.04 ships Python 3.6, which the unversioned get-pip.py no longer
#   supports, so the 3.6-specific bootstrap script is fetched instead.
# - `curl -f` turns an HTTP error into a build failure instead of saving the
#   error page as get-pip.py.
# - `--no-cache-dir` keeps pip's download cache out of the image layer.
RUN apt-get update && \
    apt-get install -y python3 python3-dev && \
    ln -sf /usr/bin/python3 /usr/bin/python && \
    curl -fsSL -o get-pip.py https://bootstrap.pypa.io/pip/3.6/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py && \
    pip install --no-cache-dir --upgrade pip && \
    python -V && pip -V
|
||||
|
||||
##############################################################################
|
||||
# TensorFlow
|
||||
##############################################################################
|
||||
# GPU build of TensorFlow, pinned through TENSORFLOW_VERSION.
ENV TENSORFLOW_VERSION=1.14.0
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir tensorflow-gpu==${TENSORFLOW_VERSION}
|
||||
|
||||
##############################################################################
|
||||
# PyTorch
|
||||
##############################################################################
|
||||
# PyTorch, torchvision and tensorboardX, each pinned through its own variable.
ENV PYTORCH_VERSION=1.2.0
ENV TORCHVISION_VERSION=0.4.0
ENV TENSORBOARDX_VERSION=1.8
# One pip invocation resolves the three pins together and produces a single
# layer; --no-cache-dir keeps the downloaded wheels out of that layer.
RUN pip install --no-cache-dir \
        torch==${PYTORCH_VERSION} \
        torchvision==${TORCHVISION_VERSION} \
        tensorboardX==${TENSORBOARDX_VERSION}
|
||||
|
||||
##############################################################################
|
||||
# Temporary Installation Directory
|
||||
##############################################################################
|
||||
# Scratch directory in which the following steps download and build sources.
ENV STAGE_DIR=/tmp
# -p makes this a no-op when the directory already exists.
RUN mkdir -p ${STAGE_DIR}
|
||||
|
||||
##############################################################################
|
||||
# Mellanox OFED
|
||||
##############################################################################
|
||||
# User-space Mellanox OFED stack, version selected by MLNX_OFED_VERSION.
ENV MLNX_OFED_VERSION=4.6-1.0.1.1
# Refresh the package index in the same layer as the install so it cannot be
# served from a stale cached layer.
RUN apt-get update && \
    apt-get install -y libnuma-dev
# Download, unpack, install and clean up in one layer so neither the archive
# nor the unpacked tree remains in the image. The archive is saved to a file
# rather than piped into tar: the default /bin/sh has no pipefail, so a failed
# download inside a pipe would not reliably fail the build on its own.
RUN cd ${STAGE_DIR} && \
    wget -q -O MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz && \
    tar xzf MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz && \
    cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64 && \
    ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
    cd ${STAGE_DIR} && \
    rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64*
|
||||
|
||||
##############################################################################
|
||||
# nv_peer_mem
|
||||
##############################################################################
|
||||
# Build the nv_peer_memory Debian package from source and install it.
# - `apt-get update` precedes the dkms install so it does not depend on a
#   package index left behind by an earlier layer.
# - The clone, unpacked sources and build products are deleted in the same
#   layer so they do not stay in the image.
# NOTE(review): the clone is unpinned while the package version (1.0 / 1.0-8)
# is hard-coded below; an upstream version bump will break the tar/dpkg steps.
# Consider cloning a fixed tag.
RUN mkdir -p ${STAGE_DIR} && \
    git clone https://github.com/Mellanox/nv_peer_memory.git ${STAGE_DIR}/nv_peer_memory && \
    cd ${STAGE_DIR}/nv_peer_memory && \
    ./build_module.sh && \
    cd ${STAGE_DIR} && \
    tar xzf ${STAGE_DIR}/nvidia-peer-memory_1.0.orig.tar.gz && \
    cd ${STAGE_DIR}/nvidia-peer-memory-1.0 && \
    apt-get update && \
    apt-get install -y dkms && \
    dpkg-buildpackage -us -uc && \
    dpkg -i ${STAGE_DIR}/nvidia-peer-memory_1.0-8_all.deb && \
    cd ${STAGE_DIR} && \
    rm -rf ${STAGE_DIR}/nv_peer_memory ${STAGE_DIR}/nvidia-peer-memory*
|
||||
|
||||
##############################################################################
|
||||
## Uncomment and set SSH Daemon port
|
||||
###############################################################################
|
||||
# Port the container's SSH daemon is configured to listen on.
ENV SSH_PORT=2222
# Copy sshd_config into the staging directory, then write it back with the
# first "#Port 22" line replaced by "Port ${SSH_PORT}". The `0,/re/` address is
# a GNU sed extension that limits the substitution to the first match.
RUN set -e; \
    cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config; \
    sed -e "0,/^#Port 22/s//Port ${SSH_PORT}/" < ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
|
||||
|
||||
##############################################################################
|
||||
## Add deepspeed user
|
||||
###############################################################################
|
||||
# Add a deepspeed user with user id 1000, a home directory and a bash shell,
# and give it passwordless sudo.
# NOTE(review): uid 1000 presumably matches the host user whose directories
# are bind-mounted into the container -- confirm.
# The sudo rule goes into a drop-in file under /etc/sudoers.d (mode 0440)
# instead of being appended to /etc/sudoers, so the main file is not modified.
RUN useradd --create-home --uid 1000 --shell /bin/bash deepspeed && \
    usermod -aG sudo deepspeed && \
    echo "deepspeed ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/deepspeed && \
    chmod 0440 /etc/sudoers.d/deepspeed
# Change to non-root privilege
USER deepspeed
|
||||
|
||||
##############################################################################
|
||||
# DeepSpeed
|
||||
# TODO: once repo is public we can install latest deepspeed via this command
|
||||
##############################################################################
|
||||
#RUN git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
|
||||
#RUN cd ${STAGE_DIR}/DeepSpeed && \
|
||||
# git checkout . && \
|
||||
# git checkout master && \
|
||||
# sudo ./install.sh
|
||||
#RUN rm -rf ${STAGE_DIR}/DeepSpeed
|
||||
#RUN python -c "import deepspeed; print(deepspeed.__version__)"
|
45
azure/README.md
Normal file
45
azure/README.md
Normal file
@ -0,0 +1,45 @@
|
||||
# DeepSpeed with Azure
|
||||
|
||||
This tutorial will help you get started running DeepSpeed on [Azure VMs](https://azure.microsoft.com/en-us/services/virtual-machines/). Support for [Azure ML](https://azure.microsoft.com/en-us/services/machine-learning/) will be coming soon!
|
||||
|
||||
To help with launching Azure instances we suggest using the [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/?view=azure-cli-latest). We have created several helper scripts to get you quickly started using DeepSpeed with Azure.
|
||||
* Install Azure CLI on your local box: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli
|
||||
* Alternatively you can use the Azure in-browser shell: https://shell.azure.com/
|
||||
|
||||
## Create an SSH key
|
||||
Generate an SSH key that will be used throughout this tutorial to SSH into your VMs and between Docker containers. `ssh-keygen` is the recommended way of doing this. Our scripts assume your key is located inside the same directory as the Azure scripts.
|
||||
|
||||
## Azure Config JSON
|
||||
Our helper scripts depend on the following configuration JSON for deployment and setup. We have provided a simple example JSON (see: [azure_config.json](azure_config.json)) that sets up a basic environment with two VMs; see the example below.
|
||||
```json
|
||||
{
|
||||
"num_vms": 2,
|
||||
"location": "southcentralus",
|
||||
"azure_sku": "Standard_NV6_Promo",
|
||||
"ssh_private_key": "id_rsa",
|
||||
"docker_ssh_port": 2222
|
||||
}
|
||||
```
|
||||
|
||||
## Create Azure VMs
|
||||
[./create_vms.sh](create_vms.sh) will create VMs with the Azure SKU you choose and in the region you specify in your config JSON. Feel free to customize your JSON to your desired region/SKU. This step will take a few minutes to complete while it sets up all of your VMs on Azure.
|
||||
|
||||
## Setup VM environment to use DeepSpeed
|
||||
[./setup_vms.sh](setup_vms.sh) will generate the MPI-style hostfile and SSH config that all of your VMs will use so that your Docker containers can talk to one another after they are set up.
|
||||
|
||||
## Start the DeepSpeed docker container
|
||||
[./setup_docker.sh](setup_docker.sh) will pull the DeepSpeed docker image on all VMs and start a container instance in the background. This will take several minutes since it needs to pull the entire Docker image.
|
||||
|
||||
## Access VMs
|
||||
[./azure_ssh.sh](azure_ssh.sh) will let you SSH into any of your VMs with this syntax: `./azure_ssh.sh <node-id> [command]`, where the node-id is a number in the range [0, num_vms). This script will find the public IP address of your VM and use the SSH key you provided in the Azure config JSON.
|
||||
|
||||
## Access DeepSpeed container
|
||||
Everything should be up and running at this point, let's access the running DeepSpeed container on the first VM and make sure we can talk to the other containers in our setup. Let's complete the following steps:
|
||||
|
||||
* SSH into the first VM via: `./azure_ssh.sh 0`
|
||||
* Change directories into the azure folder of this repo via: `cd ~/workdir/DeepSpeed/azure`
|
||||
* Attach the running docker container via: `./attach.sh`
|
||||
* You should now be able to SSH into any other Docker container. The containers can be accessed via their SSH alias of 'worker-N', where N is the VM number in the range [0, num_vms). In this example we should be able to successfully run `ssh worker-1 hostname`.
|
||||
|
||||
## Run Cifar example model
|
||||
TODO
|
4
azure/attach.sh
Executable file
4
azure/attach.sh
Executable file
@ -0,0 +1,4 @@
|
||||
#!/bin/bash
# Open an interactive bash shell inside a running container.
# Usage: ./attach.sh [container-name]    (defaults to "deepspeed")

name=${1-deepspeed}
# Quoted so the name always reaches docker as exactly one argument.
docker exec -i -t "$name" /bin/bash
|
7
azure/azure_config.json
Normal file
7
azure/azure_config.json
Normal file
@ -0,0 +1,7 @@
|
||||
{
|
||||
"num_vms": 2,
|
||||
"location": "southcentralus",
|
||||
"azure_sku": "Standard_NV6_Promo",
|
||||
"ssh_private_key": "id_rsa",
|
||||
"docker_ssh_port": 2222
|
||||
}
|
21
azure/azure_ssh.sh
Executable file
21
azure/azure_ssh.sh
Executable file
@ -0,0 +1,21 @@
|
||||
#!/bin/bash
# SSH into one of the Azure VMs by index, optionally running a command there.
#
# Usage: ./azure_ssh.sh [-c config.json] <node-id> [command...]
#   -c FILE   config JSON to read (default: azure_config.json)
#   node-id   zero-based index into the output of `az vm list-ip-addresses`
#   command   optional command to run on the VM instead of an interactive shell

usage() {
    echo "Usage: $0 [-c config.json] <node-id> [command...]" >&2
    exit 1
}

config_file=azure_config.json
while getopts 'c:' flag; do
    case "${flag}" in
        c) config_file="${OPTARG}" ;;
        # getopts has already reported the bad option. The previous code
        # called an undefined `error` command here and then kept running.
        *) usage ;;
    esac
done
shift $((OPTIND - 1))
echo "Using $config_file"

# The node id is spliced into a jq array index, so it must be a plain number.
case "$1" in
    ''|*[!0-9]*) usage ;;
esac
nodeid=$1
shift
echo "$nodeid" "$@"

if [ ! -f "${config_file}" ]; then
    echo "Cannot find $config_file" >&2
    exit 1
fi

# The jq filter is quoted so the shell cannot glob-expand `.[N]`; -r prints the
# raw string, replacing the old `sed 's/"//g'` post-processing.
ip_addr=$(az vm list-ip-addresses | jq -r ".[${nodeid}].virtualMachine.network.publicIpAddresses[0].ipAddress")
if [ -z "$ip_addr" ] || [ "$ip_addr" == "null" ]; then
    echo "could not find a public IP address for node ${nodeid}" >&2
    exit 1
fi

# jq prints the literal string "null" when the key is absent.
ssh_private_key=$(jq -r .ssh_private_key "${config_file}")
if [ "$ssh_private_key" == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi

# "$@" forwards the remaining arguments untouched (no local word splitting or
# globbing); with no arguments ssh opens an interactive session.
ssh -i "${ssh_private_key}" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "deepspeed@${ip_addr}" "$@"
|
3
azure/build_docker_image.sh
Executable file
3
azure/build_docker_image.sh
Executable file
@ -0,0 +1,3 @@
|
||||
#!/bin/bash
# Build the image from the Dockerfile that sits next to this script and tag it
# deepspeed:0.1.

# Switch to the script's own directory first so both the Dockerfile and the
# build context resolve correctly no matter where the script is invoked from.
cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1
docker build -t deepspeed:0.1 -f Dockerfile .
|
55
azure/create_vms.sh
Executable file
55
azure/create_vms.sh
Executable file
@ -0,0 +1,55 @@
|
||||
#!/bin/bash
# Create the Azure resource group and VMs described by azure_config.json.
#
# Reads num_vms, location, azure_sku and ssh_private_key from the config file
# in the current directory. The public key is expected at <ssh_private_key>.pub.
# VMs are named deepspeed_0 .. deepspeed_<num_vms-1> and are placed in the
# resource group deepspeed_rg_<location>.

azure_config=azure_config.json

# Make sure jq is installed
if ! command -v jq > /dev/null; then
    echo "Missing dependency of jq, please 'apt-get install jq'"
    exit 1
fi

if [ ! -f "${azure_config}" ]; then
    echo "Cannot find $azure_config"
    exit 1
fi
cat "${azure_config}"

# Print the raw value of a top-level config key. jq prints the literal string
# "null" when the key is absent, which is what the checks below test for.
read_config() {
    jq -r ".$1" "${azure_config}"
}

# All tests quote their operands: an empty value would otherwise turn
# `[ $x == "null" ]` into a syntax error instead of a clean comparison.
num_vms=$(read_config num_vms)
if [ "$num_vms" == "null" ]; then echo 'missing num_vms in config'; exit 1; fi
location=$(read_config location)
if [ "$location" == "null" ]; then echo 'missing location in config'; exit 1; fi
azure_sku=$(read_config azure_sku)
if [ "$azure_sku" == "null" ]; then echo 'missing azure_sku in config'; exit 1; fi
ssh_private_key=$(read_config ssh_private_key)
if [ "$ssh_private_key" == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi
ssh_key=${ssh_private_key}.pub

if [ ! -f "${ssh_private_key}" ]; then
    echo "Cannot find $ssh_private_key"
    exit 1
fi
if [ ! -f "${ssh_key}" ]; then
    echo "Cannot find $ssh_key"
    exit 1
fi

resource_group=deepspeed_rg_$location
# Without the resource group every later `az vm create` would fail, so stop.
if ! az group create --name "${resource_group}" --location "${location}"; then
    echo "failed to create resource group ${resource_group}" >&2
    exit 1
fi

base_vm_name=deepspeed
vm_image="nvidia:ngc_azure_17_11:ngc_gpu_cloud_19_11_3:19.11.3"

az vm image terms accept --urn "${vm_image}"

for i in $(seq 0 $((num_vms - 1))); do
    vm_name=${base_vm_name}_$i
    echo "creating $vm_name"
    az vm create \
        --resource-group "${resource_group}" \
        --name "${vm_name}" \
        --image "${vm_image}" \
        --admin-username deepspeed \
        --size "${azure_sku}" \
        --ssh-key-values "${ssh_key}"
done
|
21
azure/setup_docker.sh
Executable file
21
azure/setup_docker.sh
Executable file
@ -0,0 +1,21 @@
|
||||
#!/bin/bash
# On every VM: pull the DeepSpeed docker image, update the DeepSpeed checkout
# in ~/workdir, and start the container through azure/start_container.sh.
#
# Reads ssh_private_key and num_vms from azure_config.json in the current
# directory.

azure_config=azure_config.json
if [ ! -f "${azure_config}" ]; then
    echo "Cannot find $azure_config"
    exit 1
fi

# jq -r prints raw strings; a missing key comes back as the literal "null".
ssh_key=$(jq -r .ssh_private_key "${azure_config}")
if [ "$ssh_key" == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi
num_vms=$(jq -r .num_vms "${azure_config}")
if [ "$num_vms" == "null" ]; then echo 'missing num_vms in config'; exit 1; fi

# An array keeps each ssh option a separate word even if the key path
# contains spaces.
ssh_args=(-i "${ssh_key}" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null)
username=deepspeed

# Query Azure once; the address list does not change while the loop runs.
vm_ips=$(az vm list-ip-addresses)

for node_id in $(seq 0 $((num_vms - 1))); do
    # The filter is quoted so the shell cannot glob-expand `.[N]`.
    ip_addr=$(jq -r ".[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress" <<< "${vm_ips}")
    addr=${username}@${ip_addr}
    ssh "${ssh_args[@]}" "${addr}" "docker pull deepspeed/deepspeed:latest"
    ssh "${ssh_args[@]}" "${addr}" "cd workdir/DeepSpeed; git pull; git submodule update --init --recursive; bash azure/start_container.sh"
done
|
46
azure/setup_vms.sh
Executable file
46
azure/setup_vms.sh
Executable file
@ -0,0 +1,46 @@
|
||||
#!/bin/bash
# Prepare every VM for multi-node runs:
#   1. build a hostfile ("worker-N slots=<gpus>") and an SSH config that maps
#      worker-N to each VM's private IP on the docker SSH port,
#   2. copy the SSH key pair, the SSH config and the hostfile to each VM,
#   3. clone the DeepSpeed repository into ~/workdir on each VM.
#
# Reads ssh_private_key and docker_ssh_port from azure_config.json in the
# current directory.

azure_config=azure_config.json
if [ ! -f "${azure_config}" ]; then
    echo "Cannot find $azure_config"
    exit 1
fi

# jq -r prints raw strings; a missing key comes back as the literal "null".
ssh_key=$(jq -r .ssh_private_key "${azure_config}")
if [ "$ssh_key" == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi
docker_ssh_port=$(jq -r .docker_ssh_port "${azure_config}")
if [ "$docker_ssh_port" == "null" ]; then echo 'missing docker_ssh_port in config'; exit 1; fi

username=deepspeed
# An array keeps each ssh/scp option a separate word even if the key path
# contains spaces.
ssh_args=(-i "${ssh_key}" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null)

num_vms=$(az vm list | jq '. | length')
# Query the address list once and reuse it for every lookup below.
vm_ips=$(az vm list-ip-addresses)
first_ip_addr=$(jq -r '.[0].virtualMachine.network.publicIpAddresses[0].ipAddress' <<< "${vm_ips}")
# One line of `nvidia-smi -L` per GPU on the first VM; that count is used as
# the slot count for every worker.
num_slots=$(ssh "${ssh_args[@]}" "${username}@${first_ip_addr}" 'nvidia-smi -L | wc -l')
echo "number of slots per vm: $num_slots"

hostfile=hostfile
ssh_config=config
: > "${hostfile}"
: > "${ssh_config}"
for node_id in $(seq 0 $((num_vms - 1))); do
    private_ip_addr=$(jq -r ".[${node_id}].virtualMachine.network.privateIpAddresses[0]" <<< "${vm_ips}")
    echo "worker-${node_id} slots=${num_slots}" >> "${hostfile}"
    cat >> "${ssh_config}" <<EOF
Host worker-${node_id}
    HostName ${private_ip_addr}
    Port ${docker_ssh_port}
    StrictHostKeyChecking no

EOF
done

for node_id in $(seq 0 $((num_vms - 1))); do
    ip_addr=$(jq -r ".[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress" <<< "${vm_ips}")
    addr=${username}@${ip_addr}
    echo "copying ssh keys, ssh config, hostfile to worker-${node_id}"
    # The unquoted trailing * deliberately matches both the private key and
    # its .pub companion.
    scp "${ssh_args[@]}" "${ssh_key}"* "${addr}:.ssh/"
    scp "${ssh_args[@]}" "${ssh_config}" "${addr}:.ssh/"
    ssh "${ssh_args[@]}" "${addr}" "sudo mkdir -p /job/; sudo chmod -R 777 /job; mkdir -p workdir"
    scp "${ssh_args[@]}" "${hostfile}" "${addr}:/job/"
    # Skip the clone when the checkout already exists so the script can be
    # re-run without git failing on a non-empty target directory.
    ssh "${ssh_args[@]}" "${addr}" '[ -d workdir/DeepSpeed ] || git clone https://github.com/microsoft/DeepSpeed.git workdir/DeepSpeed'
done
rm "${hostfile}" "${ssh_config}"
|
23
azure/shutdown_vms.sh
Executable file
23
azure/shutdown_vms.sh
Executable file
@ -0,0 +1,23 @@
|
||||
#!/bin/bash
# Stop and delete the VMs deepspeed_0 .. deepspeed_<num_vms-1> in the resource
# group deepspeed_rg_<location>.
#
# Reads num_vms and location from azure_config.json in the current directory.

azure_config=azure_config.json
if [ ! -f "${azure_config}" ]; then
    echo "Cannot find $azure_config" >&2
    exit 1
fi

# jq -r prints raw strings; a missing key comes back as the literal "null".
# The operands are quoted so an empty value cannot break the `[ ]` test.
num_vms=$(jq -r .num_vms "${azure_config}")
if [ "$num_vms" == "null" ]; then echo 'missing num_vms in config' >&2; exit 1; fi
location=$(jq -r .location "${azure_config}")
if [ "$location" == "null" ]; then echo 'missing location in config' >&2; exit 1; fi

base_vm_name=deepspeed
resource_group=deepspeed_rg_$location

for i in $(seq 0 $((num_vms - 1))); do
    vm_name=${base_vm_name}_$i
    echo "shutting down $vm_name"
    az vm stop --resource-group "${resource_group}" --name "${vm_name}"
    echo "deleting $vm_name"
    az vm delete -y --resource-group "${resource_group}" --name "${vm_name}"
done
|
11
azure/start_container.sh
Executable file
11
azure/start_container.sh
Executable file
@ -0,0 +1,11 @@
|
||||
#!/bin/bash
# Start the DeepSpeed container detached, on the host network, with all GPUs.
# Usage: ./start_container.sh [container-name]    (defaults to "deepspeed")
#
# The host's ~/workdir, ~/.ssh and /job/hostfile are bind-mounted into the
# container. The container starts its SSH service and then sleeps forever so
# it stays up.

name=${1-deepspeed}
image=deepspeed/deepspeed:latest
echo "starting docker image named $name"
# Expansions are quoted so a container name or home directory containing
# whitespace is still passed to docker as one argument.
docker run -d -t --name "$name" \
    --network host \
    -v "${HOME}/workdir:/home/deepspeed/workdir" \
    -v "${HOME}/.ssh:/home/deepspeed/.ssh" \
    -v /job/hostfile:/job/hostfile \
    --gpus all "$image" bash -c 'sudo service ssh start && sleep infinity'
|
Reference in New Issue
Block a user