Azure tutorial/scripts (#42)

Add Azure tutorial text and scripts
This commit is contained in:
Jeff Rasley
2020-02-07 17:10:28 -08:00
committed by GitHub
parent 11a426acea
commit 23a08aebe7
12 changed files with 350 additions and 1 deletions

View File

@ -166,7 +166,8 @@ overview](./docs/features.md) for descriptions and usage.
### Installation
**TODO**
Please see our [Azure tutorial](azure/README.md) to get started with DeepSpeed on Azure!
### Writing DeepSpeed Models

112
azure/Dockerfile Normal file
View File

@ -0,0 +1,112 @@
FROM nvidia/cuda:10.0-devel-ubuntu18.04

##############################################################################
# Installation/Basic Utilities
##############################################################################
# Networking, SSH and editing tools. Several are needed by later build steps
# (curl, wget, sudo, add-apt-repository from software-properties-common,
# the sshd config from openssh-server). One package per line, sorted, so
# that additions show up as single-line diffs.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl \
        iputils-ping \
        net-tools \
        openssh-client \
        openssh-server \
        pdsh \
        software-properties-common \
        sudo \
        vim \
        wget
##############################################################################
# Installation Latest Git
##############################################################################
# Register the git-core PPA, refresh the package index so the PPA is visible,
# install git from it, and print the resulting version into the build log.
# `set -e` aborts the layer on the first failing command.
RUN set -e; \
    add-apt-repository -y ppa:git-core/ppa; \
    apt-get update; \
    apt-get install -y git; \
    git --version
##############################################################################
# Python
##############################################################################
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHON_VERSION=3
# Installs the distribution Python 3, makes `python` point at it, bootstraps
# pip, and prints the python and pip versions.
#
# * `apt-get update` runs in the same layer as the install so a stale, cached
#   package index from an earlier layer is never used.
# * The Ubuntu 18.04 base image provides Python 3.6; the unversioned
#   get-pip.py only supports newer interpreters, so the 3.6-specific
#   bootstrap script is fetched instead.
# * `curl -f` makes an HTTP error fail the build instead of saving an error
#   page as get-pip.py.
RUN apt-get update && \
    apt-get install -y python3 python3-dev && \
    rm -f /usr/bin/python && \
    ln -s /usr/bin/python3 /usr/bin/python && \
    curl -fsSL -o get-pip.py https://bootstrap.pypa.io/pip/3.6/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py && \
    pip install --upgrade pip && \
    python -V && pip -V
##############################################################################
# TensorFlow
##############################################################################
# GPU build of TensorFlow, pinned to an exact release.
# --no-cache-dir keeps pip's download cache out of the image layer.
ENV TENSORFLOW_VERSION=1.14.0
RUN pip install --no-cache-dir tensorflow-gpu==${TENSORFLOW_VERSION}
##############################################################################
# PyTorch
##############################################################################
# PyTorch, torchvision and tensorboardX, each pinned to an exact release.
# They are installed in a single layer so pip resolves them together, and
# --no-cache-dir keeps pip's download cache out of the image.
ENV PYTORCH_VERSION=1.2.0
ENV TORCHVISION_VERSION=0.4.0
ENV TENSORBOARDX_VERSION=1.8
RUN pip install --no-cache-dir \
        torch==${PYTORCH_VERSION} \
        torchvision==${TORCHVISION_VERSION} \
        tensorboardX==${TENSORBOARDX_VERSION}
##############################################################################
# Temporary Installation Directory
##############################################################################
# Scratch location in which the following steps download and build
# third-party sources.
ENV STAGE_DIR=/tmp
RUN mkdir --parents ${STAGE_DIR}
##############################################################################
# Mellanox OFED
##############################################################################
# Installs the user-space part of Mellanox OFED (no firmware update).
ENV MLNX_OFED_VERSION=4.6-1.0.1.1
# Refresh the package index in the same layer as the install so a cached,
# out-of-date index from an earlier layer cannot break the install.
RUN apt-get update && \
    apt-get install -y libnuma-dev
# The archive is downloaded to a file before extraction instead of being
# piped into tar: under the default /bin/sh there is no pipefail, so a failed
# download inside a pipeline would not by itself fail the build step.
# The archive and the unpacked tree are removed in this same layer by the
# trailing glob so neither ends up in the image.
# NOTE(review): the archive is fetched over plain HTTP and is not
# checksum-verified - consider pinning a checksum.
RUN cd ${STAGE_DIR} && \
    wget -q -O MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz \
        http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz && \
    tar xzf MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz && \
    cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64 && \
    ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
    cd ${STAGE_DIR} && \
    rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64*
##############################################################################
# nv_peer_mem
##############################################################################
# Builds the nvidia-peer-memory Debian package from the Mellanox sources and
# installs it.
#
# * `apt-get update` precedes the dkms install so the layer does not depend
#   on a package index cached by an earlier layer.
# * The clone, source tarball, build tree and generated packages are deleted
#   at the end of this same layer (as the OFED step does) so they do not
#   remain in the image.
# NOTE(review): the clone tracks the upstream default branch while the
# artifact names below are fixed at 1.0 / 1.0-8; an upstream version bump
# will break this step - consider checking out a fixed tag.
RUN mkdir -p ${STAGE_DIR} && \
    git clone https://github.com/Mellanox/nv_peer_memory.git ${STAGE_DIR}/nv_peer_memory && \
    cd ${STAGE_DIR}/nv_peer_memory && \
    ./build_module.sh && \
    cd ${STAGE_DIR} && \
    tar xzf ${STAGE_DIR}/nvidia-peer-memory_1.0.orig.tar.gz && \
    cd ${STAGE_DIR}/nvidia-peer-memory-1.0 && \
    apt-get update && \
    apt-get install -y dkms && \
    dpkg-buildpackage -us -uc && \
    dpkg -i ${STAGE_DIR}/nvidia-peer-memory_1.0-8_all.deb && \
    cd ${STAGE_DIR} && \
    rm -rf ${STAGE_DIR}/nv_peer_memory ${STAGE_DIR}/nvidia-peer-memory*
##############################################################################
## Uncomment and set SSH Daemon port
###############################################################################
# Replaces the first commented-out "#Port 22" line of sshd_config with an
# active "Port ${SSH_PORT}" line. The file is edited in place, so no
# intermediate copy of sshd_config is left behind in ${STAGE_DIR}.
# (`0,/re/` is a GNU sed address that limits the substitution to the first
# match; the empty pattern in `s//.../` reuses that same regex.)
ENV SSH_PORT=2222
RUN sed -i "0,/^#Port 22/s//Port ${SSH_PORT}/" /etc/ssh/sshd_config
##############################################################################
## Add deepspeed user
###############################################################################
# Create the `deepspeed` account (uid 1000, home directory, bash login
# shell) as a member of the `sudo` group, and grant it passwordless sudo.
# Alternative uid, left disabled:
#RUN useradd --create-home --uid 8877 deepspeed
RUN useradd --create-home --uid 1000 --shell /bin/bash --groups sudo deepspeed && \
    echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
# Everything from here on runs as the non-root user
USER deepspeed
##############################################################################
# DeepSpeed
# TODO: once repo is public we can install latest deepspeed via this command
##############################################################################
#RUN git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
#RUN cd ${STAGE_DIR}/DeepSpeed && \
# git checkout . && \
# git checkout master && \
# sudo ./install.sh
#RUN rm -rf ${STAGE_DIR}/DeepSpeed
#RUN python -c "import deepspeed; print(deepspeed.__version__)"

45
azure/README.md Normal file
View File

@ -0,0 +1,45 @@
# DeepSpeed with Azure
This tutorial will help you get started running DeepSpeed on [Azure VMs](https://azure.microsoft.com/en-us/services/virtual-machines/). Support for [Azure ML](https://azure.microsoft.com/en-us/services/machine-learning/) will be coming soon!
To help with launching Azure instances we suggest using the [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/?view=azure-cli-latest). We have created several helper scripts to get you quickly started using DeepSpeed with Azure.
* Install Azure CLI on your local box: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli
* Alternatively you can use the Azure in-browser shell: https://shell.azure.com/
## Create an SSH key
Generate an SSH key that will be used throughout this tutorial to SSH into your VMs and between Docker containers. `ssh-keygen` is the recommended way of doing this. Our scripts assume your key is located inside the same directory as the Azure scripts.
## Azure Config JSON
Our helper scripts depend on a configuration JSON for deployment and setup. We have provided a simple example JSON (see: [azure_config.json](azure_config.json)) that sets up a basic environment with two VMs; see the example below.
```json
{
"num_vms": 2,
"location": "southcentralus",
"azure_sku": "Standard_NV6_Promo",
"ssh_private_key": "id_rsa",
"docker_ssh_port": 2222
}
```
## Create Azure VMs
[./create_vms.sh](create_vms.sh) will create VMs with the Azure SKU you choose and in the region you specify in your config JSON. Feel free to customize your JSON to your desired region/SKU. This step will take a few minutes to complete while it sets up all of your VMs on Azure.
## Setup VM environment to use DeepSpeed
[./setup_vms.sh](setup_vms.sh) will generate the MPI-style hostfile and SSH config that all of your VMs will use so that your Docker containers can talk to one another after they are set up.
## Start the DeepSpeed docker container
[./setup_docker.sh](setup_docker.sh) will pull the DeepSpeed docker image on all VMs and start a container instance in the background. This will take several minutes since it needs to pull the entire Docker image.
## Access VMs
[./azure_ssh.sh](azure_ssh.sh) will let you SSH into any of your VMs with this syntax: `./azure_ssh.sh <node-id> [command]`, where the node-id is a number in the range [0, num_vms). This script will find the public IP address of your VM and use the SSH key you provided in the Azure config JSON.
## Access DeepSpeed container
Everything should be up and running at this point, let's access the running DeepSpeed container on the first VM and make sure we can talk to the other containers in our setup. Let's complete the following steps:
* SSH into the first VM via: `./azure_ssh.sh 0`
* Change directories into the azure folder of this repo via: `cd ~/workdir/DeepSpeed/azure`
* Attach the running docker container via: `./attach.sh`
* You should now be able to ssh into any other docker container, the containers can be accessed via their SSH alias of 'worker-N' where N is the VM number between [0, num_vms). In this example we should be able to successfully run `ssh worker-1 hostname`.
## Run Cifar example model
TODO

4
azure/attach.sh Executable file
View File

@ -0,0 +1,4 @@
#!/bin/bash
# Open an interactive bash shell inside a running container.
#
# Usage: ./attach.sh [container-name]
#   container-name  name of the container to attach to (default: deepspeed)

# ":-" (rather than "-") also falls back to the default when $1 is given but
# empty, so docker never receives an empty container name.
name=${1:-deepspeed}

docker exec -i -t "${name}" /bin/bash

7
azure/azure_config.json Normal file
View File

@ -0,0 +1,7 @@
{
"num_vms": 2,
"location": "southcentralus",
"azure_sku": "Standard_NV6_Promo",
"ssh_private_key": "id_rsa",
"docker_ssh_port": 2222
}

21
azure/azure_ssh.sh Executable file
View File

@ -0,0 +1,21 @@
#!/bin/bash
# SSH into one of the VMs as the "deepspeed" user, optionally running a
# command there.
#
# Usage: ./azure_ssh.sh [-c config_file] <node-id> [command...]
#   -c config_file  JSON config that provides ssh_private_key
#                   (default: azure_config.json)
#   node-id         index into the output of `az vm list-ip-addresses`
#   command         optional command passed through to ssh

config_file=azure_config.json

# Print a short usage message on stderr.
usage() {
    echo "Usage: $0 [-c config_file] <node-id> [command]" >&2
}

while getopts 'c:' flag; do
    case "${flag}" in
        c) config_file="${OPTARG}" ;;
        # getopts has already reported the offending option on stderr.
        *) usage; exit 1 ;;
    esac
done
shift $((OPTIND - 1))

echo "Using $config_file"

# Without a node-id the jq filter below would select every VM at once.
if [ $# -lt 1 ]; then
    usage
    exit 1
fi

if [ ! -f "${config_file}" ]; then
    echo "Cannot find $config_file"
    exit 1
fi

nodeid=$1
# Keep the remaining arguments as an array so they reach ssh unmodified.
cmds=("${@:2}")
echo "${nodeid}" "${cmds[@]}"

# The filter is quoted so the shell never treats its brackets as a glob;
# -r prints the bare string instead of a JSON-quoted one.
ip_addr=$(az vm list-ip-addresses | jq -r ".[${nodeid}].virtualMachine.network.publicIpAddresses[0].ipAddress")
if [ -z "${ip_addr}" ] || [ "${ip_addr}" == "null" ]; then
    echo "could not find a public IP address for node ${nodeid}" >&2
    exit 1
fi

ssh_private_key=$(jq -r .ssh_private_key "${config_file}")
if [ -z "${ssh_private_key}" ] || [ "${ssh_private_key}" == "null" ]; then
    echo 'missing ssh_private_key in config'
    exit 1
fi

ssh -i "${ssh_private_key}" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "deepspeed@${ip_addr}" "${cmds[@]}"

3
azure/build_docker_image.sh Executable file
View File

@ -0,0 +1,3 @@
#!/bin/bash
# Build the image described by ./Dockerfile, using the current directory as
# the build context, and tag the result deepspeed:0.1.
docker build --tag deepspeed:0.1 --file Dockerfile .

55
azure/create_vms.sh Executable file
View File

@ -0,0 +1,55 @@
#!/bin/bash
# Create an Azure resource group and the VMs described by azure_config.json.
#
# Reads num_vms, location, azure_sku and ssh_private_key from the config
# file in the current directory. The public key is expected next to the
# private key as <ssh_private_key>.pub.

azure_config=azure_config.json

# Print the value stored under top-level key $1 of the config file.
# Complains on stderr and returns non-zero when the key is absent or the
# file cannot be parsed (jq then prints nothing).
config_value() {
    local key=$1
    local value
    value=$(jq -r ".${key}" "${azure_config}")
    if [ -z "${value}" ] || [ "${value}" == "null" ]; then
        echo "missing ${key} in config" >&2
        return 1
    fi
    echo "${value}"
}

# Make sure jq is installed
if ! command -v jq > /dev/null; then
    echo "Missing dependency of jq, please 'apt-get install jq'"
    exit 1
fi

if [ ! -f "${azure_config}" ]; then
    echo "Cannot find $azure_config"
    exit 1
fi
cat "${azure_config}"

# `exit` inside $(...) would only leave the subshell, hence the `|| exit 1`.
num_vms=$(config_value num_vms) || exit 1
location=$(config_value location) || exit 1
azure_sku=$(config_value azure_sku) || exit 1
ssh_private_key=$(config_value ssh_private_key) || exit 1
ssh_key=${ssh_private_key}.pub

if [ ! -f "${ssh_private_key}" ]; then
    echo "Cannot find $ssh_private_key"
    exit 1
fi
if [ ! -f "${ssh_key}" ]; then
    echo "Cannot find $ssh_key"
    exit 1
fi

resource_group=deepspeed_rg_${location}
az group create --name "${resource_group}" --location "${location}"

base_vm_name=deepspeed
vm_image="nvidia:ngc_azure_17_11:ngc_gpu_cloud_19_11_3:19.11.3"

# Marketplace terms for the image are accepted before any VM is created.
az vm image terms accept --urn "${vm_image}"

for i in $(seq 0 $((num_vms - 1))); do
    vm_name=${base_vm_name}_${i}
    echo "creating $vm_name"
    az vm create \
        --resource-group "${resource_group}" \
        --name "${vm_name}" \
        --image "${vm_image}" \
        --admin-username deepspeed \
        --size "${azure_sku}" \
        --ssh-key-values "${ssh_key}"
done

21
azure/setup_docker.sh Executable file
View File

@ -0,0 +1,21 @@
#!/bin/bash
# On every VM: pull the DeepSpeed docker image, update the DeepSpeed
# checkout in ~/workdir, and start the container via azure/start_container.sh.
#
# Reads ssh_private_key and num_vms from azure_config.json in the current
# directory.

azure_config=azure_config.json

# Print the value stored under top-level key $1 of the config file.
# Complains on stderr and returns non-zero when the key is absent or the
# file cannot be parsed (jq then prints nothing).
config_value() {
    local key=$1
    local value
    value=$(jq -r ".${key}" "${azure_config}")
    if [ -z "${value}" ] || [ "${value}" == "null" ]; then
        echo "missing ${key} in config" >&2
        return 1
    fi
    echo "${value}"
}

if [ ! -f "${azure_config}" ]; then
    echo "Cannot find $azure_config"
    exit 1
fi

# `exit` inside $(...) would only leave the subshell, hence the `|| exit 1`.
ssh_key=$(config_value ssh_private_key) || exit 1
num_vms=$(config_value num_vms) || exit 1

# An array keeps each ssh option a separate word without relying on
# word-splitting of an unquoted string.
ssh_args=(-i "${ssh_key}" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null)
username=deepspeed

# The address list is the same for every node, so query Azure only once.
vm_addresses=$(az vm list-ip-addresses)

for node_id in $(seq 0 $((num_vms - 1))); do
    ip_addr=$(jq -r ".[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress" <<< "${vm_addresses}")
    addr=${username}@${ip_addr}
    ssh "${ssh_args[@]}" "${addr}" "docker pull deepspeed/deepspeed:latest"
    ssh "${ssh_args[@]}" "${addr}" "cd workdir/DeepSpeed; git pull; git submodule update --init --recursive; bash azure/start_container.sh"
done
done

46
azure/setup_vms.sh Executable file
View File

@ -0,0 +1,46 @@
#!/bin/bash
# Prepare every VM for multi-node runs:
#   * generate an MPI-style hostfile ("worker-N slots=<gpus>") and an SSH
#     config with one "worker-N" alias per VM (private IP, docker SSH port),
#   * copy the SSH key files, the SSH config and the hostfile to each VM,
#   * clone the DeepSpeed repository into ~/workdir on each VM.
#
# Reads ssh_private_key and docker_ssh_port from azure_config.json in the
# current directory. Temporary files named "hostfile" and "config" are
# created in the current directory and removed at the end.

azure_config=azure_config.json

# Print the value stored under top-level key $1 of the config file.
# Complains on stderr and returns non-zero when the key is absent or the
# file cannot be parsed (jq then prints nothing).
config_value() {
    local key=$1
    local value
    value=$(jq -r ".${key}" "${azure_config}")
    if [ -z "${value}" ] || [ "${value}" == "null" ]; then
        echo "missing ${key} in config" >&2
        return 1
    fi
    echo "${value}"
}

if [ ! -f "${azure_config}" ]; then
    echo "Cannot find $azure_config"
    exit 1
fi

# `exit` inside $(...) would only leave the subshell, hence the `|| exit 1`.
ssh_key=$(config_value ssh_private_key) || exit 1
docker_ssh_port=$(config_value docker_ssh_port) || exit 1

username=deepspeed
# An array keeps each ssh/scp option a separate word without relying on
# word-splitting of an unquoted string.
ssh_args=(-i "${ssh_key}" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null)

num_vms=$(az vm list | jq '. | length')
if [ -z "${num_vms}" ] || [ "${num_vms}" -lt 1 ]; then
    echo "no VMs found" >&2
    exit 1
fi

# The address list is the same for every node, so query Azure only once.
vm_addresses=$(az vm list-ip-addresses)

# One slot per GPU; the GPU count is taken from the first VM.
first_ip_addr=$(jq -r ".[0].virtualMachine.network.publicIpAddresses[0].ipAddress" <<< "${vm_addresses}")
num_slots=$(ssh "${ssh_args[@]}" "${username}@${first_ip_addr}" 'nvidia-smi -L | wc -l')
echo "number of slots per vm: $num_slots"

hostfile=hostfile
ssh_config=config
echo -n "" > "${hostfile}"
echo -n "" > "${ssh_config}"

for node_id in $(seq 0 $((num_vms - 1))); do
    private_ip_addr=$(jq -r ".[${node_id}].virtualMachine.network.privateIpAddresses[0]" <<< "${vm_addresses}")
    echo "worker-${node_id} slots=${num_slots}" >> "${hostfile}"
    cat >> "${ssh_config}" <<EOF
Host worker-${node_id}
    HostName ${private_ip_addr}
    Port ${docker_ssh_port}
    StrictHostKeyChecking no

EOF
done

for node_id in $(seq 0 $((num_vms - 1))); do
    ip_addr=$(jq -r ".[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress" <<< "${vm_addresses}")
    addr=${username}@${ip_addr}
    echo "copying ssh keys, ssh config, hostfile to worker-${node_id}"
    # The trailing glob is deliberate: it picks up the private key and
    # every file sharing its prefix (e.g. the .pub file).
    scp "${ssh_args[@]}" "${ssh_key}"* "${addr}:.ssh/"
    scp "${ssh_args[@]}" "${ssh_config}" "${addr}:.ssh/"
    ssh "${ssh_args[@]}" "${addr}" "sudo mkdir -p /job/; sudo chmod -R 777 /job; mkdir -p workdir"
    scp "${ssh_args[@]}" "${hostfile}" "${addr}:/job/"
    # Skip the clone when the checkout already exists so the script can be
    # re-run against VMs that were set up before.
    ssh "${ssh_args[@]}" "${addr}" '[ -d workdir/DeepSpeed ] || git clone https://github.com/microsoft/DeepSpeed.git workdir/DeepSpeed'
done

rm "${hostfile}" "${ssh_config}"

23
azure/shutdown_vms.sh Executable file
View File

@ -0,0 +1,23 @@
#!/bin/bash
# Stop and then delete the VMs named deepspeed_0 .. deepspeed_<num_vms-1>
# in the resource group deepspeed_rg_<location>.
#
# Reads num_vms and location from azure_config.json in the current
# directory.

azure_config=azure_config.json

# Print the value stored under top-level key $1 of the config file.
# Complains on stderr and returns non-zero when the key is absent or the
# file cannot be parsed (jq then prints nothing).
config_value() {
    local key=$1
    local value
    value=$(jq -r ".${key}" "${azure_config}")
    if [ -z "${value}" ] || [ "${value}" == "null" ]; then
        echo "missing ${key} in config" >&2
        return 1
    fi
    echo "${value}"
}

if [ ! -f "${azure_config}" ]; then
    echo "Cannot find $azure_config"
    exit 1
fi

# `exit` inside $(...) would only leave the subshell, hence the `|| exit 1`.
num_vms=$(config_value num_vms) || exit 1
location=$(config_value location) || exit 1

base_vm_name=deepspeed
resource_group=deepspeed_rg_${location}

for i in $(seq 0 $((num_vms - 1))); do
    vm_name=${base_vm_name}_${i}
    echo "shutting down $vm_name"
    az vm stop --resource-group "${resource_group}" --name "${vm_name}"
    echo "deleting $vm_name"
    az vm delete -y --resource-group "${resource_group}" --name "${vm_name}"
done
done

11
azure/start_container.sh Executable file
View File

@ -0,0 +1,11 @@
#!/bin/bash
# Start the DeepSpeed container detached, on the host network and with all
# GPUs. The SSH service is started inside it and the container is then kept
# alive with `sleep infinity`.
#
# Usage: ./start_container.sh [container-name]
#   container-name  name given to the container (default: deepspeed)
#
# Bind mounts: ~/workdir and ~/.ssh into the deepspeed user's home, and
# /job/hostfile at the same path inside the container.

# ":-" (rather than "-") also falls back to the default when $1 is given but
# empty.
name=${1:-deepspeed}
image=deepspeed/deepspeed:latest

echo "starting docker image named $name"
docker run -d -t --name "${name}" \
    --network host \
    -v "${HOME}/workdir:/home/deepspeed/workdir" \
    -v "${HOME}/.ssh:/home/deepspeed/.ssh" \
    -v /job/hostfile:/job/hostfile \
    --gpus all "${image}" bash -c 'sudo service ssh start && sleep infinity'