From 20ff66a0c1ec07e527626ee42b8f3afe8f75493d Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Sat, 8 Feb 2020 22:00:24 -0800 Subject: [PATCH] Azure tutorial updates and cleanup (#43) --- README.md | 14 ++--- azure/Dockerfile | 112 --------------------------------- azure/README.md | 46 +------------- azure/attach.sh | 2 +- azure/build_docker_image.sh | 2 +- azure/setup_docker.sh | 39 ++++++++++-- azure/setup_vms.sh | 10 ++- azure/shutdown_vms.sh | 4 +- bin/deepspeed.pt | 1 + bin/ds_ssh | 20 ++++++ docs/azure.md | 121 ++++++++++++++++++++++++++++++++++++ setup.py | 4 +- 12 files changed, 195 insertions(+), 180 deletions(-) delete mode 100644 azure/Dockerfile mode change 100644 => 120000 azure/README.md create mode 120000 bin/deepspeed.pt create mode 100755 bin/ds_ssh create mode 100644 docs/azure.md diff --git a/README.md b/README.md index 9bbd2e000..b86e7dcb9 100755 --- a/README.md +++ b/README.md @@ -10,10 +10,7 @@ efficient, and effective. DeepSpeed can train DL models with over a hundred billion parameters on current generation of GPU clusters, while achieving over 5x in system performance -compared to the state-of-art. Early adopters of DeepSpeed have already produced -a language model (LM) with over 17B parameters establishing a new SOTA in the LM -category. - +compared to the state-of-art. ## Table of Contents @@ -87,10 +84,6 @@ replicated across data-parallel processes, ZeRO partitions model states to save significant memory. The current implementation (stage 1 of ZeRO) reduces memory by up to 4x relative to the state-of-art. You can read more about ZeRO in our [paper](https://arxiv.org/abs/1910.02054). -With this impressive memory reduction, early adopters of DeepSpeed have already -produced alanguage model (LM) with over 17B parameters called -[Turing-NLG](link-to-turing-blog), establishing a new SOTA in the LM category. - ### Scalability DeepSpeed supports efficient data parallelism, model parallelism, and their combination. 
ZeRO boosts the scaling capability and efficiency further. @@ -167,8 +160,9 @@ overview](./docs/features.md) for descriptions and usage. ### Installation -Please see our [Azure tutorial](azure/README.md) to get started with DeepSpeed on Azure! - +* Please see our [Azure tutorial](docs/azure.md) to get started with DeepSpeed on Azure! +* If you're not on Azure we recommend using our docker image via `docker pull deepspeed/deepspeed:latest` which contains a pre-installed version of DeepSpeed and all the necessary dependencies. +* If you want to install DeepSpeed manually we provide an install script [install.sh](install.sh) to help install on a local machine or across an entire cluster. ### Writing DeepSpeed Models DeepSpeed model training is accomplished using the DeepSpeed engine. The engine diff --git a/azure/Dockerfile b/azure/Dockerfile deleted file mode 100644 index 115314576..000000000 --- a/azure/Dockerfile +++ /dev/null @@ -1,112 +0,0 @@ -FROM nvidia/cuda:10.0-devel-ubuntu18.04 - -############################################################################## -# Installation/Basic Utilities -############################################################################## -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - software-properties-common \ - openssh-client openssh-server \ - pdsh curl sudo net-tools \ - vim iputils-ping wget - -############################################################################## -# Installation Latest Git -############################################################################## -RUN add-apt-repository ppa:git-core/ppa -y && \ - apt-get update && \ - apt-get install -y git && \ - git --version - -############################################################################## -# Python -############################################################################## -ENV DEBIAN_FRONTEND=noninteractive -ENV PYTHON_VERSION=3 -RUN apt-get install -y python3 python3-dev && \ - rm -f /usr/bin/python && \ - 
ln -s /usr/bin/python3 /usr/bin/python && \ - curl -O https://bootstrap.pypa.io/get-pip.py && \ - python get-pip.py && \ - rm get-pip.py && \ - pip install --upgrade pip && \ - # Print python an pip version - python -V && pip -V - -############################################################################## -# TensorFlow -############################################################################## -ENV TENSORFLOW_VERSION=1.14.0 -RUN pip install tensorflow-gpu==${TENSORFLOW_VERSION} - -############################################################################## -# PyTorch -############################################################################## -ENV PYTORCH_VERSION=1.2.0 -ENV TORCHVISION_VERSION=0.4.0 -ENV TENSORBOARDX_VERSION=1.8 -RUN pip install torch==${PYTORCH_VERSION} -RUN pip install torchvision==${TORCHVISION_VERSION} -RUN pip install tensorboardX==${TENSORBOARDX_VERSION} - -############################################################################## -# Temporary Installation Directory -############################################################################## -ENV STAGE_DIR=/tmp -RUN mkdir -p ${STAGE_DIR} - -############################################################################## -# Mellanox OFED -############################################################################## -ENV MLNX_OFED_VERSION=4.6-1.0.1.1 -RUN apt-get install -y libnuma-dev -RUN cd ${STAGE_DIR} && \ - wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz | tar xzf - && \ - cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64 && \ - ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \ - cd ${STAGE_DIR} && \ - rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64* - -############################################################################## -# nv_peer_mem 
-############################################################################## -RUN mkdir -p ${STAGE_DIR} && \ - git clone https://github.com/Mellanox/nv_peer_memory.git ${STAGE_DIR}/nv_peer_memory && \ - cd ${STAGE_DIR}/nv_peer_memory && \ - ./build_module.sh && \ - cd ${STAGE_DIR} && \ - tar xzf ${STAGE_DIR}/nvidia-peer-memory_1.0.orig.tar.gz && \ - cd ${STAGE_DIR}/nvidia-peer-memory-1.0 && \ - apt-get install -y dkms && \ - dpkg-buildpackage -us -uc && \ - dpkg -i ${STAGE_DIR}/nvidia-peer-memory_1.0-8_all.deb - -############################################################################## -## Ucomment and set SSH Daemon port -############################################################################### -ENV SSH_PORT=2222 -RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \ - sed "0,/^#Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config - -############################################################################## -## Add deepspeed user -############################################################################### -# Add a deepspeed user with user id 8877 -#RUN useradd --create-home --uid 8877 deepspeed -RUN useradd --create-home --uid 1000 --shell /bin/bash deepspeed -RUN usermod -aG sudo deepspeed -RUN echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers -# # Change to non-root privilege -USER deepspeed - -############################################################################## -# DeepSpeed -# TODO: once repo is public we can install latest deepspeed via this command -############################################################################## -#RUN git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed -#RUN cd ${STAGE_DIR}/DeepSpeed && \ -# git checkout . 
&& \ -# git checkout master && \ -# sudo ./install.sh -#RUN rm -rf ${STAGE_DIR}/DeepSpeed -#RUN python -c "import deepspeed; print(deepspeed.__version__)" diff --git a/azure/README.md b/azure/README.md deleted file mode 100644 index c4adda9ec..000000000 --- a/azure/README.md +++ /dev/null @@ -1,45 +0,0 @@ -# DeepSpeed with Azure - -This tutorial will help you get started running DeepSpeed on [Azure VMs](https://azure.microsoft.com/en-us/services/virtual-machines/), support for [Azure ML](https://azure.microsoft.com/en-us/services/machine-learning/) will be coming soon! - -To help with launching Azure instances we suggest using the [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/?view=azure-cli-latest). We have created several helper scripts to get you quickly started using DeepSpeed with Azure. - * Install Azure CLI on your local box: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli - * Alternatively you can use the Azure in-browser shell: https://shell.azure.com/ - - ## Create an SSH key - Generate a SSH key that will be used across this tutorial to SSH into your VMs and between Docker containers. `ssh-keygen` is the recommended way of doing this. Our scripts assume your key is located inside the same directory as the Azure scripts. - - ## Azure Config JSON - Our helper scripts depend on the following a configuration JSON for deployment and setup. We have provided a simple example JSON (see: [azure_config.json](azure_config.json)) that sets up a basic environment with two VMs, see the example below. - ```json -{ - "num_vms": 2, - "location": "southcentralus", - "azure_sku": "Standard_NV6_Promo", - "ssh_private_key": "id_rsa", - "docker_ssh_port": 2222 -} - ``` - - ## Create Azure VMs - [./create_vms.sh](create_vms.sh) will create VMs with the Azure SKU you choose and in the region you specify in your config JSON. Feel free to customize your JSON to your desired region/SKU. 
This step will take a few minutes to complete while it sets up all of your VMs on Azure. - - ## Setup VM environment to use DeepSpeed - [./setup_vms.sh](setup_vms.sh) will generate the MPI-style hostfile and SSH config that all of your VMs will use so that your Docker containers can talk to one another after they are setup. - - ## Start the DeepSpeed docker container - [./setup_docker.sh](setup_docker.sh) will pull the DeepSpeed docker image on all VMs and start a container instance in the background. This will take several minutes since it needs to pull the entire Docker image. - - ## Access VMs - [./azure_ssh.sh](azure_ssh.sh) will let you SSH into any of your VMs with this syntax: `./azure_ssh.sh [command]`, the node-id is a number between [0, num_vms). This script will find the public IP address of your VM and use the SSH key you provided in the azure config JSON. - -## Access DeepSpeed container -Everything should be up and running at this point, let's access the running DeepSpeed container on the first VM and make sure we can talk to the other containers in our setup. Let's complete the following steps: - - * SSH into the first VM via: `./azure_ssh.sh 0` - * Change directories into the azure folder of this repo via: `cd ~/workdir/DeepSpeed/azure` - * Attach the running docker container via: `./attach.sh` - * You should now be able to ssh into any other docker container, the containers can be accessed via their SSH alias of 'worker-N' where N is the VM number between [0, num_vms). In this example we should be able to successfully run `ssh worker-1 hostname`. 
- -## Run Cifar example model -TODO diff --git a/azure/README.md b/azure/README.md new file mode 120000 index 000000000..d2c8befff --- /dev/null +++ b/azure/README.md @@ -0,0 +1 @@ +../docs/azure.md \ No newline at end of file diff --git a/azure/attach.sh b/azure/attach.sh index 231bde5cd..c23127b0f 100755 --- a/azure/attach.sh +++ b/azure/attach.sh @@ -1,4 +1,4 @@ #!/bin/bash name=${1-deepspeed} -docker exec -i -t $name /bin/bash +docker exec -i -w /home/deepspeed -t $name /bin/bash diff --git a/azure/build_docker_image.sh b/azure/build_docker_image.sh index c0930533d..e8617f084 100755 --- a/azure/build_docker_image.sh +++ b/azure/build_docker_image.sh @@ -1,3 +1,3 @@ #!/bin/bash -docker build -t deepspeed:0.1 -f Dockerfile . +docker build -t deepspeed:0.1 -f ../Dockerfile . diff --git a/azure/setup_docker.sh b/azure/setup_docker.sh index 287e25a6e..ab8f93ee6 100755 --- a/azure/setup_docker.sh +++ b/azure/setup_docker.sh @@ -6,6 +6,13 @@ if [ ! -f ${azure_config} ]; then exit 1 fi +parallel=true +command -v pdsh +if [ $? != 0 ]; then + echo "Installing pdsh will allow for the docker pull to be done in parallel across the cluster. 
See: 'apt-get install pdsh'" + parallel=false +fi + ssh_key=`cat ${azure_config} | jq .ssh_private_key | sed 's/"//g'` if [ $ssh_key == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi num_vms=`cat ${azure_config} | jq .num_vms` @@ -13,9 +20,29 @@ if [ $num_vms == "null" ]; then echo 'missing num_vms in config'; exit 1; fi args="-i ${ssh_key} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" username=deepspeed -for node_id in `seq 0 $((num_vms - 1))`; do - ip_addr=`az vm list-ip-addresses | jq .[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` - addr=${username}@${ip_addr} - ssh ${args} $addr "docker pull deepspeed/deepspeed:latest" - ssh ${args} $addr "cd workdir/DeepSpeed; git pull; git submodule update --init --recursive; bash azure/start_container.sh" -done + +update_script=" +docker pull deepspeed/deepspeed:latest; +ln -s workdir/DeepSpeed/azure/attach.sh attach.sh; +cd workdir/DeepSpeed; +git pull; +git submodule update --init --recursive; +bash azure/start_container.sh; +" + +if [ $parallel == true ]; then + echo "parallel docker pull" + hosts="" + for node_id in `seq 0 $((num_vms - 1))`; do + addr=`az vm list-ip-addresses | jq .[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` + hosts="${addr},${hosts}" + done + PDSH_SSH_ARGS_APPEND=${args} pdsh -w $hosts -l ${username} $update_script +else + echo "sequential docker pull" + for node_id in `seq 0 $((num_vms - 1))`; do + ip_addr=`az vm list-ip-addresses | jq .[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` + addr=${username}@${ip_addr} + ssh ${args} $addr $update_script + done +fi diff --git a/azure/setup_vms.sh b/azure/setup_vms.sh index 892263890..396acbb01 100755 --- a/azure/setup_vms.sh +++ b/azure/setup_vms.sh @@ -33,14 +33,20 @@ for node_id in `seq 0 $((num_vms - 1))`; do " >> ${ssh_config} done +update_script=" +sudo mkdir -p /job; +sudo chmod -R 777 /job; +mkdir -p workdir; +git clone 
https://github.com/microsoft/DeepSpeed.git workdir/DeepSpeed; +" + for node_id in `seq 0 $((num_vms - 1))`; do ip_addr=`az vm list-ip-addresses | jq .[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` addr=${username}@${ip_addr} echo "copying ssh keys, ssh config, hostfile to worker-${node_id}" + ssh $args ${addr} $update_script scp $args ${ssh_key}* ${addr}:.ssh/ scp $args ${ssh_config} ${addr}:.ssh/ - ssh $args ${addr} "sudo mkdir -p /job/; sudo chmod -R 777 /job; mkdir -p workdir" scp $args ${hostfile} ${addr}:/job/ - ssh $args ${addr} 'git clone https://github.com/microsoft/DeepSpeed.git workdir/DeepSpeed' done rm $hostfile $ssh_config diff --git a/azure/shutdown_vms.sh b/azure/shutdown_vms.sh index 9a68e3034..916ab5533 100755 --- a/azure/shutdown_vms.sh +++ b/azure/shutdown_vms.sh @@ -16,8 +16,8 @@ resource_group=deepspeed_rg_$location for i in `seq 0 $(( num_vms - 1))`; do vm_name=${base_vm_name}_$i - echo "shutting down $vm_name" - az vm stop --resource-group $resource_group --name $vm_name + echo "deallocating $vm_name" + az vm deallocate --resource-group $resource_group --name $vm_name echo "deleting $vm_name" az vm delete -y --resource-group $resource_group --name $vm_name done diff --git a/bin/deepspeed.pt b/bin/deepspeed.pt new file mode 120000 index 000000000..6b7685641 --- /dev/null +++ b/bin/deepspeed.pt @@ -0,0 +1 @@ +ds \ No newline at end of file diff --git a/bin/ds_ssh b/bin/ds_ssh new file mode 100755 index 000000000..c2330e31e --- /dev/null +++ b/bin/ds_ssh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Copyright 2020 The Microsoft DeepSpeed Team + +command -v pdsh +if [ $? 
!= 0 ]; then + echo "Cannot find pdsh, please install via 'apt-get install -y pdsh'" + exit 1 +fi + +hostfile=/job/hostfile + +if [ -f $hostfile ]; then + hosts=`cat $hostfile | awk '{print $1}' | paste -sd "," -` + export PDSH_RCMD_TYPE=ssh + pdsh -w ${hosts} $@ +else + echo "Missing hostfile at ${hostfile}, executing command locally" + $@ +fi diff --git a/docs/azure.md b/docs/azure.md new file mode 100644 index 000000000..374c6359e --- /dev/null +++ b/docs/azure.md @@ -0,0 +1,121 @@ +# DeepSpeed with Azure + +This tutorial will help you get started running DeepSpeed on [Azure virtual +machines](https://azure.microsoft.com/en-us/services/virtual-machines/). Support for +[Azure ML](https://azure.microsoft.com/en-us/services/machine-learning/) will be coming +soon! + +To help with launching Azure instances we suggest using the [Azure +CLI](https://docs.microsoft.com/en-us/cli/azure/?view=azure-cli-latest). We have created +several helper scripts to get you quickly started using DeepSpeed with Azure. + * Install Azure CLI on your local box: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli + * Alternatively you can use the Azure in-browser shell: https://shell.azure.com/ + +## Create an SSH key +Generate an SSH key that will be used across this tutorial to SSH into your VMs and +between Docker containers. `ssh-keygen` is the recommended way of doing this. Our scripts +assume your key is located inside the same directory as the Azure scripts. + +## Azure Config JSON +Our helper scripts depend on the following configuration JSON for deployment and setup. +We have provided a simple example JSON in `azure_config.json` that sets up a basic +environment with two VMs. 
See the example below: + ```json +{ + "num_vms": 2, + "location": "southcentralus", + "azure_sku": "Standard_NV6_Promo", + "ssh_private_key": "id_rsa", + "docker_ssh_port": 2222 +} +``` + +## Dependencies +The scripts in this tutorial require [jq](https://stedolan.github.io/jq/) to help with +parsing JSON from the command line. Also it is recommended to install +[pdsh](https://linux.die.net/man/1/pdsh) to help launch ssh connections in parallel. + +## Create Azure VMs +We first need to allocate the VMs. We provide a script +```bash +./create_vms.sh +``` +to create VMs with the Azure SKU in the region specified in `azure_config.json`. Feel +free to customize your JSON to your desired region/SKU. This step will take a few minutes +to complete while it sets up all of your VMs on Azure. + +## Setup VM environment to use DeepSpeed +Next, we need to configure the VM environment for DeepSpeed. We provide a script +```bash +./setup_vms.sh +``` +to generate a [hostfile](../README.md#resource-configuration) and SSH +configuration on all of the VMs. This configuration will be used by the DeepSpeed +Docker containers in the next step. + +## Start the DeepSpeed docker container +We now set up the DeepSpeed Docker containers on the VMs. We provide a script +```bash +./setup_docker.sh +``` +to pull the DeepSpeed image onto all VMs and start a container instance in the +background. This will take several minutes since it needs to pull the entire Docker +image. + +## Access VMs +The tool [azure_ssh.sh](azure_ssh.sh) will let you SSH into any of the VMs with this +syntax: +```bash +./azure_ssh.sh <node-id> [command] +``` +where the `node-id` is a number between `0` and `num_vms-1`. This script will find the +public IP address of your VM and use the SSH key provided in the Azure configuration +JSON. + +## Access DeepSpeed container +Everything should be up and running at this point. 
Let's access the running DeepSpeed +container on the first VM and make sure we can talk to the other containers in our setup. +Let's complete the following steps: + + * SSH into the first VM via: `./azure_ssh.sh 0` + * Change directories into the azure folder of this repo via: `cd ~/workdir/DeepSpeed/azure` + * Attach the running docker container via: `./attach.sh` + * You should now be able to `ssh` into any other docker container. The containers can be + accessed via their SSH alias of `worker-N`, where `N` is the VM number between `0` + and `num_vms-1`. In this example we should be able to successfully run `ssh worker-1 + hostname`. You can also use `ds_ssh` to execute a command in parallel on all of your + worker containers. + +## Run CIFAR-10 example model +We will now run the DeepSpeed CIFAR-10 model example to test the VM setup. From inside +the first DeepSpeed container: + + 1) Install the python dependencies necessary to run the CIFAR-10 example model. You can + do this across your cluster via: + ```bash + ds_ssh pip install -r ~/workdir/DeepSpeed/DeepSpeedExamples/cifar/requirements.txt + ``` + + 2) Now change directories to the CIFAR example: + ```bash + cd ~/workdir/DeepSpeed/DeepSpeedExamples/cifar + ``` + + 3) Finally, launch training across all VMs: + ```bash + deepspeed cifar10_deepspeed.py --deepspeed --deepspeed_config ds_config.json + ``` + Alternatively, we provide a helper script `./run_ds.sh`. + +This will train a simple CIFAR-10 example model. The accuracy that you will achieve will +be dependent on the number of GPUs you are training with; we are using this example +simply to demonstrate that everything is set up correctly, not to train a suitable +CIFAR-10 model. + + +## Megatron-LM GPT2 +DeepSpeed includes an example model using Megatron-LM's GPT2. Please refer to the full +[Megatron tutorial](tutorials/MegatronGPT2Tutorial.md) for more details. 
+ * In order to fully train GPT2 with DeepSpeed and ZeRO we recommend using 8 instances of + Azure's Standard_ND40rs_v2 SKU for a total of 64 NVIDIA V100 GPUs. With this setup you + should be able to train 153.6 million samples in less than 2 weeks of training. diff --git a/setup.py b/setup.py index 486716cb0..769f9d85f 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,9 @@ setup(name='deepspeed', "third_party", "csrc"]), scripts=['bin/deepspeed', - 'bin/ds'], + 'bin/deepspeed.pt', + 'bin/ds', + 'bin/ds_ssh'], classifiers=['Programming Language :: Python :: 3.6'], ext_modules=ext_modules, cmdclass=cmdclass)