mirror of
https://github.com/deepspeedai/DeepSpeed.git
synced 2025-10-20 06:53:47 +08:00
update info and links. (#2122)
This commit is contained in:
@ -1,3 +1,3 @@
|
||||
# Getting Started with DeepSpeed on Azure
|
||||
|
||||
Please see our [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/) to get started with DeepSpeed on Azure!
|
||||
The recommended and simplest method to try DeepSpeed on Azure is through [AzureML](https://azure.microsoft.com/en-us/services/machine-learning/). For more details, please see our [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/).
|
||||
|
@ -1,4 +0,0 @@
|
||||
#!/bin/bash
# Attach an interactive bash shell to a running DeepSpeed container.
#
# Usage: attach.sh [container-name]
#   container-name  name of the running container (default: deepspeed)

name=${1-deepspeed}

# Quote the name so an empty or whitespace-containing argument reaches docker
# as exactly one word instead of being dropped or split.
docker exec -i -w /home/deepspeed -t "$name" /bin/bash
|
@ -1,7 +0,0 @@
|
||||
{
|
||||
"num_vms": 2,
|
||||
"location": "southcentralus",
|
||||
"azure_sku": "Standard_NV6_Promo",
|
||||
"ssh_private_key": "id_rsa",
|
||||
"docker_ssh_port": 2222
|
||||
}
|
@ -1,29 +0,0 @@
|
||||
#!/bin/bash
# SSH into one VM of the DeepSpeed Azure deployment, optionally running a
# command there instead of opening an interactive session.
#
# Usage: azure_ssh.sh [-c config.json] <node-id> [command...]
#   -c config.json  Azure config to read (default: azure_config.json)
#   node-id         index of the VM in the `az vm list-ip-addresses` output
#   command...      optional command line to run on the VM

config_file=azure_config.json

# Parse options BEFORE reading the config, so that -c selects the file used
# for the existence check, the resource group and the SSH key alike.
while getopts 'c:' flag; do
  case "${flag}" in
    c) config_file="${OPTARG}" ;;
    *)
      echo "Unexpected option ${flag}" >&2
      exit 1
      ;;
  esac
done
shift $((OPTIND - 1))

if [ ! -f "${config_file}" ]; then
  echo "Cannot find $config_file"
  exit 1
fi
echo "Using $config_file"

# jq -r prints the raw string, so no sed pass is needed to strip the quotes.
location=$(jq -r .location "${config_file}")
rg=deepspeed_rg_$location

nodeid=$1
if [ -z "${nodeid}" ]; then
  echo "Usage: $0 [-c config.json] <node-id> [command...]" >&2
  exit 1
fi

# Keep the remote command as an array so the arguments are forwarded as given.
cmds=("${@:2}")
echo "$nodeid" "${cmds[@]}"

# The jq filter is quoted: it contains [...] which the shell would otherwise
# treat as a glob pattern.
ip_addr=$(az vm list-ip-addresses -g "$rg" \
  | jq -r ".[${nodeid}].virtualMachine.network.publicIpAddresses[0].ipAddress")

ssh_private_key=$(jq -r .ssh_private_key "${config_file}")
if [ "$ssh_private_key" == "null" ]; then
  echo 'missing ssh_private_key in config'
  exit 1
fi

# Host key checking is disabled because the VMs' public IPs/keys are not known
# in advance; nothing is recorded in the user's known_hosts file.
ssh -i "${ssh_private_key}" \
  -o StrictHostKeyChecking=no \
  -o UserKnownHostsFile=/dev/null \
  "deepspeed@${ip_addr}" "${cmds[@]}"
|
@ -1,3 +0,0 @@
|
||||
#!/bin/bash
# Build the DeepSpeed image, tagged deepspeed:0.1, from the Dockerfile located
# one directory up, using the current directory as the build context.

docker build \
  -t deepspeed:0.1 \
  -f ../Dockerfile \
  .
|
@ -1,55 +0,0 @@
|
||||
#!/bin/bash
# Create the Azure resource group and the VMs described in azure_config.json.
#
# Reads num_vms, location, azure_sku and ssh_private_key from the config,
# creates the resource group deepspeed_rg_<location>, accepts the image terms
# and creates VMs named deepspeed_0 .. deepspeed_<num_vms-1>.

azure_config=azure_config.json

# Make sure jq is installed
if ! command -v jq >/dev/null; then
  echo "Missing dependency of jq, please 'apt-get install jq'"
  exit 1
fi

if [ ! -f "${azure_config}" ]; then
  echo "Cannot find $azure_config"
  exit 1
fi
cat "$azure_config"

# jq prints the literal word "null" for a missing key; the tests quote their
# operands so an empty value cannot turn them into a syntax error.
num_vms=$(jq .num_vms "${azure_config}")
if [ "$num_vms" == "null" ]; then echo 'missing num_vms in config'; exit 1; fi
location=$(jq -r .location "${azure_config}")
if [ "$location" == "null" ]; then echo 'missing location in config'; exit 1; fi
azure_sku=$(jq -r .azure_sku "${azure_config}")
if [ "$azure_sku" == "null" ]; then echo 'missing azure_sku in config'; exit 1; fi
ssh_private_key=$(jq -r .ssh_private_key "${azure_config}")
if [ "$ssh_private_key" == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi
ssh_key=${ssh_private_key}.pub

# Both halves of the key pair must exist locally: the public key is installed
# on the VMs below, the private key is checked here alongside it.
if [ ! -f "${ssh_private_key}" ]; then
  echo "Cannot find $ssh_private_key"
  exit 1
fi
if [ ! -f "${ssh_key}" ]; then
  echo "Cannot find $ssh_key"
  exit 1
fi

resource_group=deepspeed_rg_$location
az group create --name "${resource_group}" --location "$location"

base_vm_name=deepspeed
vm_image="nvidia:ngc_azure_17_11:ngc_gpu_cloud_19_11_3:19.11.3"

az vm image terms accept --urn "${vm_image}"

for ((i = 0; i < num_vms; i++)); do
  vm_name=${base_vm_name}_$i
  echo "creating $vm_name"
  az vm create \
    --resource-group "${resource_group}" \
    --name "${vm_name}" \
    --image "${vm_image}" \
    --admin-username deepspeed \
    --size "${azure_sku}" \
    --ssh-key-values "${ssh_key}"
done
|
@ -1,50 +0,0 @@
|
||||
#!/bin/bash
# Pull the DeepSpeed Docker image, refresh the DeepSpeed checkout and start the
# container on every VM of the deployment described in azure_config.json.
#
# Uses pdsh to run on all VMs in parallel when it is installed; otherwise
# falls back to one ssh session per VM.

azure_config=azure_config.json
if [ ! -f "${azure_config}" ]; then
  echo "Cannot find $azure_config"
  exit 1
fi
location=$(jq -r .location "${azure_config}")
rg=deepspeed_rg_$location

parallel=true
if ! command -v pdsh >/dev/null; then
  echo "Installing pdsh will allow for the docker pull to be done in parallel across the cluster. See: 'apt-get install pdsh'"
  parallel=false
fi

ssh_key=$(jq -r .ssh_private_key "${azure_config}")
if [ "$ssh_key" == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi
num_vms=$(jq .num_vms "${azure_config}")
if [ "$num_vms" == "null" ]; then echo 'missing num_vms in config'; exit 1; fi

# Kept as an array so that each option stays a separate word for ssh.
ssh_args=(-i "${ssh_key}" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null)
username=deepspeed

update_script="
docker pull deepspeed/deepspeed:latest;
ln -s workdir/DeepSpeed/azure/attach.sh attach.sh;
cd workdir/DeepSpeed;
git pull;
git submodule update --init --recursive;
bash azure/start_container.sh;
"

# Query Azure once and reuse the JSON for every node instead of calling az
# inside the loops.
ip_json=$(az vm list-ip-addresses -g "$rg")

# Print the public IP address of the VM at index $1 in ${ip_json}.
public_ip() {
  jq -r ".[$1].virtualMachine.network.publicIpAddresses[0].ipAddress" <<<"${ip_json}"
}

if [ "$parallel" == true ]; then
  echo "parallel docker pull"
  # Cover all num_vms nodes (not just the first two) and join the addresses
  # with commas without leaving a trailing separator.
  hosts=""
  for ((node_id = 0; node_id < num_vms; node_id++)); do
    addr=$(public_ip "${node_id}")
    hosts="${hosts:+${hosts},}${addr}"
  done
  # $update_script is intentionally unquoted: word splitting flattens the
  # multi-line script into a single ';'-separated command line.
  # shellcheck disable=SC2086
  PDSH_RCMD_TYPE=ssh PDSH_SSH_ARGS_APPEND="${ssh_args[*]}" \
    pdsh -w "$hosts" -l "${username}" $update_script
else
  echo "sequential docker pull"
  for ((node_id = 0; node_id < num_vms; node_id++)); do
    ip_addr=$(public_ip "${node_id}")
    addr=${username}@${ip_addr}
    # Intentionally unquoted, see the pdsh call above.
    # shellcheck disable=SC2086
    ssh "${ssh_args[@]}" "$addr" $update_script
  done
fi
|
@ -1,54 +0,0 @@
|
||||
#!/bin/bash
# Prepare every VM of the deployment for DeepSpeed: generate a hostfile and an
# SSH client config covering all workers, clone DeepSpeed on each VM, and copy
# the SSH keys, SSH config and hostfile over.
#
# Reads location, ssh_private_key and docker_ssh_port from azure_config.json.
# The number of VMs comes from the resource group itself, and the slot count
# per VM from `nvidia-smi -L` on the first VM.

azure_config=azure_config.json
if [ ! -f "${azure_config}" ]; then
  echo "Cannot find $azure_config"
  exit 1
fi
location=$(jq -r .location "${azure_config}")
rg=deepspeed_rg_$location

ssh_key=$(jq -r .ssh_private_key "${azure_config}")
if [ "$ssh_key" == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi
docker_ssh_port=$(jq .docker_ssh_port "${azure_config}")
if [ "$docker_ssh_port" == "null" ]; then echo 'missing docker_ssh_port in config'; exit 1; fi

username=deepspeed
# Kept as an array so that each option stays a separate word for ssh/scp.
ssh_args=(-i "${ssh_key}" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null)

num_vms=$(az vm list -g "$rg" | jq '. | length')

# Query the addresses once and reuse the JSON for every node instead of
# calling az inside the loops. The jq filters are quoted because they contain
# [...] which the shell would otherwise treat as a glob pattern.
ip_json=$(az vm list-ip-addresses -g "$rg")

first_ip_addr=$(jq -r '.[0].virtualMachine.network.publicIpAddresses[0].ipAddress' <<<"${ip_json}")
num_slots=$(ssh "${ssh_args[@]}" "${username}@${first_ip_addr}" 'nvidia-smi -L | wc -l')
echo "number of slots per vm: $num_slots"

hostfile=hostfile
ssh_config=config
# Start from empty files; both are appended to below.
: >"${hostfile}"
: >"${ssh_config}"
for ((node_id = 0; node_id < num_vms; node_id++)); do
  private_ip_addr=$(jq -r ".[${node_id}].virtualMachine.network.privateIpAddresses[0]" <<<"${ip_json}")
  # Write through ${hostfile} so the truncate, append, copy and cleanup steps
  # all refer to the same file.
  echo "worker-${node_id} slots=${num_slots}" >>"${hostfile}"
  cat >>"${ssh_config}" <<EOF
Host worker-${node_id}
    HostName ${private_ip_addr}
    Port ${docker_ssh_port}
    StrictHostKeyChecking no

EOF
done

update_script="
sudo mkdir -p /job;
sudo chmod -R 777 /job;
mkdir -p workdir;
git clone https://github.com/microsoft/DeepSpeed.git workdir/DeepSpeed;
"

for ((node_id = 0; node_id < num_vms; node_id++)); do
  ip_addr=$(jq -r ".[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress" <<<"${ip_json}")
  addr=${username}@${ip_addr}
  echo "copying ssh keys, ssh config, hostfile to worker-${node_id}"
  # $update_script is intentionally unquoted: word splitting flattens the
  # multi-line script into a single ';'-separated command line.
  # shellcheck disable=SC2086
  ssh "${ssh_args[@]}" "${addr}" $update_script
  # The trailing glob is deliberate: it matches the private key and its .pub.
  scp "${ssh_args[@]}" "${ssh_key}"* "${addr}:.ssh/"
  scp "${ssh_args[@]}" "${ssh_config}" "${addr}:.ssh/"
  scp "${ssh_args[@]}" "${hostfile}" "${addr}:/job/"
done
rm -- "${hostfile}" "${ssh_config}"
|
@ -1,37 +0,0 @@
|
||||
#!/bin/bash
# Deallocate (default) or delete the VMs of the DeepSpeed Azure deployment.
#
# Usage: shutdown_vms.sh [-d]
#   -d  delete the VMs instead of only deallocating them
#
# Reads num_vms and location from azure_config.json and acts on the VMs
# deepspeed_0 .. deepspeed_<num_vms-1> in resource group deepspeed_rg_<location>.

azure_config=azure_config.json
if [ ! -f "${azure_config}" ]; then
  echo "Cannot find $azure_config"
  exit 1
fi

delete=0
while getopts 'd' flag; do
  case "${flag}" in
    d) delete=1 ;;
    *)
      echo "Unexpected option ${flag}"
      exit 1
      ;;
  esac
done

# jq prints the literal word "null" for a missing key; the tests quote their
# operands so an empty value cannot turn them into a syntax error.
num_vms=$(jq .num_vms "${azure_config}")
if [ "$num_vms" == "null" ]; then echo 'missing num_vms in config'; exit 1; fi
location=$(jq -r .location "${azure_config}")
if [ "$location" == "null" ]; then echo 'missing location in config'; exit 1; fi

base_vm_name=deepspeed
resource_group=deepspeed_rg_$location

for ((i = 0; i < num_vms; i++)); do
  vm_name=${base_vm_name}_$i
  # --no-wait returns immediately, so the VMs are processed concurrently on
  # the Azure side rather than one after another.
  if [ "$delete" == 0 ]; then
    echo "deallocating $vm_name"
    az vm deallocate --resource-group "$resource_group" --name "$vm_name" --no-wait
  else
    echo "deleting $vm_name"
    az vm delete -y --resource-group "$resource_group" --name "$vm_name" --no-wait
  fi
done
|
@ -1,11 +0,0 @@
|
||||
#!/bin/bash
# Start a detached DeepSpeed container on this VM.
#
# Usage: start_container.sh [container-name]
#   container-name  name given to the container (default: deepspeed)
#
# The container uses the host network and all GPUs, mounts the user's workdir
# and .ssh directories plus /job/hostfile, starts the ssh service and then
# sleeps forever so the container stays up.

name=${1-deepspeed}
image=deepspeed/deepspeed:latest
echo "starting docker image named $name"

# Expansions are quoted so a name or home directory containing whitespace is
# passed to docker as a single argument.
docker run -d -t --name "$name" \
  --network host \
  -v "${HOME}/workdir:/home/deepspeed/workdir" \
  -v "${HOME}/.ssh:/home/deepspeed/.ssh" \
  -v /job/hostfile:/job/hostfile \
  --gpus all "$image" bash -c 'sudo service ssh start && sleep infinity'
|
@ -3,132 +3,18 @@ title: "Getting Started with DeepSpeed on Azure"
|
||||
tags: getting-started
|
||||
---
|
||||
|
||||
This tutorial will help you get started running DeepSpeed on [Azure virtual
|
||||
machines](https://azure.microsoft.com/en-us/services/virtual-machines/).
|
||||
Looking forward, we will be integrating these techniques and additional enhancements
|
||||
into the [Azure ML](https://azure.microsoft.com/en-us/services/machine-learning/) platform to
|
||||
benefit all your large model training jobs.
|
||||
This tutorial will help you get started with DeepSpeed on Azure.
|
||||
|
||||
If you don't already have an Azure account please see more details here: [https://azure.microsoft.com/](https://azure.microsoft.com/).
|
||||
|
||||
To use DeepSpeed on [Azure ML](https://azure.microsoft.com/en-us/services/machine-learning/), please take a look at easy-to-use examples for Transformers and CIFAR training from [AzureML Examples GitHub](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed).
|
||||
# DeepSpeed on Azure via AzureML
|
||||
|
||||
To help with launching Azure instances we suggest using the [Azure
|
||||
CLI](https://docs.microsoft.com/en-us/cli/azure/?view=azure-cli-latest). We have created
|
||||
several helper scripts to get you quickly started using DeepSpeed with Azure.
|
||||
* Install Azure CLI on your local box: [https://docs.microsoft.com/en-us/cli/azure/install-azure-cli](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli).
|
||||
* Alternatively, you can use the Azure in-browser shell: [https://shell.azure.com/](https://shell.azure.com/).
|
||||
The recommended and simplest method to try DeepSpeed on Azure is through [AzureML](https://azure.microsoft.com/en-us/services/machine-learning/). Please take a look at easy-to-use examples for Megatron-DeepSpeed, Transformers and CIFAR training [here](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed).
|
||||
|
||||
## Create an SSH key
|
||||
Generate an SSH key that will be used across this tutorial to SSH into your VMs and
|
||||
between Docker containers. `ssh-keygen` is the recommended way of doing this. Our scripts
|
||||
assume your key is located inside the same directory as the Azure scripts.
|
||||
> Our [Megatron-DeepSpeed](https://github.com/microsoft/megatron-deepspeed) repository contains the most up-to-date [recipe](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/azureml) for end-to-end training on AzureML.
|
||||
|
||||
## Azure Config JSON
|
||||
Our helper scripts depend on the following configuration JSON for deployment
|
||||
and setup. We have provided a simple example JSON in `azure_config.json` that
|
||||
sets up a basic environment with two VMs. This config uses the NV6_Promo
|
||||
instance type which has one NVIDIA Tesla M60 GPU per VM. You can read more
|
||||
details about the VM on the [Linux Virtual Machines
|
||||
Pricing](https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux/)
|
||||
page.
|
||||
# DeepSpeed on Azure VMs
|
||||
|
||||
See the example below:
|
||||
```json
|
||||
{
|
||||
"num_vms": 2,
|
||||
"location": "southcentralus",
|
||||
"azure_sku": "Standard_NV6_Promo",
|
||||
"ssh_private_key": "id_rsa",
|
||||
"docker_ssh_port": 2222
|
||||
}
|
||||
```
|
||||
If you don't have access to AzureML or if you want to build custom environments using [Azure virtual machines](https://azure.microsoft.com/en-us/services/virtual-machines/) or Azure VM Scale-Sets ([VMSS](https://docs.microsoft.com/en-us/azure/virtual-machine-scale-sets/overview)), we are working on easy-to-use cluster setup scripts that will be published in the next few weeks.
|
||||
|
||||
## Dependencies
|
||||
The scripts in this tutorial require [jq](https://stedolan.github.io/jq/) to help with
|
||||
parsing JSON from the command line. Also it is recommended to install
|
||||
[pdsh](https://linux.die.net/man/1/pdsh) to help launch ssh connections in parallel.
|
||||
|
||||
## Create Azure VMs
|
||||
We first need to allocate the VMs. We provide a script
|
||||
```bash
|
||||
./create_vms.sh
|
||||
```
|
||||
to create VMs with the Azure SKU in the region specified in `azure_config.json`. Feel
|
||||
free to customize your JSON to your desired region/SKU. This step will take a few minutes
|
||||
to complete while it sets up all of your VMs on Azure.
|
||||
|
||||
## Setup VM environment to use DeepSpeed
|
||||
Next, we need to configure the VM environment for DeepSpeed. We provide a script
|
||||
```bash
|
||||
./setup_vms.sh
|
||||
```
|
||||
to generate a [hostfile](/getting-started/#resource-configuration-multi-node) and SSH
|
||||
configuration on all of the VMs. This configuration will be used by the DeepSpeed
|
||||
Docker containers in the next step.
|
||||
|
||||
## Start the DeepSpeed docker container
|
||||
We now set up the DeepSpeed Docker containers on the VMs. We provide a script
|
||||
```bash
|
||||
./setup_docker.sh
|
||||
```
|
||||
to pull the DeepSpeed image onto all VMs and start a container instance in the
|
||||
background. This will take several minutes since it needs to pull the entire Docker
|
||||
image.
|
||||
|
||||
## Access VMs
|
||||
The tool `azure_ssh.sh` will let you SSH into any of the VMs with this
|
||||
syntax:
|
||||
```bash
|
||||
./azure_ssh.sh <node-id> [command]
|
||||
```
|
||||
where the `node-id` is a number between `0` and `num_vms-1`. This script will find the
|
||||
public IP address of your VM and use the SSH key provided in the Azure configuration
|
||||
JSON.
|
||||
|
||||
## Access DeepSpeed container
|
||||
Everything should be up and running at this point. Let's access the running DeepSpeed
|
||||
container on the first VM and make sure we can talk to the other containers in our deployment.
|
||||
|
||||
* SSH into the first VM via: `./azure_ssh.sh 0`
|
||||
* Change directories into the azure folder of this repo via: `cd ~/workdir/DeepSpeed/azure`
|
||||
* Attach the running docker container via: `./attach.sh`
|
||||
* You should now be able to `ssh` into any other docker container, the containers can be
|
||||
accessed via their SSH alias of `worker-N`, where `N` is the VM number between `0`
|
||||
and `num_vms-1`. In this example we should be able to successfully run `ssh worker-1
|
||||
hostname` which will return the hostname of worker-1.
|
||||
|
||||
## Parallel SSH across containers
|
||||
DeepSpeed comes installed with a helper script `ds_ssh` which is a wrapper around
|
||||
the [pdsh](https://linux.die.net/man/1/pdsh) command that lets you issue commands
|
||||
to groups of hosts (via SSH) in parallel. This wrapper simply connects with the
|
||||
hostfile that defines all the containers in your deployment. For example if you run
|
||||
`ds_ssh hostname` you should see a list of all the hostnames in your deployment.
|
||||
|
||||
## Run CIFAR-10 example model
|
||||
We will now run the DeepSpeed CIFAR-10 model example to test the VM setup. From inside
|
||||
the first DeepSpeed container:
|
||||
|
||||
1) Install the python dependencies necessary to run the CIFAR-10 example model. You can
|
||||
do this across your cluster via:
|
||||
```bash
|
||||
ds_ssh pip install -r ~/workdir/DeepSpeed/DeepSpeedExamples/cifar/requirements.txt
|
||||
```
|
||||
|
||||
2) Now change directories to the CIFAR example:
|
||||
```bash
|
||||
cd ~/workdir/DeepSpeed/DeepSpeedExamples/cifar
|
||||
```
|
||||
|
||||
3) Finally, launch training across all VMs:
|
||||
```bash
|
||||
deepspeed cifar10_deepspeed.py --deepspeed --deepspeed_config ds_config.json
|
||||
```
|
||||
|
||||
## Megatron-LM GPT2
|
||||
DeepSpeed includes an example model using Megatron-LM's GPT2. Please refer to the full
|
||||
[Megatron tutorial](/tutorials/megatron/) for more details.
|
||||
* In order to fully train GPT2 with DeepSpeed and ZeRO we recommend using 8 instances of
|
||||
Azure's Standard_ND40rs_v2 SKU for a total of 64 NVIDIA V100 GPUs. With this setup and
|
||||
a batch size of 1536 you should be able to complete 100k training steps (153.6 million
|
||||
samples) in less than 2 weeks of training.
|
||||
If you already have a cluster set up, you can use the [azure recipes](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples/azure) that can easily be modified to train various model configurations.
|
||||
|
Reference in New Issue
Block a user