mirror of
https://github.com/huggingface/accelerate.git
synced 2025-10-20 18:13:46 +08:00
43 lines
1.4 KiB
Bash
43 lines
1.4 KiB
Bash
#!/bin/bash
#
# SLURM batch script: launches examples/complete_nlp_example.py on several
# nodes through `accelerate launch`, using examples/slurm/fsdp_config.yaml.
#
# Inputs:
#   ACCELERATE_DIR        - root of the accelerate checkout (default: /accelerate)
#   activateEnvironment.sh - sourced from the working directory to set up the
#                            runtime environment (contents not shown here)
#   SLURM_JOB_NODELIST, SLURM_NNODES - read from the job environment
#
# Keep the #SBATCH directives ahead of the first executable command.

#SBATCH --job-name=multinode
#SBATCH -D .
#SBATCH --output=O-%x.%j
#SBATCH --error=E-%x.%j
#SBATCH --nodes=4                   # number of nodes
#SBATCH --ntasks-per-node=1         # number of MP tasks
#SBATCH --gres=gpu:4                # number of GPUs per node
#SBATCH --cpus-per-task=160         # number of cores per task
#SBATCH --time=01:59:00             # maximum execution time (HH:MM:SS)

######################
### Set environment ###
######################
source activateEnvironment.sh
# Must match the GPU count requested with --gres above; used below to compute
# the total number of processes.
export GPUS_PER_NODE=4
######################

######################
#### Set network #####
######################
# The first hostname of the allocation serves as the main process address.
# The nodelist is quoted so a compressed list such as "node[01-04]" is passed
# to scontrol verbatim instead of being treated as a glob pattern.
head_node_ip=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
if [[ -z "$head_node_ip" ]]; then
    # Without a value, --main_process_ip would swallow the following option.
    echo "error: could not determine the head node from SLURM_JOB_NODELIST" >&2
    exit 1
fi
######################
export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"

# Backslash-newline inside double quotes is a line continuation, so each of
# the strings below ends up as a single line of space-separated arguments.
export LAUNCHER="accelerate launch \
    --config_file ${ACCELERATE_DIR}/examples/slurm/fsdp_config.yaml \
    --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \
    --num_machines $SLURM_NNODES \
    --rdzv_backend c10d \
    --main_process_ip $head_node_ip \
    --main_process_port 29500 \
    "
export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
export SCRIPT_ARGS=" \
    --mixed_precision fp16 \
    --output_dir ${ACCELERATE_DIR}/examples/output \
    "

# This step is necessary because accelerate launch does not handle multiline arguments properly
export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS"
# Word splitting of CMD is intentional: it holds the full command line.
# shellcheck disable=SC2086
srun $CMD