#!/bin/bash
#SBATCH --job-name=open-r1-sft
#SBATCH --ntasks-per-node=1
#SBATCH --exclusive
#SBATCH --gres=gpu:8
#SBATCH --partition=hopper-prod # Adjust this for your cluster
#SBATCH --output=./logs/%x-%j.out
#SBATCH --err=./logs/%x-%j.err
#SBATCH --requeue

# Specific configuration optimized for the Hugging Face Compute Cluster
# Be ye warned this may not work on other clusters!
module load cuda/12.4

set -x -e

source ~/.bashrc
source openr1/bin/activate
echo "START TIME: $(date)"

MODEL=$1
TASK=$2
CONFIG_SUFFIX=$3
ACCELERATOR=$4
OPTIONAL_ARGS=$5
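
# Illustrative invocation (example values only; adjust to the recipes/ layout in your checkout):
#   sbatch --nodes=2 <this-script>.slurm Qwen2.5-1.5B-Instruct sft demo zero3 '--learning_rate=1.0e-5'
# which would resolve the training config to recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml
# and the Accelerate config to recipes/accelerate_configs/zero3.yaml (see CONFIG_FILE and --config_file below).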

# Training setup
NUM_NODES=$SLURM_NNODES
GPUS_PER_NODE=8
WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE))
# Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match
CONFIG_FILE=recipes/$MODEL/$TASK/config_$CONFIG_SUFFIX.yaml
GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}')
USE_VLLM=$(grep 'use_vllm:\s*true' $CONFIG_FILE) # Match "use_vllm: true" (with optional whitespace)
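# For example (illustrative recipe contents): a config line "gradient_accumulation_steps: 8"
# yields GRAD_ACC_STEPS=8, and a "use_vllm: true" line makes USE_VLLM non-empty.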

if [ -n "$USE_VLLM" ]; then # Check if USE_VLLM is *not* empty (found)
    WORLD_SIZE=$(($WORLD_SIZE-1))
fi
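# Illustrative arithmetic: with 2 nodes x 8 GPUs, WORLD_SIZE starts at 16; if the recipe sets
# use_vllm: true, one process is dropped (WORLD_SIZE=15), the usual intent being to keep a GPU
# free for the vLLM generation engine.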

# Split the string into individual arguments
IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS"

# Loop through the arguments and find the one with "--gradient_accumulation_steps"
for arg in "${ARGS[@]}"; do
    if [[ "$arg" == "--gradient_accumulation_steps="* ]]; then
        # Extract the value after the equals sign
        GRAD_ACC_STEPS="${arg#*=}"
        break # Exit the loop once we find the desired argument
    fi
done
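# Example override (illustrative): passing '--gradient_accumulation_steps=16' in OPTIONAL_ARGS
# replaces the value read from the recipe config, so the launcher flag below matches the CLI.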

echo "Gradient accumulation steps: $GRAD_ACC_STEPS"
# so processes know who to talk to
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
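# e.g. for a job allocated on "node[01-02]" (illustrative nodelist), scontrol expands the list
# and MASTER_ADDR becomes "node01", the first allocated host.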

export CMD=" \
    src/open_r1/$TASK.py --config $CONFIG_FILE $OPTIONAL_ARGS
    "

export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \
    --config_file recipes/accelerate_configs/$ACCELERATOR.yaml \
    --gradient_accumulation_steps $GRAD_ACC_STEPS \
    --num_machines $NUM_NODES \
    --num_processes $WORLD_SIZE \
    --main_process_ip $MASTER_ADDR \
    --main_process_port $MASTER_PORT \
    --machine_rank \$SLURM_PROCID \
    --rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT" \
    --max_restarts 1 \
    --role \$(hostname -s): \
    --tee 3 \
    "

# force crashing on nccl issues like hanging broadcast
export NCCL_ASYNC_ERROR_HANDLING=1
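# Note: recent PyTorch releases read TORCH_NCCL_ASYNC_ERROR_HANDLING instead; exporting both
# should be harmless if you are unsure which version the cluster image ships.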
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=COLL
# export NCCL_SOCKET_NTHREADS=1
# export NCCL_NSOCKS_PERTHREAD=1
# export CUDA_LAUNCH_BLOCKING=1

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
    --wait=60 \
    --kill-on-bad-exit=1 \
    "

clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD" 2>&1

echo "END TIME: $(date)"