mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 14:53:52 +08:00
Improve flexibility of auto_tune.sh execution. (#23766)
Signed-off-by: Anthony Su <50185138+anthonsu@users.noreply.github.com> Signed-off-by: anthonsu <50185138+anthonsu@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@ -31,6 +31,12 @@ cd vllm
|
||||
|
||||
You must set the following variables at the top of the script before execution.
|
||||
|
||||
Note: You can also override the default values below via environment variables when running the script.
|
||||
|
||||
```bash
|
||||
MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
|
||||
```
|
||||
|
||||
| Variable | Description | Example Value |
|
||||
| --- | --- | --- |
|
||||
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
|
||||
|
@ -5,25 +5,41 @@
|
||||
|
||||
TAG=$(date +"%Y_%m_%d_%H_%M")
|
||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
BASE="$SCRIPT_DIR/../../.."
|
||||
MODEL="meta-llama/Llama-3.1-8B-Instruct"
|
||||
SYSTEM="TPU"
|
||||
TP=1
|
||||
DOWNLOAD_DIR=""
|
||||
INPUT_LEN=4000
|
||||
OUTPUT_LEN=16
|
||||
MAX_MODEL_LEN=4096
|
||||
MIN_CACHE_HIT_PCT=0
|
||||
MAX_LATENCY_ALLOWED_MS=100000000000
|
||||
NUM_SEQS_LIST="128 256"
|
||||
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
|
||||
VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
|
||||
BASE=${BASE:-"$SCRIPT_DIR/../../.."}
|
||||
MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
|
||||
SYSTEM=${SYSTEM:-"TPU"}
|
||||
TP=${TP:-1}
|
||||
DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
|
||||
INPUT_LEN=${INPUT_LEN:-4000}
|
||||
OUTPUT_LEN=${OUTPUT_LEN:-16}
|
||||
MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
|
||||
MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
|
||||
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
|
||||
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
|
||||
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
|
||||
|
||||
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
|
||||
RESULT="$LOG_FOLDER/result.txt"
|
||||
PROFILE_PATH="$LOG_FOLDER/profile"
|
||||
|
||||
echo "result file: $RESULT"
|
||||
echo "model: $MODEL"
|
||||
echo "====================== AUTO TUNE PARAMETERS ===================="
|
||||
echo "SCRIPT_DIR=$SCRIPT_DIR"
|
||||
echo "BASE=$BASE"
|
||||
echo "MODEL=$MODEL"
|
||||
echo "SYSTEM=$SYSTEM"
|
||||
echo "TP=$TP"
|
||||
echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
|
||||
echo "INPUT_LEN=$INPUT_LEN"
|
||||
echo "OUTPUT_LEN=$OUTPUT_LEN"
|
||||
echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
|
||||
echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
|
||||
echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
|
||||
echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
|
||||
echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
|
||||
echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
|
||||
echo "RESULT_FILE=$RESULT"
|
||||
echo "====================== AUTO TUNEPARAMETERS ===================="
|
||||
|
||||
rm -rf $LOG_FOLDER
|
||||
rm -rf $PROFILE_PATH
|
||||
|
Reference in New Issue
Block a user