Mirror of https://github.com/vllm-project/vllm.git, synced 2025-10-20 14:53:52 +08:00
[V0 Deprecation] Remove VLLM_USE_V1 from docs and scripts (#26336)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
@@ -454,11 +454,6 @@ main() {
 fi
 check_hf_token
 
-# Set to v1 to run v1 benchmark
-if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
-    export VLLM_USE_V1=1
-fi
-
 # dependencies
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 (which jq) || (apt-get update && apt-get -y install jq)
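For context, the deleted gate relied on bash default expansion: `${ENGINE_VERSION:-v0}` falls back to `v0` when the variable is unset, so V1 had to be opted into explicitly. A minimal stand-alone sketch of that pattern, using only the names visible in the hunk above:

```bash
#!/usr/bin/env bash
# ${ENGINE_VERSION:-v0} expands to "v0" when ENGINE_VERSION is unset or empty.
# The removed block only exported VLLM_USE_V1 for the opt-in v1 case; with V0
# gone, engine selection no longer needs an environment flag at all.
if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
    echo "would previously have exported VLLM_USE_V1=1"
fi
```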
@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
 && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
 && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
 
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
 echo "Using VLLM V1"
 
 echo "--- Hardware Information ---"
 # tpu-info
@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
 && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
 && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
 
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
 echo "Using VLLM V1"
 
 echo "--- Hardware Information ---"
 # tpu-info
@@ -42,7 +42,7 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo
 
-VLLM_USE_V1=1 vllm serve $MODEL \
+vllm serve $MODEL \
     --seed 42 \
     --max-num-seqs $MAX_NUM_SEQS \
     --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
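Once the server launched by this script is up, it can be exercised with a plain OpenAI-compatible completion request. A minimal sketch; the host and port 8000 are assumptions, since the script's bind address is not shown in this hunk:

```bash
# Assumes the default vLLM OpenAI-compatible endpoint on localhost:8000.
curl -s http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "'"$MODEL"'",
        "prompt": "Hello, my name is",
        "max_tokens": 16
      }'
```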
@@ -96,11 +96,11 @@ start_server() {
 # This correctly passes each element as a separate argument.
 if [[ -n "$profile_dir" ]]; then
     # Start server with profiling enabled
-    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+    VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
         vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
 else
     # Start server without profiling
-    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
+    VLLM_SERVER_DEV_MODE=1 \
         vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
 fi
 local server_pid=$!
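With `VLLM_TORCH_PROFILER_DIR` set, a run can be bracketed with the profiler start/stop endpoints. A minimal sketch, assuming the server listens on localhost:8000 and that the `/start_profile` and `/stop_profile` routes are exposed when a profiler directory is configured; the model name is a placeholder for whatever is passed in `common_args_array`:

```bash
# Capture a torch profiler trace around a single request.
curl -s -X POST http://localhost:8000/start_profile
curl -s http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "your-model-name", "prompt": "ping", "max_tokens": 8}'
curl -s -X POST http://localhost:8000/stop_profile
# Traces are written under $profile_dir and can be opened with the Perfetto UI.
```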
@@ -97,7 +97,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20001 \
         --tensor-parallel-size 1 \
@@ -118,7 +118,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20002 \
         --tensor-parallel-size 1 \
@@ -139,7 +139,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20003 \
         --tensor-parallel-size 1 \
@@ -160,7 +160,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20004 \
         --tensor-parallel-size 1 \
@@ -190,7 +190,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20001 \
         --tensor-parallel-size 1 \
@@ -211,7 +211,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20002 \
         --tensor-parallel-size 1 \
@@ -232,7 +232,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20003 \
         --tensor-parallel-size 1 \
@@ -253,7 +253,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20004 \
         --tensor-parallel-size 1 \
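With the prefill and decode instances above registered, client traffic goes through the proxy started by `disagg_proxy_p2p_nccl_xpyd.py`. A minimal sketch of such a request; the proxy port (10001) is a placeholder, since the proxy's listen address is not shown in these hunks:

```shell
# Replace 10001 with the port disagg_proxy_p2p_nccl_xpyd.py actually binds.
curl -s http://localhost:10001/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "{your model directory}",
        "prompt": "San Francisco is a",
        "max_tokens": 32
      }'
```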
@@ -2,7 +2,7 @@
 
 In vLLM's V1 architecture, `torch.compile` is enabled by default and is a critical part of the framework. This document gives a simple walk-through example to show how to understand the `torch.compile` usage.
 
-Throughout the example, we will run a common Llama model using v1, and turn on debug level logging to show all the details. The command to be used is `VLLM_USE_V1=1 VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B`.
+Throughout the example, we will run a common Llama model, and turn on debug level logging to show all the details. The command to be used is `VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B`.
 
 ## Compilation Cache
 
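To follow that walk-through, it helps to keep the debug log in a file and pull out the compilation-related lines afterwards. A minimal sketch of that workflow, built only on the command quoted above; the grep patterns are guesses at typical log wording, not exact vLLM messages:

```bash
# Same command as in the document, with the log captured for later inspection.
VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B 2>&1 | tee vllm_debug.log &

# After startup, skim the compilation-related entries.
grep -i -E "compil|dynamo|inductor" vllm_debug.log | head -n 40
```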
@@ -166,7 +166,7 @@ main() {
 local kv_port=$((21001 + i))
 
 echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
-CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
+CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
     --enforce-eager \
     --host 0.0.0.0 \
     --port $port \
@@ -194,7 +194,7 @@ main() {
 local kv_port=$((22001 + i))
 
 echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
+CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
     --enforce-eager \
     --host 0.0.0.0 \
     --port $port \
@@ -55,7 +55,6 @@ done
 echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"
 
 export RAY_DEDUP_LOGS=0
-export VLLM_USE_V1=1
 export VLLM_ALL2ALL_BACKEND="pplx"
 export VLLM_USE_DEEP_GEMM=1
 
@@ -5,7 +5,7 @@ To run this example, you can start the vLLM server
 without any specific flags:
 
 ```bash
-VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
+vllm serve unsloth/Llama-3.2-1B-Instruct \
     --structured-outputs-config.backend outlines
 ```
 
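Once that server is running, a structured-output request can be sent through the OpenAI-compatible API. A minimal sketch, assuming the default port 8000 and vLLM's `guided_json` request extension; the JSON schema is purely illustrative:

```bash
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "unsloth/Llama-3.2-1B-Instruct",
        "messages": [{"role": "user", "content": "Describe a person as JSON."}],
        "guided_json": {
          "type": "object",
          "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
          "required": ["name", "age"]
        }
      }'
```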
@@ -36,7 +36,6 @@ llm_config = LLMConfig(
     },
     # Set to the node's accelerator type.
     accelerator_type="H100",
-    runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
     # Customize engine arguments as required (for example, vLLM engine kwargs).
     engine_kwargs={
         "tensor_parallel_size": 8,