[V0 Deprecation] Remove VLLM_USE_V1 from docs and scripts (#26336)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Authored by Cyrus Leung on 2025-10-07 16:46:44 +08:00, committed by GitHub
parent 46b0779996
commit 7e4cd070b0
11 changed files with 17 additions and 26 deletions

View File

@@ -454,11 +454,6 @@ main() {
fi
check_hf_token
-# Set to v1 to run v1 benchmark
-if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
-export VLLM_USE_V1=1
-fi
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
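
With the V0 path gone, the benchmark wrapper no longer needs an engine switch. Below is a minimal sketch of a guard such a script could use instead, assuming a caller might still export the removed variable; the variable name comes from this diff, but the guard itself is illustrative and not part of the commit.

```bash
# Illustrative guard, not part of this commit: fail fast if a caller still
# tries to opt out of the V1 engine via the removed environment variable.
if [[ "${VLLM_USE_V1:-}" == "0" ]]; then
  echo "VLLM_USE_V1=0 is no longer supported; the V0 engine has been removed." >&2
  exit 1
fi
```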

View File

@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info
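
For reference, a sketch of the environment exports that remain relevant in this TPU CI script after the change; the surrounding script is assumed unchanged and the comments are editorial, not part of the script.

```bash
# Sketch of the surviving exports (comments are editorial, not in the script).
export VLLM_XLA_CHECK_RECOMPILATION=1  # surface unexpected XLA recompilation during the run
export VLLM_XLA_CACHE_PATH=            # override the default XLA compilation cache path with an empty value
```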

View File

@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info

View File

@@ -42,7 +42,7 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
-VLLM_USE_V1=1 vllm serve $MODEL \
+vllm serve $MODEL \
--seed 42 \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
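
A concrete invocation of the updated command follows, with the script's variables filled in purely for illustration; the model name and values are placeholders, and any flags truncated from this hunk are omitted.

```bash
# Placeholder values; the real script supplies these through its own variables.
MODEL=meta-llama/Llama-3.2-1B
MAX_NUM_SEQS=256
MAX_NUM_BATCHED_TOKENS=8192

vllm serve "$MODEL" \
  --seed 42 \
  --max-num-seqs "$MAX_NUM_SEQS" \
  --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS"
```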

View File

@@ -96,11 +96,11 @@ start_server() {
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
-VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
-VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
+VLLM_SERVER_DEV_MODE=1 \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
fi
local server_pid=$!
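
The comment in this hunk points at the bash array pattern; here is a small self-contained sketch of the same idea with placeholder arguments, since quoting `"${array[@]}"` is what keeps each element, spaces and all, as a single argument.

```bash
# Sketch of the array-expansion pattern used by start_server(): every array element
# is passed to vllm serve as exactly one argument.
common_args_array=(
  "meta-llama/Llama-3.2-1B"      # placeholder model
  --max-num-seqs 64
  --gpu-memory-utilization 0.9
)
VLLM_SERVER_DEV_MODE=1 vllm serve "${common_args_array[@]}" > vllm.log 2>&1 &
server_pid=$!
echo "vLLM server started with PID ${server_pid}"
```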

View File

@@ -97,7 +97,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
??? console "Command"
```shell
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
+CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20001 \
--tensor-parallel-size 1 \
@@ -118,7 +118,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
??? console "Command"
```shell
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
+CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20002 \
--tensor-parallel-size 1 \
@@ -139,7 +139,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
??? console "Command"
```shell
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
+CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20003 \
--tensor-parallel-size 1 \
@@ -160,7 +160,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
??? console "Command"
```shell
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
+CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20004 \
--tensor-parallel-size 1 \
@@ -190,7 +190,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
??? console "Command"
```shell
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
+CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20001 \
--tensor-parallel-size 1 \
@@ -211,7 +211,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
??? console "Command"
```shell
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
+CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20002 \
--tensor-parallel-size 1 \
@@ -232,7 +232,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
??? console "Command"
```shell
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
+CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20003 \
--tensor-parallel-size 1 \
@@ -253,7 +253,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
??? console "Command"
```shell
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
+CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20004 \
--tensor-parallel-size 1 \
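
Once the instances above are up, a quick check against vLLM's `/health` endpoint confirms they are listening before traffic is routed through the proxy; a sketch using the ports from the commands above.

```bash
# Sketch: poll each instance's /health endpoint (ports taken from the commands above).
for port in 20001 20002 20003 20004; do
  if curl -sf "http://localhost:${port}/health" > /dev/null; then
    echo "instance on port ${port} is healthy"
  else
    echo "instance on port ${port} is not ready yet"
  fi
done
```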

View File

@@ -2,7 +2,7 @@
In vLLM's V1 architecture, `torch.compile` is enabled by default and is a critical part of the framework. This document gives a simple walk-through example to show how to understand the `torch.compile` usage.
-Throughout the example, we will run a common Llama model using v1, and turn on debug level logging to show all the details. The command to be used is `VLLM_USE_V1=1 VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B`.
+Throughout the example, we will run a common Llama model, and turn on debug level logging to show all the details. The command to be used is `VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B`.
## Compilation Cache
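
A sketch of running the walkthrough command while keeping the debug output for later inspection; piping through `tee` is just one convenient way to capture the logs and is not part of the documented command.

```bash
# Run the documented command and save the debug output to a file for inspection.
VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B 2>&1 | tee vllm_compile_debug.log
```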

View File

@@ -166,7 +166,7 @@ main() {
local kv_port=$((21001 + i))
echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
-CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
+CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
--enforce-eager \
--host 0.0.0.0 \
--port $port \
@@ -194,7 +194,7 @@ main() {
local kv_port=$((22001 + i))
echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
+CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
--enforce-eager \
--host 0.0.0.0 \
--port $port \
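
For orientation, a sketch of the KV-port arithmetic visible in these two hunks: a 0-based instance index maps to 21001+i for prefill servers and 22001+i for decode servers (the HTTP port base is not shown in the hunks, so it is left out here).

```bash
# Sketch of the KV-port numbering used above, for a two-instance example.
for i in 0 1; do
  echo "prefill server $((i+1)): KV port $((21001 + i))"
  echo "decode  server $((i+1)): KV port $((22001 + i))"
done
```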

View File

@@ -55,7 +55,6 @@ done
echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"
export RAY_DEDUP_LOGS=0
-export VLLM_USE_V1=1
export VLLM_ALL2ALL_BACKEND="pplx"
export VLLM_USE_DEEP_GEMM=1
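
After the removal, the script's engine-related environment setup reduces to the expert-parallel settings shown in the context lines; restated here as a sketch, with editorial comments that are not part of the script.

```bash
# Surviving exports from the context above (comments are editorial).
export RAY_DEDUP_LOGS=0              # keep full Ray logs instead of de-duplicating repeats
export VLLM_ALL2ALL_BACKEND="pplx"   # select the pplx all-to-all backend for expert parallelism
export VLLM_USE_DEEP_GEMM=1          # enable DeepGEMM kernels
```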

View File

@@ -5,7 +5,7 @@ To run this example, you can start the vLLM server
without any specific flags:
```bash
-VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
+vllm serve unsloth/Llama-3.2-1B-Instruct \
--structured-outputs-config.backend outlines
```
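
Once the server is running, a plain request is enough to confirm it is reachable before running the structured-outputs example itself; a sketch assuming the default OpenAI-compatible endpoint on port 8000.

```bash
# Sanity check (sketch): send a minimal chat completion to the server started above.
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "unsloth/Llama-3.2-1B-Instruct",
        "messages": [{"role": "user", "content": "Reply with a single word."}]
      }'
```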

View File

@@ -36,7 +36,6 @@ llm_config = LLMConfig(
},
# Set to the node's accelerator type.
accelerator_type="H100",
-runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
# Customize engine arguments as required (for example, vLLM engine kwargs).
engine_kwargs={
"tensor_parallel_size": 8,