diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index 34effbb607..c64e563802 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -454,11 +454,6 @@ main() {
   fi
   check_hf_token
 
-  # Set to v1 to run v1 benchmark
-  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
-    export VLLM_USE_V1=1
-  fi
-
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
index e76528a178..cbb2527a4f 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
+
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"
 
 echo "--- Hardware Information ---"
 # tpu-info
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index 69366cd503..f022fa3672 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
+
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"
 
 echo "--- Hardware Information ---"
 # tpu-info
diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh
index b1e17b4385..3364fce8e1 100755
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@@ -42,7 +42,7 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo
 
-VLLM_USE_V1=1 vllm serve $MODEL \
+vllm serve $MODEL \
 --seed 42 \
 --max-num-seqs $MAX_NUM_SEQS \
 --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index 3753279467..56b721cbb4 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -96,11 +96,11 @@ start_server() {
     # This correctly passes each element as a separate argument.
     if [[ -n "$profile_dir" ]]; then
         # Start server with profiling enabled
-        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
             vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
     else
         # Start server without profiling
-        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
+        VLLM_SERVER_DEV_MODE=1 \
             vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
     fi
     local server_pid=$!
diff --git a/docs/design/p2p_nccl_connector.md b/docs/design/p2p_nccl_connector.md
index adf838306b..4674bef8d2 100644
--- a/docs/design/p2p_nccl_connector.md
+++ b/docs/design/p2p_nccl_connector.md
@@ -97,7 +97,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20001 \
         --tensor-parallel-size 1 \
@@ -118,7 +118,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
     ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20002 \
         --tensor-parallel-size 1 \
@@ -139,7 +139,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
    ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20003 \
         --tensor-parallel-size 1 \
@@ -160,7 +160,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
    ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20004 \
         --tensor-parallel-size 1 \
@@ -190,7 +190,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
    ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20001 \
         --tensor-parallel-size 1 \
@@ -211,7 +211,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
    ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20002 \
         --tensor-parallel-size 1 \
@@ -232,7 +232,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
    ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20003 \
         --tensor-parallel-size 1 \
@@ -253,7 +253,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
 ??? console "Command"
 
    ```shell
-    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
+    CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
         --host 0.0.0.0 \
         --port 20004 \
         --tensor-parallel-size 1 \
diff --git a/docs/design/torch_compile.md b/docs/design/torch_compile.md
index 127768b74d..831df3c93a 100644
--- a/docs/design/torch_compile.md
+++ b/docs/design/torch_compile.md
@@ -2,7 +2,7 @@
 
 In vLLM's V1 architecture, `torch.compile` is enabled by default and is a critical part of the framework. This document gives a simple walk-through example to show how to understand the `torch.compile` usage.
 
-Throughout the example, we will run a common Llama model using v1, and turn on debug level logging to show all the details. The command to be used is `VLLM_USE_V1=1 VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B`.
+Throughout the example, we will run a common Llama model, and turn on debug level logging to show all the details. The command to be used is `VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B`.
 
 ## Compilation Cache
 
diff --git a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
index 7b0b12bb34..1e7acccb4f 100644
--- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
+++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
@@ -166,7 +166,7 @@ main() {
         local kv_port=$((21001 + i))
 
         echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
-        CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
+        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
            --enforce-eager \
            --host 0.0.0.0 \
            --port $port \
@@ -194,7 +194,7 @@ main() {
         local kv_port=$((22001 + i))
 
         echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
-        VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
+        CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
            --enforce-eager \
            --host 0.0.0.0 \
            --port $port \
diff --git a/examples/online_serving/elastic_ep/serve_deepseek_v2.sh b/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
index 1234ebba4d..6845545b6f 100644
--- a/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
+++ b/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
@@ -55,7 +55,6 @@ done
 
 echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"
 
 export RAY_DEDUP_LOGS=0
-export VLLM_USE_V1=1
 export VLLM_ALL2ALL_BACKEND="pplx"
 export VLLM_USE_DEEP_GEMM=1
diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_required.py b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
index 6ff65b18f6..c00d712b35 100644
--- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
@@ -5,7 +5,7 @@
 To run this example, you can start the vLLM server
 without any specific flags:
 
 ```bash
-VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
+vllm serve unsloth/Llama-3.2-1B-Instruct \
     --structured-outputs-config.backend outlines
 ```
diff --git a/examples/online_serving/ray_serve_deepseek.py b/examples/online_serving/ray_serve_deepseek.py
index d24b553df2..af53443b91 100644
--- a/examples/online_serving/ray_serve_deepseek.py
+++ b/examples/online_serving/ray_serve_deepseek.py
@@ -36,7 +36,6 @@ llm_config = LLMConfig(
     },
     # Set to the node's accelerator type.
     accelerator_type="H100",
-    runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
     # Customize engine arguments as required (for example, vLLM engine kwargs).
     engine_kwargs={
         "tensor_parallel_size": 8,