From 067c34a1559400e956311f067ddd185f54207a2b Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 2 Aug 2025 00:19:48 -0700 Subject: [PATCH] docs: remove deprecated disable-log-requests flag (#22113) Signed-off-by: Roger Wang --- .buildkite/scripts/tpu/run_bm.sh | 1 - benchmarks/README.md | 10 +++++----- benchmarks/auto_tune/auto_tune.sh | 1 - benchmarks/benchmark_serving.py | 3 +-- benchmarks/benchmark_serving_structured_output.py | 2 +- docs/design/p2p_nccl_connector.md | 8 -------- docs/models/supported_models.md | 2 +- .../disagg_example_p2p_nccl_xpyd.sh | 2 -- examples/online_serving/prometheus_grafana/README.md | 3 +-- .../disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh | 2 -- tests/entrypoints/openai/correctness/test_lmeval.py | 2 +- tests/entrypoints/openai/test_chunked_prompt.py | 2 -- tests/models/quantization/test_bitsandbytes.py | 1 - .../kv_connector/nixl_integration/run_accuracy_test.sh | 2 -- .../nixl_integration/run_edge_case_test.sh | 2 -- .../nixl_integration/run_tpu_disagg_accuracy_test.sh | 3 --- .../nixl_integration/run_tpu_edge_case_test.sh | 2 -- tests/v1/sample/test_logprobs_e2e.py | 2 +- vllm/utils/__init__.py | 5 +++-- 19 files changed, 14 insertions(+), 41 deletions(-) diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh index beecaf7a74..b1e17b4385 100755 --- a/.buildkite/scripts/tpu/run_bm.sh +++ b/.buildkite/scripts/tpu/run_bm.sh @@ -44,7 +44,6 @@ echo VLLM_USE_V1=1 vllm serve $MODEL \ --seed 42 \ - --disable-log-requests \ --max-num-seqs $MAX_NUM_SEQS \ --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ --tensor-parallel-size $TENSOR_PARALLEL_SIZE \ diff --git a/benchmarks/README.md b/benchmarks/README.md index 644517235b..d6442a4fc3 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -91,7 +91,7 @@ become available. 
First start serving your model ```bash -vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests +vllm serve NousResearch/Hermes-3-Llama-3.1-8B ``` Then run the benchmarking script @@ -146,7 +146,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you ```bash # start server -VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests +VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct ``` ```bash @@ -171,7 +171,7 @@ You can skip applying chat template if your data already has it by using `--cust ```bash # need a model with vision capability here -vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests +vllm serve Qwen/Qwen2-VL-7B-Instruct ``` ```bash @@ -205,7 +205,7 @@ vllm bench serve \ ### Other HuggingFaceDataset Examples ```bash -vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests +vllm serve Qwen/Qwen2-VL-7B-Instruct ``` `lmms-lab/LLaVA-OneVision-Data`: @@ -430,7 +430,7 @@ Benchmark the performance of structured output generation (JSON, grammar, regex) ### Server Setup ```bash -vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests +vllm serve NousResearch/Hermes-3-Llama-3.1-8B ``` ### JSON Schema Benchmark diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index 3cd8580e06..df26376504 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -60,7 +60,6 @@ start_server() { pkill -f vllm VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \ - --disable-log-requests \ --port 8004 \ --gpu-memory-utilization $gpu_memory_utilization \ --max-num-seqs $max_num_seqs \ diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 3affa18ae3..93b72211eb 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -5,8 +5,7 @@ r"""Benchmark online serving throughput. On the server side, run one of the following commands: vLLM OpenAI API server vllm serve \ - --swap-space 16 \ - --disable-log-requests + --swap-space 16 On the client side, run: python benchmarks/benchmark_serving.py \ diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 2a22f122c7..ca6843a72a 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -4,7 +4,7 @@ r"""Benchmark online serving throughput with structured outputs. 
On the server side, run one of the following commands: (vLLM OpenAI API server) - vllm serve --disable-log-requests + vllm serve On the client side, run: python benchmarks/benchmark_serving_structured_output.py \ diff --git a/docs/design/p2p_nccl_connector.md b/docs/design/p2p_nccl_connector.md index 94af8bedd2..adf838306b 100644 --- a/docs/design/p2p_nccl_connector.md +++ b/docs/design/p2p_nccl_connector.md @@ -109,7 +109,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20001"}}' > /var/vllm.log 2>&1 & ``` @@ -131,7 +130,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20002"}}' > /var/vllm.log 2>&1 & ``` @@ -153,7 +151,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003"}}' > /var/vllm.log 2>&1 & ``` @@ -175,7 +172,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20004"}}' > /var/vllm.log 2>&1 & ``` @@ -206,7 +202,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20001"}}' > /var/vllm.log 2>&1 & ``` @@ -228,7 +223,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20002"}}' > /var/vllm.log 2>&1 & ``` @@ -250,7 +244,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003"}}' > /var/vllm.log 2>&1 & ``` @@ -272,7 +265,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ - --disable-log-request \ --kv-transfer-config \ 
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20004"}}' > /var/vllm.log 2>&1 & ``` diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 56c77a1e5f..bd7a57b436 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -255,7 +255,7 @@ export https_proxy=http://your.proxy.server:port https_proxy=http://your.proxy.server:port huggingface-cli download # or use vllm cmd directly -https_proxy=http://your.proxy.server:port vllm serve --disable-log-requests +https_proxy=http://your.proxy.server:port vllm serve ``` - Set the proxy in Python interpreter: diff --git a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh index 568f7a43b4..7b0b12bb34 100644 --- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh +++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh @@ -178,7 +178,6 @@ main() { --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_size\":\"1e1\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > prefill$((i+1)).log 2>&1 & PIDS+=($!) @@ -207,7 +206,6 @@ main() { --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ - --disable-log-request \ --kv-transfer-config \ "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_size\":\"8e9\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > decode$((i+1)).log 2>&1 & PIDS+=($!) diff --git a/examples/online_serving/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md index 7c4e649e6d..5cd4dab5a8 100644 --- a/examples/online_serving/prometheus_grafana/README.md +++ b/examples/online_serving/prometheus_grafana/README.md @@ -13,8 +13,7 @@ Prometheus metric logging is enabled by default in the OpenAI-compatible server. 
```bash vllm serve mistralai/Mistral-7B-v0.1 \ - --max-model-len 2048 \ - --disable-log-requests + --max-model-len 2048 ``` Launch Prometheus and Grafana servers with `docker compose`: diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh index 5719fa8212..1284466a45 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh +++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh @@ -28,7 +28,6 @@ if [[ $1 == "prefiller" ]]; then CUDA_VISIBLE_DEVICES=0 \ vllm serve $MODEL \ --port 8100 \ - --disable-log-requests \ --enforce-eager \ --kv-transfer-config \ '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_producer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "producer1"}}' @@ -46,7 +45,6 @@ elif [[ $1 == "decoder" ]]; then CUDA_VISIBLE_DEVICES=1 \ vllm serve $MODEL \ --port 8200 \ - --disable-log-requests \ --enforce-eager \ --kv-transfer-config \ '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_consumer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "consumer1"}}' diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index a07a147cdc..d75731637d 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -22,7 +22,7 @@ TASK = "gsm8k" FILTER = "exact_match,strict-match" RTOL = 0.03 EXPECTED_VALUE = 0.54 -DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"] +DEFAULT_ARGS = ["--max-model-len", "4096"] MORE_ARGS_LIST = [ [], # Default ["--enable-chunked-prefill"], # Chunked diff --git a/tests/entrypoints/openai/test_chunked_prompt.py b/tests/entrypoints/openai/test_chunked_prompt.py index 3c8ed955a6..c8160c5f2d 100644 --- a/tests/entrypoints/openai/test_chunked_prompt.py +++ b/tests/entrypoints/openai/test_chunked_prompt.py @@ -26,8 +26,6 @@ def server(): "--enable-chunked-prefill", "--max-num-batched-tokens", "1000", - # large prompts create a lot of output - "--disable-log-requests", ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: diff --git a/tests/models/quantization/test_bitsandbytes.py b/tests/models/quantization/test_bitsandbytes.py index 8cb269d7e9..e0e919b62b 100644 --- a/tests/models/quantization/test_bitsandbytes.py +++ b/tests/models/quantization/test_bitsandbytes.py @@ -102,7 +102,6 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts, def test_load_pp_4bit_bnb_model(model_name, description) -> None: common_args = [ "--disable-log-stats", - "--disable-log-requests", "--dtype", "bfloat16", "--enable-prefix-caching", diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index b48655d80e..9322410ec9 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -88,7 +88,6 @@ run_tests_for_model() { BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --enforce-eager \ - --disable-log-requests \ --gpu-memory-utilization 0.2 \ --tensor-parallel-size $PREFILLER_TP_SIZE \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'" @@ -121,7 +120,6 @@ run_tests_for_model() { BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID 
VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --enforce-eager \ - --disable-log-requests \ --gpu-memory-utilization 0.2 \ --tensor-parallel-size $DECODER_TP_SIZE \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'" diff --git a/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh b/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh index 98903a176e..b644612929 100644 --- a/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh @@ -57,7 +57,6 @@ run_tests_for_model() { BASE_CMD="CUDA_VISIBLE_DEVICES=0 VLLM_NIXL_SIDE_CHANNEL_PORT=5559 vllm serve $model_name \ --port $PREFILL_PORT \ --enforce-eager \ - --disable-log-requests \ --gpu-memory-utilization 0.2 \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'" @@ -76,7 +75,6 @@ run_tests_for_model() { BASE_CMD="CUDA_VISIBLE_DEVICES=1 VLLM_NIXL_SIDE_CHANNEL_PORT=6000 vllm serve $model_name \ --port $DECODE_PORT \ --enforce-eager \ - --disable-log-requests \ --gpu-memory-utilization 0.2 \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'" diff --git a/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh index 45779d1691..ea125f99fc 100644 --- a/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh @@ -63,7 +63,6 @@ launch_baseline() { --seed 42 \ --block-size ${BLOCK_SIZE} \ --gpu-memory-utilization 0.5 \ - --disable-log-requests \ --enforce-eager" echo ${BASELINE_BASE_CMD} ssh -tt ${BASELINE_HOST} "${BASELINE_BASE_CMD}" & @@ -87,7 +86,6 @@ launch_pd() { --block-size ${BLOCK_SIZE} \ --enforce-eager \ --gpu-memory-utilization 0.5 \ - --disable-log-requests \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" @@ -106,7 +104,6 @@ launch_pd() { --block-size ${BLOCK_SIZE} \ --enforce-eager \ --gpu-memory-utilization 0.5 \ - --disable-log-requests \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" echo ${PREFILL_BASE_CMD} diff --git a/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh b/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh index c37c92fdf5..8ba653770c 100644 --- a/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh @@ -68,7 +68,6 @@ launch_pd() { --block-size ${BLOCK_SIZE} \ --enforce-eager \ --gpu-memory-utilization 0.5 \ - --disable-log-requests \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" @@ -87,7 +86,6 @@ launch_pd() { --block-size ${BLOCK_SIZE} \ --enforce-eager \ --gpu-memory-utilization 0.5 \ - --disable-log-requests \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" echo ${PREFILL_BASE_CMD} diff --git a/tests/v1/sample/test_logprobs_e2e.py b/tests/v1/sample/test_logprobs_e2e.py index 50b14a15dc..7f41355ff7 100644 --- a/tests/v1/sample/test_logprobs_e2e.py +++ b/tests/v1/sample/test_logprobs_e2e.py @@ -15,7 +15,7 @@ EXPECTED_VALUE = 0.62 MODEL = "meta-llama/Llama-3.2-1B-Instruct" MODEL_ARGS = 
f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False,gpu_memory_utilization=0.8" # noqa: E501 SERVER_ARGS = [ - "--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests", + "--enforce_eager", "--no_enable_prefix_caching", "--gpu-memory-utilization=0.8" ] NUM_CONCURRENT = 100 diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 3318ae5106..ce62282c21 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1673,8 +1673,9 @@ class FlexibleArgumentParser(ArgumentParser): # Special case warning because the warning below won't trigger # if –-disable-log-requests because its value is default. logger.warning_once( - "argument '--disable-log-requests' is deprecated. This " - "will be removed in v0.12.0.") + "argument '--disable-log-requests' is deprecated and " + "replaced with '--enable-log-requests'. This will be " + "removed in v0.12.0.") namespace, args = super().parse_known_args(args, namespace) for action in FlexibleArgumentParser._deprecated: if (hasattr(namespace, dest := action.dest)