docs: remove deprecated disable-log-requests flag (#22113)
Signed-off-by: Roger Wang <hey@rogerw.me>
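Summary: `--disable-log-requests` is deprecated and replaced by `--enable-log-requests` (see the updated parser warning in the last hunk), so this commit drops the flag from docs, benchmark scripts, example launch scripts, and tests. A minimal migration sketch (`<your_model>` is a placeholder):

```bash
# Deprecated spelling, slated for removal in v0.12.0:
vllm serve <your_model> --disable-log-requests

# Equivalent now: simply drop the flag. To opt back in to per-request
# logging, pass the replacement named in the deprecation warning:
vllm serve <your_model> --enable-log-requests
```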
@@ -44,7 +44,6 @@ echo
 
 VLLM_USE_V1=1 vllm serve $MODEL \
     --seed 42 \
-    --disable-log-requests \
     --max-num-seqs $MAX_NUM_SEQS \
     --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
     --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
@@ -91,7 +91,7 @@ become available.
 First start serving your model
 
 ```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
+vllm serve NousResearch/Hermes-3-Llama-3.1-8B
 ```
 
 Then run the benchmarking script
@@ -146,7 +146,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you
 
 ```bash
 # start server
-VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
+VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
 ```
 
 ```bash
@@ -171,7 +171,7 @@ You can skip applying chat template if your data already has it by using `--cust
 
 ```bash
 # need a model with vision capability here
-vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
+vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```
 
 ```bash
@@ -205,7 +205,7 @@ vllm bench serve \
 ### Other HuggingFaceDataset Examples
 
 ```bash
-vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
+vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```
 
 `lmms-lab/LLaVA-OneVision-Data`:
@@ -430,7 +430,7 @@ Benchmark the performance of structured output generation (JSON, grammar, regex)
 ### Server Setup
 
 ```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
+vllm serve NousResearch/Hermes-3-Llama-3.1-8B
 ```
 
 ### JSON Schema Benchmark
@@ -60,7 +60,6 @@ start_server() {
     pkill -f vllm
 
     VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
-        --disable-log-requests \
         --port 8004 \
         --gpu-memory-utilization $gpu_memory_utilization \
         --max-num-seqs $max_num_seqs \
@@ -5,8 +5,7 @@ r"""Benchmark online serving throughput.
 On the server side, run one of the following commands:
     vLLM OpenAI API server
     vllm serve <your_model> \
-        --swap-space 16 \
-        --disable-log-requests
+        --swap-space 16
 
 On the client side, run:
     python benchmarks/benchmark_serving.py \
@@ -4,7 +4,7 @@ r"""Benchmark online serving throughput with structured outputs.
 
 On the server side, run one of the following commands:
     (vLLM OpenAI API server)
-    vllm serve <your_model> --disable-log-requests
+    vllm serve <your_model>
 
 On the client side, run:
     python benchmarks/benchmark_serving_structured_output.py \
@@ -109,7 +109,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
     --max-num-seqs 256 \
     --trust-remote-code \
     --gpu-memory-utilization 0.9 \
-    --disable-log-request \
     --kv-transfer-config \
     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20001"}}' > /var/vllm.log 2>&1 &
 ```
@@ -131,7 +130,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
     --max-num-seqs 256 \
     --trust-remote-code \
     --gpu-memory-utilization 0.7 \
-    --disable-log-request \
     --kv-transfer-config \
     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20002"}}' > /var/vllm.log 2>&1 &
 ```
@@ -153,7 +151,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
     --max-num-seqs 256 \
     --trust-remote-code \
     --gpu-memory-utilization 0.7 \
-    --disable-log-request \
     --kv-transfer-config \
     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003"}}' > /var/vllm.log 2>&1 &
 ```
@@ -175,7 +172,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
     --max-num-seqs 256 \
     --trust-remote-code \
     --gpu-memory-utilization 0.7 \
-    --disable-log-request \
     --kv-transfer-config \
     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20004"}}' > /var/vllm.log 2>&1 &
 ```
@@ -206,7 +202,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
     --max-num-seqs 256 \
     --trust-remote-code \
     --gpu-memory-utilization 0.9 \
-    --disable-log-request \
     --kv-transfer-config \
     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20001"}}' > /var/vllm.log 2>&1 &
 ```
@@ -228,7 +223,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
     --max-num-seqs 256 \
     --trust-remote-code \
     --gpu-memory-utilization 0.9 \
-    --disable-log-request \
     --kv-transfer-config \
     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20002"}}' > /var/vllm.log 2>&1 &
 ```
@@ -250,7 +244,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
     --max-num-seqs 256 \
     --trust-remote-code \
     --gpu-memory-utilization 0.9 \
-    --disable-log-request \
     --kv-transfer-config \
     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003"}}' > /var/vllm.log 2>&1 &
 ```
@@ -272,7 +265,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py &
     --max-num-seqs 256 \
     --trust-remote-code \
     --gpu-memory-utilization 0.7 \
-    --disable-log-request \
     --kv-transfer-config \
     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20004"}}' > /var/vllm.log 2>&1 &
 ```
@@ -255,7 +255,7 @@ export https_proxy=http://your.proxy.server:port
 https_proxy=http://your.proxy.server:port huggingface-cli download <model_name>
 
 # or use vllm cmd directly
-https_proxy=http://your.proxy.server:port vllm serve <model_name> --disable-log-requests
+https_proxy=http://your.proxy.server:port vllm serve <model_name>
 ```
 
 - Set the proxy in Python interpreter:
@@ -178,7 +178,6 @@ main() {
         --max-num-seqs 256 \
         --trust-remote-code \
         --gpu-memory-utilization 0.9 \
-        --disable-log-request \
         --kv-transfer-config \
         "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_size\":\"1e1\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > prefill$((i+1)).log 2>&1 &
     PIDS+=($!)
@@ -207,7 +206,6 @@ main() {
         --max-num-seqs 256 \
         --trust-remote-code \
         --gpu-memory-utilization 0.7 \
-        --disable-log-request \
         --kv-transfer-config \
         "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_size\":\"8e9\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > decode$((i+1)).log 2>&1 &
     PIDS+=($!)
@@ -13,8 +13,7 @@ Prometheus metric logging is enabled by default in the OpenAI-compatible server.
 
 ```bash
 vllm serve mistralai/Mistral-7B-v0.1 \
-    --max-model-len 2048 \
-    --disable-log-requests
+    --max-model-len 2048
 ```
 
 Launch Prometheus and Grafana servers with `docker compose`:
@@ -28,7 +28,6 @@ if [[ $1 == "prefiller" ]]; then
     CUDA_VISIBLE_DEVICES=0 \
     vllm serve $MODEL \
         --port 8100 \
-        --disable-log-requests \
         --enforce-eager \
         --kv-transfer-config \
         '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_producer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "producer1"}}'
@@ -46,7 +45,6 @@ elif [[ $1 == "decoder" ]]; then
    CUDA_VISIBLE_DEVICES=1 \
    vllm serve $MODEL \
        --port 8200 \
-       --disable-log-requests \
        --enforce-eager \
        --kv-transfer-config \
        '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_consumer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "consumer1"}}'
@@ -22,7 +22,7 @@ TASK = "gsm8k"
 FILTER = "exact_match,strict-match"
 RTOL = 0.03
 EXPECTED_VALUE = 0.54
-DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"]
+DEFAULT_ARGS = ["--max-model-len", "4096"]
 MORE_ARGS_LIST = [
     [],  # Default
     ["--enable-chunked-prefill"],  # Chunked
@@ -26,8 +26,6 @@ def server():
         "--enable-chunked-prefill",
         "--max-num-batched-tokens",
         "1000",
-        # large prompts create a lot of output
-        "--disable-log-requests",
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -102,7 +102,6 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
 def test_load_pp_4bit_bnb_model(model_name, description) -> None:
     common_args = [
         "--disable-log-stats",
-        "--disable-log-requests",
         "--dtype",
         "bfloat16",
         "--enable-prefix-caching",
@@ -88,7 +88,6 @@ run_tests_for_model() {
     BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
         --port $PORT \
         --enforce-eager \
-        --disable-log-requests \
         --gpu-memory-utilization 0.2 \
         --tensor-parallel-size $PREFILLER_TP_SIZE \
         --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
@@ -121,7 +120,6 @@ run_tests_for_model() {
     BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
         --port $PORT \
         --enforce-eager \
-        --disable-log-requests \
         --gpu-memory-utilization 0.2 \
         --tensor-parallel-size $DECODER_TP_SIZE \
         --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
@@ -57,7 +57,6 @@ run_tests_for_model() {
     BASE_CMD="CUDA_VISIBLE_DEVICES=0 VLLM_NIXL_SIDE_CHANNEL_PORT=5559 vllm serve $model_name \
         --port $PREFILL_PORT \
         --enforce-eager \
-        --disable-log-requests \
         --gpu-memory-utilization 0.2 \
         --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
 
@@ -76,7 +75,6 @@ run_tests_for_model() {
     BASE_CMD="CUDA_VISIBLE_DEVICES=1 VLLM_NIXL_SIDE_CHANNEL_PORT=6000 vllm serve $model_name \
         --port $DECODE_PORT \
         --enforce-eager \
-        --disable-log-requests \
         --gpu-memory-utilization 0.2 \
         --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
 
@@ -63,7 +63,6 @@ launch_baseline() {
         --seed 42 \
         --block-size ${BLOCK_SIZE} \
         --gpu-memory-utilization 0.5 \
-        --disable-log-requests \
         --enforce-eager"
     echo ${BASELINE_BASE_CMD}
     ssh -tt ${BASELINE_HOST} "${BASELINE_BASE_CMD}" &
@@ -87,7 +86,6 @@ launch_pd() {
         --block-size ${BLOCK_SIZE} \
         --enforce-eager \
         --gpu-memory-utilization 0.5 \
-        --disable-log-requests \
         --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"
 
 
@@ -106,7 +104,6 @@ launch_pd() {
         --block-size ${BLOCK_SIZE} \
         --enforce-eager \
         --gpu-memory-utilization 0.5 \
-        --disable-log-requests \
         --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"
 
     echo ${PREFILL_BASE_CMD}
@@ -68,7 +68,6 @@ launch_pd() {
         --block-size ${BLOCK_SIZE} \
         --enforce-eager \
         --gpu-memory-utilization 0.5 \
-        --disable-log-requests \
         --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"
 
 
@@ -87,7 +86,6 @@ launch_pd() {
         --block-size ${BLOCK_SIZE} \
         --enforce-eager \
         --gpu-memory-utilization 0.5 \
-        --disable-log-requests \
         --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'"
 
     echo ${PREFILL_BASE_CMD}
@@ -15,7 +15,7 @@ EXPECTED_VALUE = 0.62
 MODEL = "meta-llama/Llama-3.2-1B-Instruct"
 MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False,gpu_memory_utilization=0.8"  # noqa: E501
 SERVER_ARGS = [
-    "--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests",
+    "--enforce_eager", "--no_enable_prefix_caching",
     "--gpu-memory-utilization=0.8"
 ]
 NUM_CONCURRENT = 100
@@ -1673,8 +1673,9 @@ class FlexibleArgumentParser(ArgumentParser):
         # Special case warning because the warning below won't trigger
         # if --disable-log-requests because its value is default.
         logger.warning_once(
-            "argument '--disable-log-requests' is deprecated. This "
-            "will be removed in v0.12.0.")
+            "argument '--disable-log-requests' is deprecated and "
+            "replaced with '--enable-log-requests'. This will be "
+            "removed in v0.12.0.")
         namespace, args = super().parse_known_args(args, namespace)
         for action in FlexibleArgumentParser._deprecated:
             if (hasattr(namespace, dest := action.dest)
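For reference, a quick way to confirm the updated warning on a local install (a sketch; the `timeout` duration and exact log prefix will vary by version and configuration):

```bash
# Start the server briefly with the deprecated flag and grep the logs.
# <your_model> is a placeholder; any locally available model works.
timeout 30 vllm serve <your_model> --disable-log-requests 2>&1 \
    | grep -i "deprecated"
# Expected, per the updated message above:
#   argument '--disable-log-requests' is deprecated and replaced with
#   '--enable-log-requests'. This will be removed in v0.12.0.
```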