Mirror of https://github.com/vllm-project/vllm.git
Construct KVTransferConfig properly from Python instead of using JSON blobs without CLI (#17994)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
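The diff below updates the disaggregated-prefill and KV-sharing examples to build KVTransferConfig from ordinary keyword arguments instead of parsing an inline JSON blob through KVTransferConfig.from_cli(). A minimal before/after sketch of the producer-side configuration from the first hunk; the import path is an assumption, since the import lines are not part of the diff:

# Import path assumed; the hunks below do not show the import lines.
from vllm.config import KVTransferConfig

# Before: the config was written as a JSON string and parsed by from_cli().
ktc = KVTransferConfig.from_cli(
    '{"kv_connector":"LMCacheConnector","kv_role":"kv_producer",'
    '"kv_rank":0,"kv_parallel_size":2}')

# After: the same config is constructed directly from Python.
ktc = KVTransferConfig(kv_connector="LMCacheConnector",
                       kv_role="kv_producer",
                       kv_rank=0,
                       kv_parallel_size=2)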
@@ -49,9 +49,10 @@ def run_prefill(prefill_done, prompts):
 
     sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
 
-    ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"LMCacheConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
-    )
+    ktc = KVTransferConfig(kv_connector="LMCacheConnector",
+                           kv_role="kv_producer",
+                           kv_rank=0,
+                           kv_parallel_size=2)
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # memory. Reduce the value if your GPU has less memory.
     llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",

@@ -78,9 +79,10 @@ def run_decode(prefill_done, prompts, timeout=1):
 
     sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
 
-    ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"LMCacheConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
-    )
+    ktc = KVTransferConfig(kv_connector="LMCacheConnector",
+                           kv_role="kv_consumer",
+                           kv_rank=1,
+                           kv_parallel_size=2)
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # of memory. Reduce the value if your GPU has less memory.
     llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",

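The two hunks above configure the paired processes of the LMCache disaggregated-prefill example: run_prefill sets up the kv_producer at kv_rank=0 and run_decode the kv_consumer at kv_rank=1, with kv_parallel_size=2 because two instances participate in the transfer. A hedged sketch of the two configs side by side, with illustrative variable names (only the keyword arguments come from the diff; the import path is assumed):

from vllm.config import KVTransferConfig  # assumed import path

# Prefill process: produces KV cache entries (rank 0 of 2).
prefill_ktc = KVTransferConfig(kv_connector="LMCacheConnector",
                               kv_role="kv_producer",
                               kv_rank=0,
                               kv_parallel_size=2)

# Decode process: consumes the transferred KV cache (rank 1 of 2).
decode_ktc = KVTransferConfig(kv_connector="LMCacheConnector",
                              kv_role="kv_consumer",
                              kv_rank=1,
                              kv_parallel_size=2)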
@@ -49,8 +49,8 @@ def run_store(store_done, prompts):
 
     sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
 
-    ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
+    ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1",
+                           kv_role="kv_both")
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # memory. Reduce the value if your GPU has less memory.
     llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",

@@ -76,8 +76,8 @@ def run_retrieve(store_done, prompts, timeout=1):
 
     sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
 
-    ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
+    ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1",
+                           kv_role="kv_both")
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # of memory. Reduce the value if your GPU has less memory.
     llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",

@@ -16,16 +16,17 @@ except FileNotFoundError:
 
 sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
 
-llm = LLM(
-    model="meta-llama/Llama-3.2-1B-Instruct",
-    enforce_eager=True,
-    gpu_memory_utilization=0.8,
-    max_num_batched_tokens=64,
-    max_num_seqs=16,
-    kv_transfer_config=KVTransferConfig.from_cli(
-        '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",'
-        '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}'
-    )) #, max_model_len=2048, max_num_batched_tokens=2048)
+llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
+          enforce_eager=True,
+          gpu_memory_utilization=0.8,
+          max_num_batched_tokens=64,
+          max_num_seqs=16,
+          kv_transfer_config=KVTransferConfig(
+              kv_connector="SharedStorageConnector",
+              kv_role="kv_both",
+              kv_connector_extra_config={
+                  "shared_storage_path": "local_storage"
+              })) #, max_model_len=2048, max_num_batched_tokens=2048)
 
 # 1ST generation (prefill instance)
 outputs = llm.generate(prompts, sampling_params)

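With the constructor form, connector-specific settings such as shared_storage_path are passed as a plain Python dict through kv_connector_extra_config instead of an escaped JSON fragment inside a string. A short sketch of the resulting call, with the arguments copied from the hunk above and the import paths assumed:

from vllm import LLM
from vllm.config import KVTransferConfig  # assumed import path

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
          enforce_eager=True,
          gpu_memory_utilization=0.8,
          kv_transfer_config=KVTransferConfig(
              kv_connector="SharedStorageConnector",
              kv_role="kv_both",
              # Connector-specific options are now a regular dict.
              kv_connector_extra_config={
                  "shared_storage_path": "local_storage"
              }))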
@@ -17,11 +17,12 @@ sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
 llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
           enforce_eager=True,
           gpu_memory_utilization=0.8,
-          kv_transfer_config=KVTransferConfig.from_cli(
-              '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", '
-              '"kv_connector_extra_config": '
-              '{"shared_storage_path": "local_storage"}}')
-          ) #, max_model_len=2048, max_num_batched_tokens=2048)
+          kv_transfer_config=KVTransferConfig(
+              kv_connector="SharedStorageConnector",
+              kv_role="kv_both",
+              kv_connector_extra_config={
+                  "shared_storage_path": "local_storage"
+              })) #, max_model_len=2048, max_num_batched_tokens=2048)
 
 # 1ST generation (prefill instance)
 outputs = llm.generate(

@@ -32,9 +32,10 @@ def run_prefill(prefill_done):
     # This instance is the prefill node (kv_producer, rank 0).
     # The number of parallel instances for KV cache transfer is set to 2,
     # as required for PyNcclConnector.
-    ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
-    )
+    ktc = KVTransferConfig(kv_connector="PyNcclConnector",
+                           kv_role="kv_producer",
+                           kv_rank=0,
+                           kv_parallel_size=2)
 
     # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
     # memory. You may need to adjust the value to fit your GPU.

@@ -71,9 +72,10 @@ def run_decode(prefill_done):
     # This instance is the decode node (kv_consumer, rank 1).
     # The number of parallel instances for KV cache transfer is set to 2,
     # as required for PyNcclConnector.
-    ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
-    )
+    ktc = KVTransferConfig(kv_connector="PyNcclConnector",
+                           kv_role="kv_consumer",
+                           kv_rank=1,
+                           kv_parallel_size=2)
 
     # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
     # memory. You may need to adjust the value to fit your GPU.

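The PyNcclConnector hunks above stop just before the config object is handed to the engine. As in the SharedStorageConnector hunks, it is passed through the kv_transfer_config argument of LLM; the following is a hedged sketch of the producer side, where the model name and memory setting are illustrative (taken from the surrounding comments, not from these hunks) and the import paths are assumed:

from vllm import LLM
from vllm.config import KVTransferConfig  # assumed import path

ktc = KVTransferConfig(kv_connector="PyNcclConnector",
                       kv_role="kv_producer",
                       kv_rank=0,
                       kv_parallel_size=2)

# Hand the config to the engine; the model name is illustrative and 0.8
# follows the GPU memory utilization comment in the hunk above.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
          gpu_memory_utilization=0.8,
          kv_transfer_config=ktc)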