mirror of
https://github.com/vllm-project/vllm-ascend.git
synced 2025-10-20 13:43:53 +08:00
Fix of DeepSeek Error in KV Pool Mixed Deployment Scenario (#3087)
### What this PR does / why we need it?
Adds a new kv_role, "kv_both", for running mixed deployment scenarios. A
mixed deployment node also runs the decode phase, during which with_prefill
must be false.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.10.2
- vLLM main:
c60e6137f0
Signed-off-by: fems14 <1804143737@qq.com>
This commit is contained in:
@ -64,6 +64,7 @@ export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
|
||||
export VLLM_USE_V1=1
|
||||
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
|
||||
export ASCEND_TRANSPORT_PRINT=1
|
||||
export ACL_OP_INIT_MODE=1
|
||||
# ASCEND_TRANSPORT_PRINT controls Mooncake's memory swap logging: 1 enables it, 0 disables it.
|
||||
export ASCEND_AGGREGATE_ENABLE=1
|
||||
# ASCEND_AGGREGATE_ENABLE is the switch for Mooncake's aggregation function: 1 means on, 0 means off.
|
||||
@ -104,6 +105,7 @@ python3 -m vllm.entrypoints.openai.api_server \
|
||||
{
|
||||
"kv_connector": "MooncakeConnectorStoreV1",
|
||||
"kv_role": "kv_producer",
|
||||
"mooncake_rpc_port":"0"
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -124,6 +126,7 @@ export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
|
||||
export MOONCAKE_CONFIG_PATH="/xxxxx/mooncake.json"
|
||||
export VLLM_USE_V1=1
|
||||
export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7
|
||||
export ACL_OP_INIT_MODE=1
|
||||
export ASCEND_TRANSPORT_PRINT=1
|
||||
# ASCEND_TRANSPORT_PRINT controls Mooncake's memory swap logging: 1 enables it, 0 disables it.
|
||||
export ASCEND_AGGREGATE_ENABLE=1
|
||||
@ -165,6 +168,7 @@ python3 -m vllm.entrypoints.openai.api_server \
|
||||
{
|
||||
"kv_connector": "MooncakeConnectorStoreV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"mooncake_rpc_port":"1"
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -223,6 +227,7 @@ export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
|
||||
export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
|
||||
export VLLM_USE_V1=1
|
||||
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
|
||||
export ACL_OP_INIT_MODE=1
|
||||
export ASCEND_TRANSPORT_PRINT=1
|
||||
# ASCEND_TRANSPORT_PRINT controls Mooncake's memory swap logging: 1 enables it, 0 disables it.
|
||||
export ASCEND_AGGREGATE_ENABLE=1
|
||||
@ -242,9 +247,10 @@ python3 -m vllm.entrypoints.openai.api_server \
|
||||
--kv-transfer-config \
|
||||
'{
|
||||
"kv_connector": "MooncakeConnectorStoreV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_role": "kv_both",
|
||||
"kv_connector_extra_config": {
|
||||
"use_layerwise": false
|
||||
"use_layerwise": false,
|
||||
"mooncake_rpc_port":"0"
|
||||
}
|
||||
}' > mix.log 2>&1
|
||||
```
|
||||
|
@ -119,7 +119,7 @@ class MooncakeEngine:
|
||||
|
||||
if self.use_layerwise:
|
||||
self.get_event = threading.Event()
|
||||
if self.kv_role == 'kv_producer':
|
||||
if self.kv_role in ['kv_producer', 'kv_both']:
|
||||
ready_event_sending = threading.Event()
|
||||
self.kv_send_thread = KVCacheStoreLayerSendingThread(
|
||||
self.tp_rank, self.tp_size, self.m_store,
|
||||
@ -135,7 +135,7 @@ class MooncakeEngine:
|
||||
self.kv_recv_thread.start()
|
||||
ready_event.wait()
|
||||
else:
|
||||
if self.kv_role == 'kv_producer':
|
||||
if self.kv_role in ['kv_producer', 'kv_both']:
|
||||
ready_event_sending = threading.Event()
|
||||
self.kv_send_thread = KVCacheStoreSendingThread(
|
||||
self.tp_rank, self.tp_size, self.m_store,
|
||||
@ -429,7 +429,7 @@ class MooncakeEngine:
|
||||
done_sending = (
|
||||
self.kv_send_thread.
|
||||
get_and_clear_finished_requests( # type: ignore[union-attr]
|
||||
) if self.kv_role == 'kv_producer' else set())
|
||||
) if self.kv_role in ['kv_producer', 'kv_both'] else set())
|
||||
done_recving = self.kv_recv_thread.get_and_clear_finished_requests( # type: ignore[union-attr]
|
||||
)
|
||||
|
||||
|
@ -2406,7 +2406,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
dtype=np.int32)
|
||||
|
||||
# Force dummy run on prefill stage when this node is deemed as kv producer.
|
||||
if self.is_kv_producer:
|
||||
if self.is_kv_producer and not self.is_kv_consumer:
|
||||
with_prefill = True
|
||||
|
||||
attn_metadata = self._build_attention_metadata(
|
||||
|
Reference in New Issue
Block a user