Mirror of https://github.com/vllm-project/vllm.git

Compare commits: 8a81d776ce...debug-logs (2 commits)

Commits:
- 6f62c94d7e
- 52a7d91980
@@ -1050,6 +1050,7 @@ class DPEngineCoreProc(EngineCoreProc):
         self._maybe_publish_request_counts()
 
         local_unfinished_reqs = self.scheduler.has_unfinished_requests()
+        logger.info(f"{local_unfinished_reqs=}")
         if not executed:
             if not local_unfinished_reqs and not self.engines_running:
                 # All engines are idle.
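The added log uses Python's self-documenting f-string specifier (`=`, available since Python 3.8), which renders both the expression text and its value, so no format string is needed. A minimal illustration:

```python
local_unfinished_reqs = True
# The `=` specifier prints the expression followed by its repr.
print(f"{local_unfinished_reqs=}")  # prints: local_unfinished_reqs=True
```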
@@ -1057,6 +1058,7 @@ class DPEngineCoreProc(EngineCoreProc):
 
             # We are in a running state and so must execute a dummy pass
             # if the model didn't execute any ready requests.
+            logger.info("===EXECUTE_DUMMY_BATCH===")
             self.execute_dummy_batch()
 
         # 3) All-reduce operation to determine global unfinished reqs.
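The `===EXECUTE_DUMMY_BATCH===` marker flags the idle-rank path: in data-parallel serving, every rank must join each collective operation, so a rank with nothing scheduled still runs a dummy forward pass while any peer is busy. A minimal sketch of that lockstep pattern, with hypothetical names (`has_work`, `run_batch`, `run_dummy_batch`), not vLLM's actual API:

```python
def dp_step(engine, engines_running: bool) -> None:
    # Hypothetical sketch: every DP rank must execute some forward pass
    # per iteration while any rank is running, or the busy ranks would
    # block inside their collective ops waiting for this one.
    if engine.has_work():
        engine.run_batch()        # real batch
    elif engines_running:
        engine.run_dummy_batch()  # same collectives, no real tokens
```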
@@ -1077,7 +1077,7 @@ class DPAsyncMPClient(AsyncMPClient):
                 if counts is not None:
                     sliced_counts = counts[count_slice]
                     self.lb_engines = sliced_counts
-                    logger.debug("Received counts: %s (%s)", sliced_counts,
+                    logger.debug("Received counts: %s (%s)", counts,
                                  count_slice)
 
         resources.stats_update_task = asyncio.create_task(
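This one-line change widens the debug output from this client's slice to the full cross-engine counts, while `self.lb_engines` still receives only the slice. Illustrative values showing the difference (the numbers are hypothetical):

```python
counts = [3, 0, 5, 2]        # hypothetical request counts for 4 DP engines
count_slice = slice(2, 4)    # hypothetical: the engines this client owns
sliced_counts = counts[count_slice]
print(sliced_counts)         # [5, 2]        <- what the old line logged
print(counts)                # [3, 0, 5, 2]  <- what the new line logs
```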
@@ -2400,6 +2400,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         # Run the model.
         # Use persistent buffers for CUDA graphs.
+        logger.info(f"====== EXECUTE {ubatch_slices=}, {num_input_tokens=}, {num_tokens_across_dp=}")
         with (set_forward_context(
                 attn_metadata,
                 self.vllm_config,
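One caveat about these temporary f-string logs on a hot path like this: stdlib-style logging defers %-formatting until a record is actually emitted, while an f-string is built eagerly even when the level is suppressed. A quick demonstration:

```python
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

x = 42
logger.info(f"x={x}")   # the f-string is formatted even though INFO is dropped
logger.info("x=%s", x)  # lazy: %-formatting is skipped at a disabled level
```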
@@ -3046,6 +3047,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 (1 token) and prefill (multiple tokens) requests.
             remove_lora: If False, dummy LoRAs are not destroyed after the run
         """
+        logger.info("====== DUMMY RUN")
         assert cudagraph_runtime_mode is None or \
             cudagraph_runtime_mode.valid_runtime_modes()
 
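For orientation, dummy runs like this are typically used to warm up kernels and capture CUDA graphs before real traffic arrives, which is why the method takes a `cudagraph_runtime_mode`. A minimal, self-contained sketch of the standard PyTorch capture recipe (plain `torch.cuda.CUDAGraph`, not vLLM's actual capture path):

```python
import torch

def capture_graph(model, static_input):
    # Warm up on a side stream so one-time kernel/allocator work happens
    # outside the capture, per the PyTorch CUDA graph documentation.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        for _ in range(3):
            model(static_input)
    torch.cuda.current_stream().wait_stream(s)

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        static_output = model(static_input)  # recorded into the graph
    # graph.replay() reruns the captured kernels on the same static buffers.
    return graph, static_output
```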
@@ -167,6 +167,7 @@ def ubatch_split(
         num_tokens_unpadded,
         uniform_decode=uniform_decode,
     )
+    logger.info(f"==== {should_attempt_ubatching=}, {num_tokens_unpadded=}")
 
     # Don't microbatch unless every other DP worker is also microbatching
     should_ubatch, num_tokens_after_padding = get_dp_padding_ubatch(
@@ -175,6 +176,7 @@ def ubatch_split(
         should_attempt_ubatching,
         vllm_config,
     )
+    logger.info(f"==== {should_ubatch=}, {num_tokens_after_padding=}")
 
     if not should_ubatch:
         return (None, None)
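Taken together, these two log lines bracket a two-stage decision: a local check of whether this worker could split its batch into microbatches, then a collective agreement so that either every DP worker microbatches or none do, as the comment above the `get_dp_padding_ubatch` call indicates. A hypothetical sketch of such an all-agree vote, assuming an initialized torch.distributed process group (illustrative, not vLLM's implementation):

```python
import torch
import torch.distributed as dist

def all_ranks_agree(local_ok: bool) -> bool:
    # Hypothetical sketch: a MIN all-reduce yields 1 only if every DP rank
    # voted 1, so microbatching happens only when all workers opt in.
    vote = torch.tensor([1 if local_ok else 0])
    dist.all_reduce(vote, op=dist.ReduceOp.MIN)
    return bool(vote.item())
```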