Compare commits

...

2 Commits

Author SHA1 Message Date
6f62c94d7e updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-10-03 13:47:16 -04:00
52a7d91980 debug
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-10-03 13:25:00 -04:00
4 changed files with 7 additions and 1 deletion


@@ -1050,6 +1050,7 @@ class DPEngineCoreProc(EngineCoreProc):
             self._maybe_publish_request_counts()
             local_unfinished_reqs = self.scheduler.has_unfinished_requests()
+            logger.info(f"{local_unfinished_reqs=}")
             if not executed:
                 if not local_unfinished_reqs and not self.engines_running:
                     # All engines are idle.
@@ -1057,6 +1058,7 @@ class DPEngineCoreProc(EngineCoreProc):
                 # We are in a running state and so must execute a dummy pass
                 # if the model didn't execute any ready requests.
+                logger.info("===EXECUTE_DUMMY_BATCH===")
                 self.execute_dummy_batch()
             # 3) All-reduce operation to determine global unfinished reqs.
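
Both hunks in this file add temporary markers to the data-parallel busy loop: the first logs whether this rank still has unfinished requests, the second flags every trip through the dummy-batch path. The first marker uses the f-string "=" specifier (Python 3.8+), which prints the expression together with its value. A standalone illustration, with the standard logging module standing in for vLLM's logger and a hypothetical value:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("demo")        # stand-in for vLLM's module-level logger

local_unfinished_reqs = False             # hypothetical value
logger.info(f"{local_unfinished_reqs=}")  # emits: local_unfinished_reqs=False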


@@ -1077,7 +1077,7 @@ class DPAsyncMPClient(AsyncMPClient):
                         if counts is not None:
                             sliced_counts = counts[count_slice]
                             self.lb_engines = sliced_counts
-                            logger.debug("Received counts: %s (%s)", sliced_counts,
+                            logger.debug("Received counts: %s (%s)", counts,
                                          count_slice)
         resources.stats_update_task = asyncio.create_task(
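
The only change in this file is which value the existing debug line reports: the full counts list received from the coordinator rather than the locally owned sliced_counts. A small illustration of the difference, using made-up values and a made-up slice (neither comes from this diff):

counts = [3, 0, 5, 2]            # hypothetical per-engine request counts
count_slice = slice(2, 4)        # hypothetical: the engines this client owns
sliced_counts = counts[count_slice]

# The removed line logged only the slice:
print("Received counts: %s (%s)" % (sliced_counts, count_slice))  # [5, 2] (slice(2, 4, None))
# The added line logs every engine's count:
print("Received counts: %s (%s)" % (counts, count_slice))         # [3, 0, 5, 2] (slice(2, 4, None))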


@@ -2400,6 +2400,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Run the model.
         # Use persistent buffers for CUDA graphs.
+        logger.info(f"====== EXECUTE {ubatch_slices=}, {num_input_tokens=}, {num_tokens_across_dp=}")
         with (set_forward_context(
                 attn_metadata,
                 self.vllm_config,
@@ -3046,6 +3047,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             (1 token) and prefill (multiple tokens) requests.
             remove_lora: If False, dummy LoRAs are not destroyed after the run
         """
+        logger.info("====== DUMMY RUN")
         assert cudagraph_runtime_mode is None or \
             cudagraph_runtime_mode.valid_runtime_modes()
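
These two markers bracket the model runner's entry points: the real forward pass and the dummy run used for padding and warm-up passes. Together with the ===EXECUTE_DUMMY_BATCH=== marker from the engine hunk above, grepping a captured log shows which path each DP rank keeps taking when debugging a hang. A throwaway sketch of that tally, assuming one rank's output was captured to a local file named engine.log (the filename and the helper are assumptions; the marker strings are the ones added in this diff):

from collections import Counter

# Marker strings copied from the logger calls added in these commits.
MARKERS = ("====== EXECUTE", "====== DUMMY RUN", "===EXECUTE_DUMMY_BATCH===")

hits = Counter()
with open("engine.log") as log:  # assumed capture of one rank's output
    for line in log:
        for marker in MARKERS:
            if marker in line:
                hits[marker] += 1

# An idle rank in a running DP wave would show mostly dummy-batch / dummy-run hits.
print(hits)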


@@ -167,6 +167,7 @@ def ubatch_split(
         num_tokens_unpadded,
         uniform_decode=uniform_decode,
     )
+    logger.info(f"==== {should_attempt_ubatching=}, {num_tokens_unpadded=}")
     # Don't microbatch unless every other DP worker is also microbatching
     should_ubatch, num_tokens_after_padding = get_dp_padding_ubatch(
@@ -175,6 +176,7 @@ def ubatch_split(
         should_attempt_ubatching,
         vllm_config,
     )
+    logger.info(f"==== {should_ubatch=}, {num_tokens_after_padding=}")
     if not should_ubatch:
         return (None, None)
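
The two new log lines trace the two stages of the microbatching decision: whether this rank's batch qualifies locally (should_attempt_ubatching) and whether all DP ranks then agree to microbatch (should_ubatch). The comment in the first hunk states the rule; below is a minimal sketch of that all-ranks-agree step using a plain boolean all-reduce, as an illustration only and not vLLM's actual get_dp_padding_ubatch (it assumes an initialized torch.distributed process group):

import torch
import torch.distributed as dist

def all_ranks_want_to_ubatch(should_attempt_ubatching: bool) -> bool:
    # 1 if this rank wants to microbatch, 0 otherwise.
    flag = torch.tensor([1 if should_attempt_ubatching else 0])
    # A MIN all-reduce leaves 1 only if every DP rank contributed 1.
    dist.all_reduce(flag, op=dist.ReduceOp.MIN)
    return bool(flag.item())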