Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
This commit is contained in:
Robert Shaw
2025-10-03 13:25:00 -04:00
parent d76541a6c5
commit 52a7d91980
2 changed files with 4 additions and 0 deletions

View File

@@ -2400,6 +2400,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# Run the model.
# Use persistent buffers for CUDA graphs.
logger.info(f"====== EXECUTE {ubatch_slices=}, {num_input_tokens=}, {num_tokens_across_dp=}")
with (set_forward_context(
attn_metadata,
self.vllm_config,
@@ -3046,6 +3047,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
(1 token) and prefill (multiple tokens) requests.
remove_lora: If False, dummy LoRAs are not destroyed after the run
"""
logger.info("====== DUMMY RUN")
assert cudagraph_runtime_mode is None or \
cudagraph_runtime_mode.valid_runtime_modes()

View File

@@ -167,6 +167,7 @@ def ubatch_split(
num_tokens_unpadded,
uniform_decode=uniform_decode,
)
logger.info(f"==== {should_attempt_ubatching=}, {num_tokens_unpadded=}")
# Don't microbatch unless every other DP worker is also microbatching
should_ubatch, num_tokens_after_padding = get_dp_padding_ubatch(
@@ -175,6 +176,7 @@ def ubatch_split(
should_attempt_ubatching,
vllm_config,
)
logger.info(f"==== {should_ubatch=}, {num_tokens_after_padding=}")
if not should_ubatch:
return (None, None)