Mirror of https://github.com/vllm-project/vllm.git, synced 2025-10-20 14:53:52 +08:00
@@ -2400,6 +2400,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         # Run the model.
         # Use persistent buffers for CUDA graphs.
+        logger.info(f"====== EXECUTE {ubatch_slices=}, {num_input_tokens=}, {num_tokens_across_dp=}")
         with (set_forward_context(
                 attn_metadata,
                 self.vllm_config,
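
The added log line uses the f-string `=` specifier (Python 3.8+), which renders both the expression text and its repr. A minimal standalone sketch of the same pattern, with made-up values standing in for the runner's state; note that the f-string is formatted eagerly, even when INFO logging is disabled:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("example")

# Hypothetical stand-ins for the values logged in the diff above.
ubatch_slices = [(0, 128), (128, 256)]
num_input_tokens = 256
num_tokens_across_dp = [256, 240]

# "{name=}" expands to "name=<repr(value)>", so the log line carries
# both the variable names and their current values.
logger.info(f"====== EXECUTE {ubatch_slices=}, {num_input_tokens=}, {num_tokens_across_dp=}")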
@@ -3046,6 +3047,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             (1 token) and prefill (multiple tokens) requests.
             remove_lora: If False, dummy LoRAs are not destroyed after the run
         """
+        logger.info("====== DUMMY RUN")
         assert cudagraph_runtime_mode is None or \
             cudagraph_runtime_mode.valid_runtime_modes()
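
The guard before the dummy run accepts either no CUDA-graph mode at all or a mode whose valid_runtime_modes() check passes. A minimal sketch of that pattern; the enum members and method body here are illustrative assumptions, not vLLM's actual CUDAGraphMode definition:

import enum
from typing import Optional

class CUDAGraphMode(enum.Enum):
    # Hypothetical members standing in for vLLM's enum.
    NONE = 0
    PIECEWISE = 1
    FULL = 2
    FULL_AND_PIECEWISE = 3   # config-time choice, resolved before runtime

    def valid_runtime_modes(self) -> bool:
        # Only single, concrete modes make sense once execution starts.
        return self in (CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE,
                        CUDAGraphMode.FULL)

def _dummy_run(cudagraph_runtime_mode: Optional[CUDAGraphMode]) -> None:
    # Same guard as in the diff: either no mode was forced, or the forced
    # mode must be usable at runtime.
    assert cudagraph_runtime_mode is None or \
        cudagraph_runtime_mode.valid_runtime_modes()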
@@ -167,6 +167,7 @@ def ubatch_split(
         num_tokens_unpadded,
         uniform_decode=uniform_decode,
     )
+    logger.info(f"==== {should_attempt_ubatching=}, {num_tokens_unpadded=}")
 
     # Don't microbatch unless every other DP worker is also microbatching
     should_ubatch, num_tokens_after_padding = get_dp_padding_ubatch(
@@ -175,6 +176,7 @@ def ubatch_split(
         should_attempt_ubatching,
         vllm_config,
     )
+    logger.info(f"==== {should_ubatch=}, {num_tokens_after_padding=}")
 
     if not should_ubatch:
         return (None, None)
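
The comment in the hunk above states the rule this call enforces: a rank microbatches only if every other data-parallel rank does too, and all ranks must agree on a common padded token count. A hedged sketch of that agreement step, assuming an initialized torch.distributed process group; agree_to_ubatch is an illustrative helper, not vLLM's get_dp_padding_ubatch:

from typing import Optional

import torch
import torch.distributed as dist

def agree_to_ubatch(local_should_ubatch: bool,
                    local_num_tokens: int,
                    group: Optional[dist.ProcessGroup] = None) -> tuple[bool, int]:
    """Illustrative only; requires dist.init_process_group() to have run."""
    # A MIN-reduction over 0/1 flags acts as a logical AND across ranks,
    # so a single dissenting rank disables microbatching everywhere.
    flag = torch.tensor([1 if local_should_ubatch else 0], dtype=torch.int64)
    dist.all_reduce(flag, op=dist.ReduceOp.MIN, group=group)

    # A MAX-reduction gives every rank the same padded token count, keeping
    # the microbatch schedules in lockstep across the DP group.
    tokens = torch.tensor([local_num_tokens], dtype=torch.int64)
    dist.all_reduce(tokens, op=dist.ReduceOp.MAX, group=group)

    return bool(flag.item()), int(tokens.item())

When any rank opts out, the reduction yields False on every rank, which corresponds to the return (None, None) early exit in the hunk above.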