Mirror of https://github.com/vllm-project/vllm.git, synced 2025-10-20 14:53:52 +08:00
@@ -2400,6 +2400,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         # Run the model.
         # Use persistent buffers for CUDA graphs.
+        logger.info(f"====== EXECUTE {ubatch_slices=}, {num_input_tokens=}, {num_tokens_across_dp=}")
         with (set_forward_context(
                 attn_metadata,
                 self.vllm_config,
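
The added log line uses the f-string `=` specifier (Python 3.8+), which renders both the expression text and its repr. A minimal standalone sketch of the same pattern, with made-up values standing in for the runner's state; note that the f-string is formatted eagerly, even when INFO logging is disabled:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("example")

# Hypothetical stand-ins for the values logged in the diff above.
ubatch_slices = [(0, 128), (128, 256)]
num_input_tokens = 256
num_tokens_across_dp = [256, 240]

# "{name=}" expands to "name=<repr(value)>", so the log line carries
# both the variable names and their current values.
logger.info(f"====== EXECUTE {ubatch_slices=}, {num_input_tokens=}, {num_tokens_across_dp=}")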
@@ -3046,6 +3047,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             (1 token) and prefill (multiple tokens) requests.
             remove_lora: If False, dummy LoRAs are not destroyed after the run
         """
+        logger.info("====== DUMMY RUN")
         assert cudagraph_runtime_mode is None or \
             cudagraph_runtime_mode.valid_runtime_modes()
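
The guard before the dummy run accepts either no CUDA-graph mode at all or a mode whose valid_runtime_modes() check passes. A minimal sketch of that pattern; the enum members and method body here are illustrative assumptions, not vLLM's actual CUDAGraphMode definition:

import enum
from typing import Optional

class CUDAGraphMode(enum.Enum):
    # Hypothetical members standing in for vLLM's enum.
    NONE = 0
    PIECEWISE = 1
    FULL = 2
    FULL_AND_PIECEWISE = 3   # config-time choice, resolved before runtime

    def valid_runtime_modes(self) -> bool:
        # Only single, concrete modes make sense once execution starts.
        return self in (CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE,
                        CUDAGraphMode.FULL)

def _dummy_run(cudagraph_runtime_mode: Optional[CUDAGraphMode]) -> None:
    # Same guard as in the diff: either no mode was forced, or the forced
    # mode must be usable at runtime.
    assert cudagraph_runtime_mode is None or \
        cudagraph_runtime_mode.valid_runtime_modes()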
@@ -167,6 +167,7 @@ def ubatch_split(
         num_tokens_unpadded,
         uniform_decode=uniform_decode,
     )
+    logger.info(f"==== {should_attempt_ubatching=}, {num_tokens_unpadded=}")
 
     # Don't microbatch unless every other DP worker is also microbatching
     should_ubatch, num_tokens_after_padding = get_dp_padding_ubatch(
@@ -175,6 +176,7 @@ def ubatch_split(
         should_attempt_ubatching,
         vllm_config,
     )
+    logger.info(f"==== {should_ubatch=}, {num_tokens_after_padding=}")
 
     if not should_ubatch:
         return (None, None)
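
The comment in the hunk above states the rule this call enforces: a rank microbatches only if every other data-parallel rank does too, and all ranks must agree on a common padded token count. A hedged sketch of that agreement step, assuming an initialized torch.distributed process group; agree_to_ubatch is an illustrative helper, not vLLM's get_dp_padding_ubatch:

from typing import Optional

import torch
import torch.distributed as dist

def agree_to_ubatch(local_should_ubatch: bool,
                    local_num_tokens: int,
                    group: Optional[dist.ProcessGroup] = None) -> tuple[bool, int]:
    """Illustrative only; requires dist.init_process_group() to have run."""
    # A MIN-reduction over 0/1 flags acts as a logical AND across ranks,
    # so a single dissenting rank disables microbatching everywhere.
    flag = torch.tensor([1 if local_should_ubatch else 0], dtype=torch.int64)
    dist.all_reduce(flag, op=dist.ReduceOp.MIN, group=group)

    # A MAX-reduction gives every rank the same padded token count, keeping
    # the microbatch schedules in lockstep across the DP group.
    tokens = torch.tensor([local_num_tokens], dtype=torch.int64)
    dist.all_reduce(tokens, op=dist.ReduceOp.MAX, group=group)

    return bool(flag.item()), int(tokens.item())

When any rank opts out, the reduction yields False on every rank, which corresponds to the return (None, None) early exit in the hunk above.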