mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 14:53:52 +08:00)
[Misc][Speculative decoding] Typos and typing fixes (#6467)
Co-authored-by: caishangming.csm <caishangming.csm@alibaba-inc.com>
@@ -43,7 +43,7 @@ class MultiStepWorker(Worker, ProposerWorkerBase):
         )

     def set_include_gpu_probs_tensor(self) -> None:
-        # Need include_gpu_probs_tensor for multi_step_worker
+        # Need include_gpu_probs_tensor for MultiStepWorker
         self.model_runner.model.sampler.include_gpu_probs_tensor = True

     @torch.inference_mode()
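A note on the flag this comment documents: when set, the sampler keeps its per-step probability tensors on the GPU so the speculative pipeline can verify draft tokens without a device-to-host copy. A minimal sketch of the pattern; apart from the `include_gpu_probs_tensor` name, everything here is illustrative rather than vLLM's actual sampler:

    import torch

    class TinySampler:
        """Toy sampler that can optionally expose its GPU probs."""

        def __init__(self) -> None:
            self.include_gpu_probs_tensor = False

        def __call__(self, logits: torch.Tensor):
            probs = torch.softmax(logits, dim=-1)
            next_tokens = torch.argmax(probs, dim=-1)
            # With the flag set, also return the full probs tensor (on the
            # same device as the logits) for downstream verification.
            gpu_probs = probs if self.include_gpu_probs_tensor else None
            return next_tokens, gpu_probs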
@@ -13,7 +13,7 @@ from vllm.worker.worker_base import LoraNotSupportedWorkerBase
 class NGramWorker(NonLLMProposerWorkerBase, LoraNotSupportedWorkerBase):
     """NGramWorker provides a light drafter without need for model.

-    Current NGramWorker only implement prompt lookup decoding,
+    Current NGramWorker only implements prompt lookup decoding,
     and in future we may also do RAG type drafter and other scenarios
     which don't rely on LLM model to give proposals.
     """
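For context on the docstring's term: prompt lookup decoding drafts tokens without any model by matching the sequence's trailing n-gram against earlier occurrences in the context and proposing the tokens that followed the match. A self-contained sketch of the idea (not vLLM's implementation; the helper name is made up):

    from typing import List, Optional

    def prompt_lookup_propose(token_ids: List[int], ngram_size: int,
                              num_speculative: int) -> Optional[List[int]]:
        """Propose draft tokens via n-gram match against the context."""
        if len(token_ids) <= ngram_size:
            return None
        tail = token_ids[-ngram_size:]
        # Scan right to left so the most recent match wins.
        for start in range(len(token_ids) - ngram_size - 1, -1, -1):
            if token_ids[start:start + ngram_size] == tail:
                follow = token_ids[start + ngram_size:
                                   start + ngram_size + num_speculative]
                return follow or None
        return None

    # 'the cat sat on the' -> matches 'the', proposes 'cat sat'.
    print(prompt_lookup_propose([1, 2, 3, 4, 1], ngram_size=1,
                                num_speculative=2))  # [2, 3]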
@@ -37,7 +37,7 @@ class NGramWorker(NonLLMProposerWorkerBase, LoraNotSupportedWorkerBase):
         self.device = torch.device(f"cuda:{self.local_rank}")
         self.load_model = lambda *args, **kwargs: None

-        # Current only support Top1Proposer
+        # Current NGramWorker only supports Top1Proposer
         self._proposer = Top1Proposer(
             weakref.proxy(self),  # type: ignore[arg-type]
             device=self.device,
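A side note on the unchanged `weakref.proxy(self)` in the context lines: the worker owns the proposer while the proposer needs to call back into the worker, and handing over a proxy instead of a strong reference keeps that cycle collectable. A minimal illustration with hypothetical class names:

    import weakref

    class Proposer:
        def __init__(self, worker) -> None:
            # Store a proxy, not a strong reference, so the worker can be
            # garbage-collected despite the worker <-> proposer cycle.
            self._worker = worker

        def device(self) -> str:
            return self._worker.device  # attribute access forwards to the worker

    class Worker:
        def __init__(self) -> None:
            self.device = "cuda:0"
            self._proposer = Proposer(weakref.proxy(self))

    w = Worker()
    print(w._proposer.device())  # -> cuda:0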
@@ -24,7 +24,7 @@ class ProposerWorkerBase(LoraNotSupportedWorkerBase, SpeculativeProposer):
     ) -> Tuple[Optional[List[SamplerOutput]], bool]:
         raise NotImplementedError

-    def set_include_gpu_probs_tensor(self):
+    def set_include_gpu_probs_tensor(self) -> None:
         """Implementation optional"""
         pass

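The added `-> None` is more than cosmetic: an unannotated method is inferred as returning `Any`, so a caller that mistakenly uses the result goes unchecked. A small demonstration of what a checker such as mypy can flag once the hunk's annotation is in place (the class here is illustrative):

    class ProposerHook:
        def set_include_gpu_probs_tensor(self) -> None:
            """Implementation optional"""

        def untyped_hook(self):
            """Same no-op body, but its return type is inferred as Any."""

    hook = ProposerHook()
    flag = hook.set_include_gpu_probs_tensor()  # mypy: does not return a value
    legacy = hook.untyped_hook()                # silently typed as Any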
@@ -206,7 +206,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):

         self.probs_dtype = self.spec_decode_sampler.probs_dtype
         self.token_id_dtype = self.spec_decode_sampler.token_id_dtype
-        # Lazy initiazliation.
+        # Lazy initialization.
         self.scorer: SpeculativeScorer

         # Hidden states from target model to pass to proposer
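The line under the corrected comment, `self.scorer: SpeculativeScorer`, is a bare annotation: it declares the attribute's type in `__init__` without binding a value, leaving the actual assignment to a later initialization step. A minimal sketch of that pattern with stand-in classes:

    class Scorer:
        def score(self, value: int) -> int:
            return value * 2

    class Worker:
        def __init__(self) -> None:
            # Lazy initialization: declare the type now, bind later
            # (e.g. once the device is known). No attribute exists yet.
            self.scorer: Scorer

        def init_device(self) -> None:
            self.scorer = Scorer()

    w = Worker()
    w.init_device()  # reading w.scorer before this raises AttributeError
    print(w.scorer.score(21))  # -> 42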
@@ -138,7 +138,7 @@ class Top1Proposer(SpeculativeProposer):

         # Currently only proposal lens of 0 or the global batch proposal len
         # are supported.
-        # If max_proposal_len is defined, then we shall no exccess this
+        # If max_proposal_len is defined, then we shall no exceed this
         # quota for nonzero_proposal
         new_k = 0
         if (self.max_proposal_len is None
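The comments fixed here describe a cap on speculation: when `max_proposal_len` is set, sequences that would overrun it get a proposal length of 0 and are skipped. A hedged sketch of that shape; the helper name and exact comparison are illustrative, not vLLM's:

    from typing import List, Optional

    def split_proposal_lens(seq_lens: List[int], k: int,
                            max_proposal_len: Optional[int]) -> List[int]:
        """Per-sequence proposal lens: k while within quota, else 0."""
        proposal_lens = []
        for seq_len in seq_lens:
            if max_proposal_len is None or seq_len + k < max_proposal_len:
                proposal_lens.append(k)
            else:
                proposal_lens.append(0)  # skip speculation for this sequence
        return proposal_lens

    print(split_proposal_lens([10, 2040], k=8, max_proposal_len=2048))  # [8, 0]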
@@ -219,7 +219,7 @@ class Top1Proposer(SpeculativeProposer):
         proposal_lens: List[int],
         nonzero_proposal_len_indices: List[int],
         sampler_transposed: bool,
-    ) -> Tuple[torch.Tensor, torch.tensor, torch.Tensor]:
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """After speculations are produced, merge the speculation results with
         the skipped sequences.
         """
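The one-character fix in this hunk matters because `torch.tensor` (lowercase) is a factory function while `torch.Tensor` is the class; only the class is valid inside an annotation, and a type checker will flag the former. A quick demonstration:

    import torch

    print(type(torch.Tensor))  # a class, usable in annotations
    print(type(torch.tensor))  # a builtin function, not a type

    def make() -> torch.Tensor:         # annotate with the class...
        return torch.tensor([1, 2, 3])  # ...construct with the factory

    assert isinstance(make(), torch.Tensor)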