[Doc]: fix typos in Python comments (#24001)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
@@ -43,7 +43,7 @@ cudagraph_capturing_enabled: bool = True
 
 
 def validate_cudagraph_capturing_enabled():
-    # used to monitor whether an cudagraph capturing is legal at runtime.
+    # used to monitor whether a cudagraph capturing is legal at runtime.
     # should be called before any cudagraph capturing.
    # if an illegal cudagraph capturing happens, raise an error.
     global cudagraph_capturing_enabled
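For context, a minimal sketch of the runtime guard these comments describe, assuming the module-level flag shown in the hunk header; the error message is illustrative, not vLLM's actual code:

```python
# Minimal sketch of a CUDA graph capture guard (illustrative only).
cudagraph_capturing_enabled: bool = True

def validate_cudagraph_capturing_enabled():
    # Raise if capture is attempted after capturing has been disabled.
    if not cudagraph_capturing_enabled:
        raise RuntimeError(
            "CUDA graph capturing detected at an invalid point; "
            "capturing is currently disabled.")
```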
@@ -76,7 +76,7 @@ class LRUEvictor(Evictor):
     that's recorded in the Block. If there are multiple blocks with
     the same last_accessed time, then the one with the largest num_hashed_tokens
     will be evicted. If two blocks each have the lowest last_accessed time and
-    highest num_hashed_tokens value, then one will be chose arbitrarily
+    highest num_hashed_tokens value, then one will be chosen arbitrarily
     """
 
     # CLEANUP_THRESHOLD determines the maximum allowable size of the priority
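The docstring spells out a two-key eviction order; a small illustrative sketch of that victim selection follows (`_Block` is a hypothetical stand-in, not vLLM's real `Block`):

```python
from dataclasses import dataclass

@dataclass
class _Block:  # hypothetical stand-in for the real Block
    last_accessed: float
    num_hashed_tokens: int

def pick_victim(blocks: list[_Block]) -> _Block:
    # Lowest last_accessed wins; ties go to the largest num_hashed_tokens.
    return min(blocks, key=lambda b: (b.last_accessed, -b.num_hashed_tokens))
```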
@@ -1239,7 +1239,7 @@ class LLMEngine:
 
            # Stop the execute model loop in parallel workers until there are
            # more requests to process. This avoids waiting indefinitely in
-            # torch.distributed ops which may otherwise timeout, and unblocks
+            # torch.distributed ops which may otherwise time out, and unblocks
            # the RPC thread in the workers so that they can process any other
            # queued control plane messages, such as add/remove lora adapters.
            logger.debug("Stopping remote worker execution loop.")
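A hedged sketch of the control flow these comments belong to; the engine and executor method names here are assumptions inferred from the snippet, not verified signatures:

```python
# Hedged sketch (not vLLM's actual code): stop the workers' execute-model
# loop only when the scheduler has nothing left, so torch.distributed
# collectives are not left waiting until they time out.
def maybe_stop_worker_loop(engine) -> None:
    if not engine.has_unfinished_requests():  # assumed method name
        engine.model_executor.stop_remote_worker_execution_loop()  # assumed
```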
@@ -329,7 +329,7 @@ class LLM:
         Args:
             prompts: The prompts to the LLM. You may pass a sequence of prompts
                 for batch inference. See [PromptType][vllm.inputs.PromptType]
-                for more details about the format of each prompts.
+                for more details about the format of each prompt.
             sampling_params: The sampling parameters for text generation. If
                 None, we use the default sampling parameters.
                 When it is a single value, it is applied to every prompt.
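A short usage example of the batch-inference pattern this docstring describes; the model name is only an example:

```python
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # example model
prompts = ["Hello, my name is", "The capital of France is"]
# A single SamplingParams value is applied to every prompt in the batch.
params = SamplingParams(temperature=0.8, max_tokens=32)
for output in llm.generate(prompts, params):
    print(output.prompt, "->", output.outputs[0].text)
```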
@@ -853,7 +853,7 @@ class LLM:
         Args:
             prompts: The prompts to the LLM. You may pass a sequence of prompts
                 for batch inference. See [PromptType][vllm.inputs.PromptType]
-                for more details about the format of each prompts.
+                for more details about the format of each prompt.
             pooling_params: The pooling parameters for pooling. If None, we
                 use the default pooling parameters.
             use_tqdm: If `True`, shows a tqdm progress bar.
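The pooling entry points are used the same way; a hedged usage sketch (the example model and the `task` argument are illustrative and version-dependent):

```python
from vllm import LLM

# Hedged sketch: pooling models return one pooled output per prompt
# rather than generated text. API details may vary across vLLM versions.
llm = LLM(model="intfloat/e5-small-v2", task="embed")  # example model/task
outputs = llm.encode(["query: what is vLLM?", "query: what is LoRA?"])
print(len(outputs))  # one pooling output object per prompt
```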
@@ -946,7 +946,7 @@ class LLM:
         Args:
             prompts: The prompts to the LLM. You may pass a sequence of prompts
                 for batch inference. See [PromptType][vllm.inputs.PromptType]
-                for more details about the format of each prompts.
+                for more details about the format of each prompt.
             pooling_params: The pooling parameters for pooling. If None, we
                 use the default pooling parameters.
             use_tqdm: If `True`, shows a tqdm progress bar.
@@ -994,7 +994,7 @@ class LLM:
         Args:
             prompts: The prompts to the LLM. You may pass a sequence of prompts
                 for batch inference. See [PromptType][vllm.inputs.PromptType]
-                for more details about the format of each prompts.
+                for more details about the format of each prompt.
             use_tqdm: If `True`, shows a tqdm progress bar.
                 If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                 it is used to create the progress bar.
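The callable form of `use_tqdm` mentioned here can be built as below; the final call is commented out because it assumes the `llm` and `prompts` objects from the earlier generate example:

```python
import functools
from tqdm import tqdm

# A tqdm factory: the progress bar is created per call and removed when done.
bar_factory = functools.partial(tqdm, leave=False)
# outputs = llm.generate(prompts, sampling_params, use_tqdm=bar_factory)
```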
@@ -1038,7 +1038,7 @@ class LLM:
         Args:
             prompts: The prompts to the LLM. You may pass a sequence of prompts
                 for batch inference. See [PromptType][vllm.inputs.PromptType]
-                for more details about the format of each prompts.
+                for more details about the format of each prompt.
             use_tqdm: If `True`, shows a tqdm progress bar.
                 If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                 it is used to create the progress bar.
@@ -101,7 +101,7 @@ class MultiprocessingDistributedExecutor(DistributedExecutorBase):
        result_handler.start()
        self.worker_monitor.start()
 
-        # Set up signal handlers to shutdown the executor cleanly
+        # Set up signal handlers to shut down the executor cleanly
        # sometimes gc does not work well
 
        self.driver_worker = WorkerWrapperBase(self.vllm_config, 0)
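A hedged sketch of the signal-handler setup this comment refers to, not the executor's actual code; `shutdown()` is an assumed cleanup method:

```python
import signal

def install_shutdown_handlers(executor) -> None:
    # Register handlers so SIGINT/SIGTERM trigger an explicit shutdown
    # instead of relying on garbage collection, which "does not work well".
    def _handler(signum, frame):
        executor.shutdown()  # assumed cleanup method
        raise SystemExit(128 + signum)

    signal.signal(signal.SIGINT, _handler)
    signal.signal(signal.SIGTERM, _handler)
```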
@@ -605,7 +605,7 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
 
 class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
     """ColumnParallelLinear layer that is composed of 2 sublayers (slices)
-    packed together (eg. gate_proj + up_proj -> gate_up_proj).
+    packed together (e.g. gate_proj + up_proj -> gate_up_proj).
 
     This means we have 2 LoRAs, each applied to one half of the layer.
 
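A hedged sketch of the idea only, with illustrative shapes and names rather than the layer's real implementation: the packed gate_up output is split in half and each LoRA's low-rank update is added to its own slice:

```python
import torch

def merged_lora_forward(x: torch.Tensor,
                        base_out: torch.Tensor,
                        lora_a_list: list[torch.Tensor],
                        lora_b_list: list[torch.Tensor]) -> torch.Tensor:
    # base_out has shape [..., 2 * intermediate]; slice i gets LoRA i's delta.
    half = base_out.shape[-1] // 2
    for i, (lora_a, lora_b) in enumerate(zip(lora_a_list, lora_b_list)):
        delta = (x @ lora_a) @ lora_b  # low-rank update for slice i
        base_out[..., i * half:(i + 1) * half] += delta
    return base_out
```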
@@ -537,7 +537,7 @@ class Platform:
 
     def get_global_graph_pool(self) -> Any:
         """
-        Return the global graph pool for the this platform.
+        Return the global graph pool for this platform.
         """
        cls = self.__class__
        if cls._global_graph_pool is None:
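A hedged sketch of the lazy, class-level caching pattern visible in this hunk; the class name is hypothetical, and `torch.cuda.graph_pool_handle()` needs a CUDA-capable environment:

```python
from typing import Any

import torch

class CudaLikePlatform:  # hypothetical platform class for illustration
    _global_graph_pool: Any = None

    def get_global_graph_pool(self) -> Any:
        # Create the pool once and share it across all instances of the class.
        cls = self.__class__
        if cls._global_graph_pool is None:
            cls._global_graph_pool = torch.cuda.graph_pool_handle()
        return cls._global_graph_pool
```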
@@ -30,7 +30,7 @@ class HunyuanA13BReasoningParser(ReasoningParser):
     Key Features:
     - For non-stream output , Recognizes and extracts reasoning ("think")
       and answer ("answer") sections from text using regular expressions.
-    - For stream process, it require a token id sequences to change the
+    - For stream process, it requires a token id sequences to change the
       reasoning state and other state so it maintains internal state to
       manage parsing across multiple token.
 
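A hedged sketch of the non-streaming extraction described above; the `<think>`/`<answer>` tag spellings are assumptions for illustration, not necessarily the model's actual markers:

```python
import re

THINK_RE = re.compile(r"<think>(.*?)</think>", re.DOTALL)
ANSWER_RE = re.compile(r"<answer>(.*?)</answer>", re.DOTALL)

def split_reasoning(text: str) -> tuple[str | None, str]:
    # Return (reasoning, answer); fall back to the raw text if no answer tag.
    think = THINK_RE.search(text)
    answer = ANSWER_RE.search(text)
    return (think.group(1).strip() if think else None,
            answer.group(1).strip() if answer else text)
```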
@@ -2734,7 +2734,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                                                    layer_names)
        attn_backends = {}
        attn_backend_layers = defaultdict(list)
-        # Dedupe based on full class name; this is a bit safer than using
+        # Dedupe based on full class name; this is a bit safer than
        # using the class itself as the key because when we create dynamic
        # attention backend subclasses (e.g. ChunkedLocalAttention) unless
        # they are cached correctly, there will be different objects per
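A hedged sketch of the dedupe strategy these comments describe, keyed on the fully qualified class name; the helper and its argument are illustrative, not the runner's real signature:

```python
from collections import defaultdict

def group_layers_by_backend(layer_to_backend: dict[str, type]):
    # Key by "<module>.<qualname>" so dynamically created subclasses that
    # are not cached still collapse to a single entry per backend name.
    attn_backends = {}
    attn_backend_layers = defaultdict(list)
    for layer_name, backend_cls in layer_to_backend.items():
        key = f"{backend_cls.__module__}.{backend_cls.__qualname__}"
        attn_backends[key] = backend_cls
        attn_backend_layers[key].append(layer_name)
    return attn_backends, attn_backend_layers
```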
@@ -224,7 +224,7 @@ class Worker(WorkerBase):
        memory can be used for KV cache without OOMs.
 
        The engine will first conduct a profiling of the existing memory usage.
-        Then, it calculate the free memory that can be used for KV cache in
+        Then, it calculates the free memory that can be used for KV cache in
        bytes.
 
        Tip:
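A hedged sketch of the calculation this docstring describes; the helper is illustrative, and the real worker runs a full profiling forward pass where the comment marks it:

```python
import torch

def available_kv_cache_bytes(gpu_memory_utilization: float) -> int:
    # Profile peak usage, then treat the rest of the allowed GPU budget
    # (total * utilization fraction) as space for the KV cache, in bytes.
    torch.cuda.reset_peak_memory_stats()
    # ... run a profiling forward pass with maximum batch/sequence here ...
    peak = torch.cuda.max_memory_allocated()
    total = torch.cuda.get_device_properties(0).total_memory
    return int(total * gpu_memory_utilization) - peak
```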