Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 23:03:52 +08:00)
[Doc]: fix typos in various files (#24821)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
@@ -8,7 +8,7 @@ This benchmark aims to:

 Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.

-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)

 ## Setup
@@ -190,7 +190,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
         group_size = self.quant_config.group_size
         group_size_div_factor = 1

-        # make intermediate_size and hidden_size diviable by group_size
+        # make intermediate_size and hidden_size divisible by group_size
         # we reduce the group size to ensure that
         # and we would repeat the loaded_weight later
         while intermediate_size_per_partition % group_size or \
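
The comment fixed above describes a real constraint: when the quantization group size does not evenly divide the partitioned intermediate size or the hidden size, the group size is reduced and the loaded weights are repeated later by the same factor. A minimal sketch of that adjustment follows; the loop body after the `while` line shown in the hunk is not part of this diff, so the halving step, the `hidden_size` check, and the lower bound are assumptions for illustration only.

def adjust_group_size(intermediate_size_per_partition: int,
                      hidden_size: int,
                      group_size: int) -> tuple[int, int]:
    # Sketch only: shrink group_size until it divides both dimensions,
    # tracking the factor so loaded weights can be repeated accordingly.
    group_size_div_factor = 1
    while intermediate_size_per_partition % group_size or \
            hidden_size % group_size:
        group_size //= 2              # assumed reduction step
        group_size_div_factor *= 2
        assert group_size >= 16, "group_size shrank below a usable size"
    return group_size, group_size_div_factor

# e.g. adjust_group_size(2880, 4096, 128) -> (64, 2):
# 2880 is not divisible by 128, but both sizes are divisible by 64.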
@@ -19,7 +19,7 @@ class MarlinWorkspace:

     def __init__(self, out_features, min_thread_n, max_parallel):
         assert (out_features % min_thread_n == 0), (
-            "out_features = {} is undivisible by min_thread_n = {}".format(
+            "out_features = {} is indivisible by min_thread_n = {}".format(
                 out_features, min_thread_n))

         max_workspace_size = ((out_features // min_thread_n) * max_parallel)
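
Only the divisibility assert and the size computation appear in this hunk. A self-contained sketch of a workspace along these lines is below; the zero-initialized scratch buffer and its dtype are assumptions, since the lines after `max_workspace_size` are not shown in the diff.

import torch

class MarlinWorkspaceSketch:
    # Illustrative stand-in for the MarlinWorkspace shown above.
    def __init__(self, out_features: int, min_thread_n: int,
                 max_parallel: int):
        assert out_features % min_thread_n == 0, (
            "out_features = {} is indivisible by min_thread_n = {}".format(
                out_features, min_thread_n))
        # One workspace slot per thread-n tile, replicated per parallel worker.
        max_workspace_size = (out_features // min_thread_n) * max_parallel
        # Assumed continuation: a zeroed scratch buffer of that size.
        self.scratch = torch.zeros(max_workspace_size, dtype=torch.int)

# MarlinWorkspaceSketch(4096, 64, 16).scratch.numel() == (4096 // 64) * 16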
@@ -649,7 +649,7 @@ def _sample_with_torch(
     else:
         sampled_token_ids_tensor = None

-    # Counterintiutively, having two loops here is actually faster.
+    # Counterintuitively, having two loops here is actually faster.
     # The first loop can run without waiting on GPU<->CPU sync.
     for sampling_type in SamplingType:
         sample_indices = categorized_sample_indices[sampling_type]
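
The repaired comment explains a deferred-synchronization pattern: the first loop only launches GPU work, and the host-side readback that forces a GPU<->CPU sync is postponed to a second pass. The sketch below illustrates that pattern in isolation; the function and variable names are illustrative, not the actual `_sample_with_torch` internals.

import torch

def sample_in_two_loops(
        logits_by_group: dict[str, torch.Tensor]) -> dict[str, list[int]]:
    gpu_results: dict[str, torch.Tensor] = {}

    # Loop 1: enqueue sampling kernels; nothing here blocks on the GPU.
    for group, logits in logits_by_group.items():
        probs = torch.softmax(logits, dim=-1)
        gpu_results[group] = torch.multinomial(probs, num_samples=1)

    # Loop 2: copy results to the host; each .tolist() is where the
    # GPU<->CPU synchronization actually happens.
    return {group: ids.flatten().tolist()
            for group, ids in gpu_results.items()}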
@@ -1524,7 +1524,7 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None

         # The result multimodal_embeddings is tuple of tensors, with each
-        # tensor correspoending to a multimodal data item (image or video).
+        # tensor corresponding to a multimodal data item (image or video).
         multimodal_embeddings: tuple[torch.Tensor, ...] = ()

         # NOTE: It is important to iterate over the keys in this dictionary
@@ -738,7 +738,7 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
             return []

         # The result multimodal_embeddings is tuple of tensors, with each
-        # tensor correspoending to a multimodal data item (image or video).
+        # tensor corresponding to a multimodal data item (image or video).
         multimodal_embeddings: tuple[torch.Tensor, ...] = ()

         # NOTE: It is important to iterate over the keys in this dictionary
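
Both of the hunks above fix the same comment; the surrounding code builds a flat tuple of embeddings, one tensor per multimodal item, by walking the per-modality inputs in a stable order. A hedged sketch of that accumulation follows; the helper name and the dict-of-tuples input shape are assumptions, not the models' real signatures.

import torch

def collect_multimodal_embeddings(
        embeddings_by_modality: dict[str, tuple[torch.Tensor, ...]],
) -> tuple[torch.Tensor, ...]:
    # The result multimodal_embeddings is a tuple of tensors, one per
    # multimodal data item (image or video).
    multimodal_embeddings: tuple[torch.Tensor, ...] = ()
    # Iterating over the dict keys (insertion order in Python) keeps the
    # embeddings aligned with the order the items were parsed in.
    for modality in embeddings_by_modality:
        multimodal_embeddings += tuple(embeddings_by_modality[modality])
    return multimodal_embeddings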
@@ -662,7 +662,7 @@ def pad_and_concat_to_dim3(
     max_len = max(f.shape[-1] for f in features)
     # Ensure all features have dim=3
     features = [f.view(-1, *f.shape[-2:]) for f in features]
-    # Pad and oncatenate:
+    # Pad and concatenate:
     # [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)]
     features = [F.pad(f, (0, max_len - f.shape[-1])) for f in features]
    return torch.cat(features)
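
The body of this hunk already reads as a complete routine; a self-contained version is sketched below so the padding and concatenation can be run standalone. The flat-list signature is a simplification, since the original function signature is not shown in the diff.

import torch
import torch.nn.functional as F

def pad_and_concat_to_dim3(features: list[torch.Tensor]) -> torch.Tensor:
    max_len = max(f.shape[-1] for f in features)
    # Ensure all features have dim=3
    features = [f.view(-1, *f.shape[-2:]) for f in features]
    # Pad and concatenate:
    # [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)]
    features = [F.pad(f, (0, max_len - f.shape[-1])) for f in features]
    return torch.cat(features)

# e.g. inputs of shape (2, 80, 100) and (3, 80, 120) concatenate into a
# single (5, 80, 120) tensor after right-padding the last dimension.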