fix some typos (#24616)
Signed-off-by: co63oc <co63oc@users.noreply.github.com>
@@ -23,7 +23,7 @@ class TestSetting:
     fullgraph: bool
 
 
-# we cannot afford testing the full Catesian product
+# we cannot afford testing the full Cartesian product
 # of all models and all levels
 @pytest.mark.parametrize(
     "test_setting",
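The comment being fixed explains the test-matrix design: the suite parametrizes over a curated list of settings because the full Cartesian product of models and compilation levels is too expensive to run. A minimal sketch of that pattern, with placeholder model names and levels rather than vLLM's actual matrix:

import pytest

# A hand-picked subset of (model, level) pairs instead of the full
# Cartesian product, which would be too expensive to test.
TEST_SETTINGS = [
    ("model-a", 0),
    ("model-a", 3),
    ("model-b", 3),
]

@pytest.mark.parametrize("test_setting", TEST_SETTINGS)
def test_compile(test_setting):
    model, level = test_setting
    assert level in range(4)  # placeholder check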
@@ -345,7 +345,7 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
     # in the mamba2 ssd kernels, by comparing concatenation (in the sequence
     # dimension) of chunked results with the full sequence result.
     # It is different from test_mamba_chunk_scan_cont_batch by:
-    # 1. Not using the naive torch implementaion (ssd_minimal_discrete) to get
+    # 1. Not using the naive torch implementation (ssd_minimal_discrete) to get
     # reference outputs. Instead, it compares chunked kernel outputs to full
     # sequence kernel outputs. This is the most straightforward way to
     # assert chunked prefill correctness.
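The corrected docstring describes the test strategy: run the same kernel once over the full sequence and once over sequence chunks (carrying state between chunks), then compare the concatenation of the chunked outputs against the full-sequence output. A minimal sketch of that comparison pattern, with a stand-in scan function in place of the real mamba2 ssd kernels:

import torch

def scan(x, state=None):
    # Stand-in for a chunked scan kernel: a cumulative sum that can
    # resume from a carried state (the real kernels carry SSM state).
    if state is None:
        state = torch.zeros(x.shape[-1])
    out = torch.cumsum(x, dim=0) + state
    return out, out[-1]

x = torch.randn(32, 8)

# Full-sequence reference output.
full, _ = scan(x)

# Chunked execution: feed the sequence in pieces, carrying state.
chunks, state = [], None
for piece in torch.split(x, 8, dim=0):
    out, state = scan(piece, state)
    chunks.append(out)

# Concatenation of chunked results must match the full-sequence result.
torch.testing.assert_close(torch.cat(chunks, dim=0), full)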
@@ -179,7 +179,7 @@ def chunk_local_cumsum_vector(
     def grid(meta):
         return (triton.cdiv(meta['S'], meta['BS']), NT, B * H)
 
-    # keep cummulative normalizer in fp32
+    # keep cumulative normalizer in fp32
     # this kernel is equivalent to
     # g = g.view(B, H, NT, BT, -1).cumsum(-2).view(B, H, T, -1)
     chunk_local_cumsum_vector_kernel[grid](g_org,
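The comment next to the fix documents the kernel's reference semantics: a cumulative sum computed independently within each chunk of length BT, where NT is the number of chunks and T = NT * BT. A quick PyTorch check of that view/cumsum/view identity against a per-chunk loop (shapes here are arbitrary):

import torch

B, H, T, D = 2, 4, 16, 8
BT = 4            # chunk size
NT = T // BT      # number of chunks
g = torch.randn(B, H, T, D)

# Chunk-local cumsum via the reshape trick from the comment.
fast = g.view(B, H, NT, BT, -1).cumsum(-2).view(B, H, T, -1)

# Reference: cumsum over each chunk independently, then reassemble.
ref = torch.cat([c.cumsum(-2) for c in g.split(BT, dim=2)], dim=2)

torch.testing.assert_close(fast, ref)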
@@ -1322,7 +1322,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
         k_scale: torch.Tensor,
         dcp_world_size: int,
     ):
-        assert k_scale is None, "DCP not support sacled kvcache now."
+        assert k_scale is None, "DCP not support scaled kvcache now."
         assert attn_metadata.prefill is not None
         prefill_metadata = attn_metadata.prefill
         assert prefill_metadata.chunked_context is not None
@@ -112,9 +112,9 @@ class BlockTable:
         # tokens.
         virtual_block_offsets = positions % virtual_block_size
         mask = virtual_block_offsets % self.dcp_world_size == self.dcp_rank
-        # Calcuate local block_offsets
+        # Calculate local block_offsets
         block_offsets = virtual_block_offsets // self.dcp_world_size
-        # Calcuate slot_mapping
+        # Calculate slot_mapping
         slot_mapping = block_numbers * self.block_size + block_offsets
         # Write final slots, use -1 for not-local
         self.slot_mapping_np[:req_indices.shape[0]] = np.where(
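The corrected comments annotate the decode-context-parallel (DCP) slot-mapping arithmetic: tokens are interleaved across DCP ranks within a "virtual" block, each rank keeps only the positions it owns, and non-local positions get slot -1. A standalone numpy sketch of that arithmetic; block_numbers is assumed to come from a block-table lookup, and all values here are illustrative:

import numpy as np

block_size = 4
dcp_world_size = 2
dcp_rank = 0
virtual_block_size = block_size * dcp_world_size

positions = np.arange(10)                      # token positions
block_numbers = np.zeros(10, dtype=np.int64)   # stand-in block-table lookup

# Offset within the virtual block that spans all DCP ranks.
virtual_block_offsets = positions % virtual_block_size
# This rank owns every dcp_world_size-th token.
mask = virtual_block_offsets % dcp_world_size == dcp_rank
# Calculate local block_offsets.
block_offsets = virtual_block_offsets // dcp_world_size
# Calculate slot_mapping.
slot_mapping = block_numbers * block_size + block_offsets
# Write final slots, use -1 for not-local.
print(np.where(mask, slot_mapping, -1))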