[CI] change spell checker from codespell to typos (#18711)

Signed-off-by: Andy Xie <andy.xning@gmail.com>
Author: Ning Xie
Date: 2025-06-12 10:57:10 +08:00
Committed by: GitHub
Parent: 42f52cc95b
Commit: 2f1c19b245
57 changed files with 335 additions and 163 deletions

.gitignore (vendored): 2 changed lines

@@ -200,5 +200,5 @@ benchmarks/**/*.json
actionlint
shellcheck*/
# Ingore moe/marlin_moe gen code
# Ignore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*


@@ -20,12 +20,10 @@ repos:
args: [--output-format, github, --fix]
- id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
- repo: https://github.com/crate-ci/typos
rev: v1.32.0
hooks:
- id: codespell
additional_dependencies: ['tomli']
args: ['--toml', 'pyproject.toml']
- id: typos
- repo: https://github.com/PyCQA/isort
rev: 6.0.1
hooks:
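
With the hook swapped in, the spell check runs through the usual pre-commit workflow. A minimal local sketch, assuming pre-commit is already installed and using the typos hook id declared above:

    # set up the git hooks once per clone
    pre-commit install
    # run only the spell-check hook across the whole tree
    pre-commit run typos --all-files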


@@ -137,8 +137,8 @@ FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
}
template <typename T>
FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data,
const int size) {
FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
const int size) {
T max = max_data[0];
for (int i = 1; i < size; ++i) {
max = max >= max_data[i] ? max : max_data[i];
@@ -634,7 +634,7 @@ struct paged_attention_v2_impl {
if (partition_num == 1) continue;
reducePartitonSoftmax(
reducePartitionSoftmax(
max_logits + seq_idx * num_heads * max_num_partitions +
head_idx * max_num_partitions,
exp_sums + seq_idx * num_heads * max_num_partitions +


@@ -83,7 +83,7 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
explicit FP16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
// non-temproal load
// non-temporal load
explicit FP16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
@@ -120,7 +120,7 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
explicit BF16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
// non-temproal load
// non-temporal load
explicit BF16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
@@ -327,7 +327,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
// normal load
explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}
// non-temproal load
// non-temporal load
explicit FP32Vec16(bool, void* ptr)
: reg((__m512)_mm512_stream_load_si512(ptr)) {}
@@ -576,7 +576,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
// normal load
explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
// non-temproal load
// non-temporal load
explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
@@ -587,7 +587,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
_mm512_mask_storeu_epi8(ptr, mask, reg);
}
// non-temproal save
// non-temporal save
void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
};
#endif


@@ -12,7 +12,7 @@ void moe_permute(
const torch::Tensor& input, // [n_token, hidden]
const torch::Tensor& topk_weights, //[n_token, topk]
torch::Tensor& topk_ids, // [n_token, topk]
const torch::Tensor& token_expert_indicies, // [n_token, topk]
const torch::Tensor& token_expert_indices, // [n_token, topk]
const std::optional<torch::Tensor>& expert_map, // [n_expert]
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,
@@ -27,15 +27,15 @@ void moe_permute(
"expert_first_token_offset must be int64");
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
"topk_ids must be int32");
TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int,
"token_expert_indicies must be int32");
TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
"token_expert_indices must be int32");
TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
"src_row_id2dst_row_id_map must be int32");
TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
"expert_first_token_offset shape != n_local_expert+1")
TORCH_CHECK(
src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(),
"token_expert_indicies shape must be same as src_row_id2dst_row_id_map");
src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
"token_expert_indices shape must be same as src_row_id2dst_row_id_map");
auto n_token = input.sizes()[0];
auto n_hidden = input.sizes()[1];
auto align_block_size_value =
@@ -71,7 +71,7 @@ void moe_permute(
expert_map_ptr, n_expert, stream);
}
// expert sort topk expert id and scan expert id get expert_first_token_offset
sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indicies),
sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
get_ptr<int>(permuted_experts_id),
get_ptr<int>(dst_row_id2src_row_id_map),
get_ptr<int64_t>(expert_first_token_offset), n_token,
@@ -190,7 +190,7 @@ void shuffle_rows(const torch::Tensor& input_tensor,
void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
torch::Tensor& topk_ids,
const torch::Tensor& token_expert_indicies,
const torch::Tensor& token_expert_indices,
const std::optional<torch::Tensor>& expert_map,
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,
@@ -203,7 +203,7 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
void moe_unpermute(const torch::Tensor& input,
const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
const torch::Tensor& token_expert_indicies,
const torch::Tensor& token_expert_indices,
const std::optional<torch::Tensor>& expert_map,
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,


@@ -425,7 +425,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>( \
gating_output, nullptr, topk_weights, topk_indicies, \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, \
stream);
@@ -433,7 +433,7 @@ template <typename IndType>
void topkGatingSoftmaxKernelLauncher(
const float* gating_output,
float* topk_weights,
IndType* topk_indicies,
IndType* topk_indices,
int* token_expert_indices,
float* softmax_workspace,
const int num_tokens,
@@ -476,7 +476,7 @@ void topkGatingSoftmaxKernelLauncher(
moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
gating_output, nullptr, softmax_workspace, num_experts);
moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
softmax_workspace, nullptr, topk_weights, topk_indicies, token_expert_indices,
softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
num_experts, topk, 0, num_experts);
}
}


@@ -66,7 +66,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
m.def(
"moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
"Tensor token_expert_indicies, Tensor? expert_map, int n_expert,"
"Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
"int n_local_expert,"
"int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
"expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "


@@ -1003,7 +1003,7 @@ struct MacheteCollectiveMma {
static constexpr int A_CPY_VEC =
decltype(max_common_vector(tCsA, tCrA_load)){};
static constexpr int COVERSION_WIDTH =
static constexpr int CONVERSION_WIDTH =
std::min(A_CPY_VEC, int(size<0>(tCrA_mma)));
auto load_A_to_registers = [&](int read_stage) {
@@ -1026,8 +1026,8 @@ struct MacheteCollectiveMma {
// PIPELINED MAIN LOOP
//
auto convert_A = [&, a_vec = Int<COVERSION_WIDTH>{}](int k_block,
int read_stage) {
auto convert_A = [&, a_vec = Int<CONVERSION_WIDTH>{}](int k_block,
int read_stage) {
load_extra_info_to_registers(partitioned_extra_info,
copy_partitions_extra_info, k_block,
read_stage);


@@ -320,7 +320,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
// Goal is to bring the activation matrix A to the LDS
// and use it across the lifetime of the work group
// TODO: When activation matrix is larger than 64 KB
// then this is not goint to work!
// then this is not going to work!
//----------------------------------------------------
__shared__ scalar_t s[max_lds_len];
@@ -581,7 +581,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
// Goal is to bring the activation matrix A to the LDS
// and use it across the lifetime of the work group
// TODO: When activation matrix is larger than 64 KB
// then this is not goint to work!
// then this is not going to work!
//----------------------------------------------------
__shared__ scalar_t s[max_lds_len];
@@ -601,7 +601,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
// int _WvPrGrp = mindiv(N, CuCount * YTILE, WvPrGrp);
uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE;
// Check whether there will be fragmenation!
// Check whether there will be fragmentation!
// This will happen only for the last wave!
if (m < M && (m + YTILE) >= M) {
uint32_t startColumn = M - YTILE;
@@ -827,7 +827,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
m += CuCount * _WvPrGrp * YTILE;
// Check whether there will be fragmenation!
// Check whether there will be fragmentation!
// This will happen only for the last wave!
if (m < M && (m + YTILE) >= M) {
uint32_t startColumn = M - YTILE;
@@ -882,7 +882,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
// Goal is to bring the activation matrix A to the LDS
// and use it across the lifetime of the work group
// TODO: When activation matrix is larger than 64 KB
// then this is not goint to work!
// then this is not going to work!
//----------------------------------------------------
__shared__ scalar_t s[max_lds_len];
@@ -904,7 +904,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
//----------------------------------------------------
uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE;
// Check whether there will be fragmenation!
// Check whether there will be fragmentation!
// This will happen only for the last wave!
if (m < M && (m + YTILE) >= M) {
uint32_t startColumn = M - YTILE;
@@ -1176,7 +1176,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
m += CuCount * _WvPrGrp * YTILE;
kBase = 0;
// Check whether there will be fragmenation!
// Check whether there will be fragmentation!
// This will happen only for the last wave!
if (m < M && (m + YTILE) >= M) {
uint32_t startColumn = M - YTILE;


@@ -277,7 +277,7 @@ CompressorResult cutlass_sparse_compress_sm90(torch::Tensor const& a) {
uint32_t const m = 1; // Set M to 1 for compression
uint32_t const n = a.size(1);
// Note: For correctess, the compressed format must be invariant in:
// Note: For correctness, the compressed format must be invariant in:
// - M, the flattened number of tokens
// - Whether output dtype is fp16 or bf16
// - CUTLASS epilogues


@@ -137,10 +137,6 @@ exclude = [
'vllm/attention/ops/.*\.py$'
]
[tool.codespell]
ignore-words-list = "dout, te, indicies, subtile, ElementE"
skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora/data/*,build/*,vllm/third_party/*"
[tool.isort]
skip_glob = [
".buildkite/*",


@@ -223,7 +223,7 @@ def test_async_tp_pass_correctness(
"VLLM_USE_V1": "1",
}
aysnc_tp_args = [
async_tp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
@@ -242,7 +242,7 @@ def test_async_tp_pass_correctness(
]
compare_two_settings(model_id,
aysnc_tp_args,
async_tp_args,
tp_args,
async_tp_env,
tp_env,


@@ -437,8 +437,8 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
"enable_prefix_caching": True,
}])
@pytest.mark.parametrize("seed", [1])
def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
test_llm_generator):
def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
test_llm_generator):
"""Verify block manager v2 with auto prefix caching could works normal
even when eviction started.
With APC enabled, all blocks are held by native block at the beginning.


@@ -33,8 +33,8 @@ BLOCK_SIZE = 16
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
batch_size, seed, backend, monkeypatch):
def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
batch_size, seed, backend, monkeypatch):
"""
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
asks for value of one of them (which is outside the sliding window).
@@ -100,7 +100,7 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
backend, monkeypatch):
"""
This is similar to test_sliding_window_retrival, however, it doesn't
This is similar to test_sliding_window_retrieval, however, it doesn't
compare against the v1 block manager since v1 doesn't support
chunked prefill with sliding window.


@@ -594,8 +594,8 @@ def test_decode_schedule_preempted():
# should be preempted. 1 will also be preempted.
budget = create_token_budget()
output = scheduler._schedule_running(budget, curr_loras)
remainig_running = scheduler.running
assert len(remainig_running) == 0
remaining_running = scheduler.running
assert len(remaining_running) == 0
assert len(output.decode_seq_groups) == 1
assert len(output.prefill_seq_groups) == 0
assert output.decode_seq_groups[0].seq_group.request_id == "0"


@@ -16,7 +16,7 @@ chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()
# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT = [
MODEL_TEMPLATE_GENERATION_OUTPUT = [
("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
@@ -91,7 +91,7 @@ def test_no_load_chat_template_literallike():
@pytest.mark.parametrize(
"model,template,add_generation_prompt,continue_final_message,expected_output",
MODEL_TEMPLATE_GENERATON_OUTPUT)
MODEL_TEMPLATE_GENERATION_OUTPUT)
def test_get_gen_prompt(model, template, add_generation_prompt,
continue_final_message, expected_output):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)


@@ -72,8 +72,8 @@ def test_copy_blocks(
# destination blocks.
assert 2 * num_mappings <= num_blocks
src_blocks = random.sample(range(num_blocks), num_mappings)
remainig_blocks = list(set(range(num_blocks)) - set(src_blocks))
dst_blocks = random.sample(remainig_blocks, 2 * num_mappings)
remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
dst_blocks = random.sample(remaining_blocks, 2 * num_mappings)
block_mapping: list[tuple[int, int]] = []
for i in range(num_mappings):
src = src_blocks[i]
@@ -189,12 +189,12 @@ def test_reshape_and_cache(
# Run the reference implementation.
reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
block_indicies_lst = block_indicies.cpu().tolist()
block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
block_indices_lst = block_indices.cpu().tolist()
block_offsets = slot_mapping % block_size
block_offsets_lst = block_offsets.cpu().tolist()
for i in range(num_tokens):
block_idx = block_indicies_lst[i]
block_idx = block_indices_lst[i]
block_offset = block_offsets_lst[i]
cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
cloned_value_cache[block_idx, :, :, block_offset] = value[i]
@@ -322,12 +322,12 @@ def test_reshape_and_cache_flash(
kv_dtype=kv_cache_dtype)
# Run the reference implementation.
block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
block_indicies_lst = block_indicies.cpu().tolist()
block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
block_indices_lst = block_indices.cpu().tolist()
block_offsets = slot_mapping % block_size
block_offsets_lst = block_offsets.cpu().tolist()
for i in range(num_tokens):
block_idx = block_indicies_lst[i]
block_idx = block_indices_lst[i]
block_offset = block_offsets_lst[i]
if kv_cache_layout == "NHD":
cloned_key_cache[block_idx, block_offset, :, :] = key[i]


@@ -46,7 +46,7 @@ CUDA_DEVICE = "cuda:0"
MAX_DEC_SEQ_LENS = [128]
MAX_ENC_SEQ_LENS = [128]
# Narrow teest-cases for unsupported-scenario
# Narrow test-cases for unsupported-scenario
# tests
HEAD_SIZES_FOR_UNSUPP = [HEAD_SIZES[0]]


@@ -39,10 +39,10 @@ def rotary_embedding_opcheck(rot,
@pytest.mark.parametrize("head_size", [32, 108])
@pytest.mark.parametrize("seq_len", [11, 1024])
@pytest.mark.parametrize("use_key", [True, False])
@pytest.mark.parametrize("head_stride_is_contingous", [True, False])
@pytest.mark.parametrize("head_stride_is_contiguous", [True, False])
def test_rotary_embedding_opcheck(dist_init, device, max_position,
is_neox_style, rotary_dim, head_size,
seq_len, use_key, head_stride_is_contingous):
seq_len, use_key, head_stride_is_contiguous):
batch_size = 1
base = 10000
num_heads = 7
@@ -52,7 +52,7 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position,
positions = torch.randint(0,
max_position, (batch_size, seq_len),
device=device)
head_stride = head_size + (64 if head_stride_is_contingous else 0)
head_stride = head_size + (64 if head_stride_is_contiguous else 0)
query = torch.randn(batch_size,
seq_len,
@@ -72,7 +72,7 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position,
# if we have a contiguous head stride, test the alternate
# [..., num_heads * head_dim] shape/layout
if head_stride_is_contingous:
if head_stride_is_contiguous:
rotary_embedding_opcheck(
rot, positions, query.flatten(start_dim=-2),
key.flatten(start_dim=-2) if use_key else None)


@@ -107,15 +107,15 @@ def generate_random_inputs(batch_size,
return A, dt, X, B, C
def generate_continous_batched_examples(example_lens_by_batch,
num_examples,
full_length,
last_taken,
exhausted,
n_heads,
d_head,
itype,
device='cuda'):
def generate_continuous_batched_examples(example_lens_by_batch,
num_examples,
full_length,
last_taken,
exhausted,
n_heads,
d_head,
itype,
device='cuda'):
# this function generates a random examples of certain length
# and then cut according to "example_lens_by_batch" and feed
@@ -269,11 +269,10 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
exhausted: dict = {} # map: eg -> boolean indicating example is exhausted
states = None
for Y_min, cu_seqlens, seq_idx, (A, dt, X, B,
C) in generate_continous_batched_examples(
cases, num_examples, seqlen,
last_taken, exhausted, n_heads,
d_head, itype):
for Y_min, cu_seqlens, seq_idx, (
A, dt, X, B, C) in generate_continuous_batched_examples(
cases, num_examples, seqlen, last_taken, exhausted, n_heads,
d_head, itype):
chunk_indices, chunk_offsets = \
_query_start_loc_to_chunk_indices_offsets(


@@ -118,7 +118,7 @@ def run_test(
# default to enforce_eager=True if enforce_eager
# is left unspecified. However, the
# VllmRunner test fixture (which wraps around the LLM class) defaults to
# enforce_eager=False (a behavior which a number of already-exisitng
# enforce_eager=False (a behavior which a number of already-existing
# decoder-only unit tests expect), so when testing an encoder/decoder
# model we must explicitly specify enforce_eager=True in the VllmRunner
# constructor.


@@ -248,7 +248,7 @@ def test_temperature_zero_target_distribution(seed: int, device: str):
size=(batch_size, 1),
dtype=torch.int64)
# The target probaility distribution is a temperature zero distribution
# with zero entroy. Since our draft token ids don't match the probability
# with zero entropy. Since our draft token ids don't match the probability
# 1.0 tokens in the target distribution we will reject all of them and
# fallback to the greedy sampling for selecting 1 token for each sequence.
# Verify the same.


@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
* Test greedy equality under various number of speculative tokens.
With those tests, we can say at least, EAGLE would not break the
correctess for the target model outputs.
correctness for the target model outputs.
"""
import pytest


@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
* Test greedy equality under various number of speculative tokens.
With those tests, we can say at least, Medusa would not break the
correctess for the target model outputs.
correctness for the target model outputs.
"""
import pytest


@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
* Test greedy equality under various number of speculative tokens.
With those tests, we can say at least, mtp would not break the
correctess for the target model outputs.
correctness for the target model outputs.
"""
import pytest


@@ -22,8 +22,8 @@ However, we still need to verify below scenario could be passed:
* Test greedy equality under preemption
* Test greedy equality under various ngram sizes / speculative sizes
With those tests, we can say at least, ngram spec would not break the correctess
for the target model outputs.
With those tests, we can say at least, ngram spec would not break the
correctness for the target model outputs.
"""
import pytest


@@ -30,7 +30,7 @@ model_config = {
])
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
def test_sliding_window_retrival(monkeypatch, model, batch_size, seed):
def test_sliding_window_retrieval(monkeypatch, model, batch_size, seed):
"""
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
asks for value of one of them (which is outside the sliding window).


@@ -7,7 +7,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
from .utils import create_request, create_scheduler, create_vllm_config
def test_basic_inferface():
def test_basic_interface():
"""Unit test for basic NixlConnector interface functionality."""
vllm_config = create_vllm_config()
@@ -25,7 +25,7 @@ def test_basic_inferface():
scheduler.add_request(request)
# Remote Prefill, triggers NixlConnectorMetdata.
# Remote Prefill, triggers NixlConnectorMetadata.
scheduler_output = scheduler.schedule()
kv_connector_metadata = scheduler_output.kv_connector_metadata
assert kv_connector_metadata is not None


@@ -32,7 +32,7 @@ def test_prompt_logprobs_e2e():
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
def test_promt_logprobs_e2e_server():
def test_prompt_logprobs_e2e_server():
with RemoteOpenAIServer(MODEL, SERVER_ARGS) as remote_server:
url = f"{remote_server.url_for('v1')}/completions"


@@ -209,32 +209,32 @@ def test_multi_step_model_runner_input():
received_model_input = (StatefulModelInput.from_broadcasted_tensor_dict(
tensor_dict, attn_backend=attn_backend))
receieved_frozen_input = received_model_input.frozen_model_input
received_frozen_input = received_model_input.frozen_model_input
# Check that received copy has correct values.
assert isinstance(received_model_input, StatefulModelInput)
assert receieved_frozen_input.input_tokens is not None
assert (receieved_frozen_input.input_tokens ==
assert received_frozen_input.input_tokens is not None
assert (received_frozen_input.input_tokens ==
frozen_model_input.input_tokens).all()
assert receieved_frozen_input.input_positions is not None
assert (receieved_frozen_input.input_positions ==
assert received_frozen_input.input_positions is not None
assert (received_frozen_input.input_positions ==
frozen_model_input.input_positions).all()
assert receieved_frozen_input.multi_modal_kwargs is None
assert received_frozen_input.multi_modal_kwargs is None
assert (frozen_model_input.multi_modal_kwargs ==
frozen_model_input.multi_modal_kwargs)
assert receieved_frozen_input.lora_requests is None
assert (receieved_frozen_input.lora_requests ==
assert received_frozen_input.lora_requests is None
assert (received_frozen_input.lora_requests ==
frozen_model_input.lora_requests)
assert receieved_frozen_input.lora_mapping is None
assert received_frozen_input.lora_mapping is None
assert (
receieved_frozen_input.lora_mapping == frozen_model_input.lora_mapping)
received_frozen_input.lora_mapping == frozen_model_input.lora_mapping)
for field in dataclasses.fields(AttentionMetadata):
assert getattr(receieved_frozen_input.attn_metadata, field.name,
assert getattr(received_frozen_input.attn_metadata, field.name,
None) == getattr(attn_metadata, field.name, None)
# For sampling metadata, only selected_token_indices is copied.
assert (receieved_frozen_input.sampling_metadata.selected_token_indices ==
assert (received_frozen_input.sampling_metadata.selected_token_indices ==
sampling_metadata.selected_token_indices)
assert receieved_frozen_input.sampling_metadata.seq_groups is None
assert received_frozen_input.sampling_metadata.seq_groups is None
# check non frozen fields
assert received_model_input.is_last_step == model_input.is_last_step


@@ -116,7 +116,7 @@ def ReadTargets(log, show_all):
# If ninja.exe is rudely halted then the .ninja_log file may be
# corrupt. Silently continue.
continue
start, end, _, name, cmdhash = parts # Ignore restat.
start, end, _, name, cmdhash = parts # Ignore restart.
# Convert from integral milliseconds to float seconds.
start = int(start) / 1000.0
end = int(end) / 1000.0

typos.toml (new file): 179 added lines

@@ -0,0 +1,179 @@
[files]
# these files may be written in non english words
extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
"benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
"vllm/third_party/*"]
ignore-hidden = true
ignore-files = true
ignore-dot = true
ignore-vcs = true
ignore-global = true
ignore-parent = true
[default]
binary = false
check-filename = false
check-file = true
unicode = true
ignore-hex = true
identifier-leading-digits = false
locale = "en"
extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*", ".*fo.*", ".*ba.*",
".*ot.*", ".*[Tt]h[rR].*"]
extend-ignore-words-re = []
extend-ignore-re = []
[default.extend-identifiers]
bbc5b7ede = "bbc5b7ede"
womens_doubles = "womens_doubles"
v_2nd = "v_2nd"
splitted_input = "splitted_input"
NOOPs = "NOOPs"
typ = "typ"
nin_shortcut = "nin_shortcut"
UperNetDecoder = "UperNetDecoder"
subtile = "subtile"
cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
SFOuput = "SFOuput"
# huggingface transformers repo uses these words
depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
depthwise_seperable_CNN = "depthwise_seperable_CNN"
[default.extend-words]
iy = "iy"
tendencias = "tendencias"
# intel cpu features
tme = "tme"
dout = "dout"
Pn = "Pn"
arange = "arange"
[type.py]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.py.extend-identifiers]
arange = "arange"
NDArray = "NDArray"
EOFError = "EOFError"
[type.py.extend-words]
[type.cpp]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.cpp.extend-identifiers]
countr_one = "countr_one"
[type.cpp.extend-words]
[type.rust]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.rust.extend-identifiers]
flate2 = "flate2"
[type.rust.extend-words]
ser = "ser"
[type.lock]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.lock.extend-identifiers]
[type.lock.extend-words]
[type.jl]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.jl.extend-identifiers]
[type.jl.extend-words]
modul = "modul"
egals = "egals"
usig = "usig"
egal = "egal"
[type.go]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.go.extend-identifiers]
flate = "flate"
[type.go.extend-words]
[type.css]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.css.extend-identifiers]
nd = "nd"
[type.css.extend-words]
[type.man]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.man.extend-identifiers]
Nd = "Nd"
[type.man.extend-words]
[type.cert]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.cert.extend-identifiers]
[type.cert.extend-words]
[type.sh]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.sh.extend-identifiers]
stap = "stap"
ot = "ot"
[type.sh.extend-words]
[type.vimscript]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.vimscript.extend-identifiers]
windo = "windo"
[type.vimscript.extend-words]
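
The typos CLI picks up a typos.toml at the repository root automatically, so the excludes and identifier allow-lists above apply to manual runs as well as to the pre-commit hook. A quick sketch, assuming the crate-ci typos binary is installed and on PATH:

    # report misspellings using the settings in typos.toml
    typos
    # optionally apply the suggested fixes in place
    typos --write-changes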


@@ -1550,10 +1550,10 @@ def moe_wna16_gemm(input: torch.Tensor, output: torch.Tensor,
def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
token_expert_indicies: torch.Tensor,
token_expert_indices: torch.Tensor,
gating_output: torch.Tensor) -> None:
torch.ops._moe_C.topk_softmax(topk_weights, topk_ids,
token_expert_indicies, gating_output)
torch.ops._moe_C.topk_softmax(topk_weights, topk_ids, token_expert_indices,
gating_output)
def moe_wna16_marlin_gemm(input: torch.Tensor, output: Optional[torch.Tensor],


@@ -373,7 +373,7 @@ class CommonAttentionState(AttentionState):
f"Expected attn_backend name to be either 'XFORMERS'," \
f"'ROCM_FLASH', or 'FLASH_ATTN', but " \
f"got '{self.runner.attn_backend.get_name()}'"
self._add_additonal_input_buffers_for_enc_dec_model(
self._add_additional_input_buffers_for_enc_dec_model(
attn_metadata=attn_metadata, input_buffers=input_buffers)
return input_buffers
@@ -427,7 +427,7 @@ class CommonAttentionState(AttentionState):
attn_metadata.max_encoder_seq_len = self.runner.max_seq_len_to_capture
attn_metadata.num_encoder_tokens = 0
def _add_additonal_input_buffers_for_enc_dec_model(
def _add_additional_input_buffers_for_enc_dec_model(
self, attn_metadata, input_buffers: Dict[str, Any]):
"""
Saves additional input buffers specific to the encoder-decoder model


@@ -40,7 +40,7 @@ class Internlm2ToolParser(ToolParser):
request.skip_special_tokens = False
return request
def get_argments(self, obj):
def get_arguments(self, obj):
if "parameters" in obj:
return obj.get("parameters")
elif "arguments" in obj:
@@ -119,9 +119,9 @@ class Internlm2ToolParser(ToolParser):
# now we know we're on the same tool call and we're streaming
# arguments
else:
prev_arguments = self.get_argments(
prev_arguments = self.get_arguments(
self.prev_tool_call_arr[self.current_tool_id])
cur_arguments = self.get_argments(tool_call_arr)
cur_arguments = self.get_arguments(tool_call_arr)
# not arguments generated
if not cur_arguments and not prev_arguments:
@@ -170,7 +170,7 @@ class Internlm2ToolParser(ToolParser):
# check to see if the name is defined and has been sent. if so,
# stream the name - otherwise keep waiting
# finish by setting old and returning None as base case
tool_call_arr["arguments"] = self.get_argments(tool_call_arr)
tool_call_arr["arguments"] = self.get_arguments(tool_call_arr)
self.prev_tool_call_arr = [tool_call_arr]
return delta
except Exception:


@@ -1202,7 +1202,7 @@ class LinearScalingRotaryEmbeddingWithLoRA(BaseLayerWithLoRA):
multiple LoRA adapters with a specialized kernel.
Replace LinearScalingRotaryEmbedding with MultiLinearScalingRotaryEmbedding
which can handle multi lora adapters in a specialied kernel.
which can handle multi lora adapters in a specialized kernel.
"""
def __init__(self, base_layer: RotaryEmbedding) -> None:


@@ -68,11 +68,11 @@ def convert_mapping(
LoRA indices.
sampler_indices: Tensor of shape [batch_size] mapping requests to
LoRA indices for sampler. For generation, this will be the
same as base_indicies. For prefill, this will map requests
same as base_indices. For prefill, this will map requests
to LoRA indices.
sampler_indices_padded: Tensor of shape [batch_size] mapping
requests to LoRA indices for sampler with padding.
Same as sampler_indicies, but -1 is replaced with
Same as sampler_indices, but -1 is replaced with
max_loras.
embeddings_indices: Tensor of shape [2, batch_size] mapping
requests to embedding indices. First row is for embeddings


@@ -319,7 +319,7 @@ class MambaMixer2(CustomOp):
n_groups == 1, # if there was only one group
)
intermediate_settings = (intermediate_size, 0, False)
head_setings = (self.num_heads, 0, False)
head_settings = (self.num_heads, 0, False)
# - the weight already has a "weight_loader" attribute
# which set_weight_attrs will raise if we do not
@@ -372,7 +372,7 @@ class MambaMixer2(CustomOp):
intermediate_settings,
group_shard_settings,
group_shard_settings,
head_setings, # for dt
head_settings, # for dt
],
self.tp_size,
tp_rank,


@@ -516,7 +516,7 @@ def _chunk_state_varlen_kernel(
offs_n[None, :] * stride_chunk_states_dstate)
else:
# - this seems repetitve, buts its to help the compiler
# - this seems repetitive, buts its to help the compiler
if start_idx < pid_c * chunk_size:
past_states_ptrs = chunk_states_ptr + (
offs_m[:, None] * stride_chunk_states_hdim +


@@ -219,7 +219,7 @@ def per_token_group_quant_int8(
quantized tensor along with the scaling factor used for quantization.
Args:
x: The input tenosr with ndim >= 2.
x: The input tensor with ndim >= 2.
group_size: The group size used for quantization.
eps: The minimum to avoid dividing zero.
dtype: The dype of output tensor. Note that only `torch.int8`


@@ -401,7 +401,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
self.target_modules.append(
name.replace(rep_name, sub_name))
# Add original module name even if the module has stacked map,
# in case model has a mixture of disk-merged and disk-splitted
# in case model has a mixture of disk-merged and disk-split
# weights with same last name.
self.target_modules.append(name)


@@ -131,7 +131,7 @@ class BaiChuanAttention(nn.Module):
self.num_heads = (self.total_num_heads //
tensor_model_parallel_world_size)
self.head_dim = hidden_size // self.total_num_heads
self.postion_embedding = position_embedding
self.position_embedding = position_embedding
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
@@ -151,7 +151,7 @@ class BaiChuanAttention(nn.Module):
quant_config=quant_config,
)
# Create the alibi slopes and slice them.
if self.postion_embedding == "ALIBI":
if self.position_embedding == "ALIBI":
tp_rank = get_tensor_model_parallel_rank()
head_start = tp_rank * self.num_heads
head_end = (tp_rank + 1) * self.num_heads
@@ -187,7 +187,7 @@ class BaiChuanAttention(nn.Module):
) -> torch.Tensor:
qkv, _ = self.W_pack(hidden_states)
q, k, v = qkv.chunk(chunks=3, dim=-1)
if self.postion_embedding != "ALIBI":
if self.position_embedding != "ALIBI":
q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v)
output, _ = self.o_proj(attn_output)


@@ -344,7 +344,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
self.image_newline = nn.Parameter(
torch.randn(self.projector_config.n_embed) * embed_std)
# This is a typo in original implementation
self.view_seperator = nn.Parameter(
self.view_separator = nn.Parameter(
torch.randn(self.projector_config.n_embed) * embed_std)
else:
raise ValueError(
@@ -549,13 +549,13 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
if self.global_view_pos == "head":
global_local_features = torch.cat([
global_features,
self.view_seperator[None, :],
self.view_separator[None, :],
local_features,
])
else:
global_local_features = torch.cat([
local_features,
self.view_seperator[None, :],
self.view_separator[None, :],
global_features,
])


@@ -197,7 +197,7 @@ class EAGLE(nn.Module):
return logits
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
# This implementation is incompitable with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B
# This implementation is incompatible with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B
# due to missing lm_head weights and its config being that of a
# Llama model. Here's a compatible version with the same weights:
# https://huggingface.co/abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm


@@ -634,13 +634,13 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
kwargs["has_images"] = True
# NOTE(woosuk): Here, we distinguish the sequences by the position id 0.
# This is a HACK. Fix this.
start_idices = (positions == 0).cpu().nonzero()
num_seqs = len(start_idices)
start_indices = (positions == 0).cpu().nonzero()
num_seqs = len(start_indices)
seq_lens = []
for i in range(num_seqs):
start_idx = start_idices[i].item()
start_idx = start_indices[i].item()
if i < num_seqs - 1:
end_idx = start_idices[i + 1].item()
end_idx = start_indices[i + 1].item()
else:
end_idx = len(input_ids)
seq_lens.append(end_idx - start_idx)


@@ -52,7 +52,7 @@ class Llama4MoE(nn.Module):
renormalize: bool,
) -> tuple[torch.Tensor, torch.Tensor]:
router_scores, router_indices = fast_topk(gating_output, topk, dim=-1)
# psuedo-standard is that the router scores are floats
# pseudo-standard is that the router scores are floats
router_scores = torch.sigmoid(router_scores.float())
return (router_scores, router_indices.to(torch.int32))


@@ -114,9 +114,9 @@ class MixtralMoE(nn.Module):
f"Tensor parallel size {self.tp_size} is greater than "
f"the number of experts {self.num_total_experts}.")
# Split experts equally between ranks
self.expert_indicies = np.array_split(range(
self.num_total_experts), self.tp_size)[self.rank].tolist()
if not self.expert_indicies:
self.expert_indices = np.array_split(range(self.num_total_experts),
self.tp_size)[self.rank].tolist()
if not self.expert_indices:
raise ValueError(
f"Rank {self.rank} has no experts assigned to it.")
@@ -125,7 +125,7 @@ class MixtralMoE(nn.Module):
config.hidden_size,
config.intermediate_size,
quant_config=quant_config)
if idx in self.expert_indicies else None
if idx in self.expert_indices else None
for idx in range(self.num_total_experts)
])
self.gate = ReplicatedLinear(config.hidden_size,
@@ -146,7 +146,7 @@ class MixtralMoE(nn.Module):
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
final_hidden_states = None
for expert_idx in self.expert_indicies:
for expert_idx in self.expert_indices:
expert_layer = self.experts[expert_idx]
expert_mask = (selected_experts == expert_idx)
expert_weights = (routing_weights * expert_mask).sum(dim=-1,


@@ -283,7 +283,7 @@ class OvisProcessingInfo(BaseProcessingInfo):
def get_image_size_with_most_features(self) -> ImageSize:
height, width = self.get_hf_processor().get_image_size()
hs = self.get_hf_config().visual_tokenizer_config.hidden_stride
# NOTE(Isotr0py): 9 is `max_partion` hardcoded in original code
# NOTE(Isotr0py): 9 is `max_partition` hardcoded in original code
# https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/modeling_ovis.py#L96
return ImageSize(width=width * hs * 9, height=height * hs * 9)


@@ -145,7 +145,7 @@ class Phi3SmallSelfAttention(nn.Module):
self.num_q_per_kv = self.num_heads // self.num_key_value_heads
if self.tp_size > 1:
assert self.num_key_value_heads % self.tp_size == 0
self.num_kv_heads_per_partion = max(
self.num_kv_heads_per_partition = max(
1, self.num_key_value_heads // self.tp_size)
self.num_heads_per_partition = self.num_heads // self.tp_size
@@ -212,7 +212,7 @@ class Phi3SmallSelfAttention(nn.Module):
bs_params = {
'max_seqlen': self.max_position_embeddings,
'num_heads': self.num_heads_per_partition,
"num_kv_heads": self.num_kv_heads_per_partion,
"num_kv_heads": self.num_kv_heads_per_partition,
"block_size": self.sparse_block_size,
"local_blocks": self.local_blocks,
"vert_stride": self.vert_stride,
@@ -222,7 +222,7 @@ class Phi3SmallSelfAttention(nn.Module):
self.attn = Attention(self.num_heads_per_partition,
self.head_dim,
self.scale,
num_kv_heads=self.num_kv_heads_per_partion,
num_kv_heads=self.num_kv_heads_per_partition,
cache_config=cache_config,
quant_config=quant_config,
blocksparse_params=bs_params,
@@ -243,8 +243,8 @@ class Phi3SmallSelfAttention(nn.Module):
# NOTE: this is required by RotaryEmbed, which indeed does not have to
# TODO: allow 3D QK for rotary forward
q = q.reshape(-1, self.head_dim * self.num_heads_per_partition)
k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partion)
v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partion)
k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partition)
v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partition)
q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v)


@@ -41,7 +41,7 @@ class ConformerEncoderLayer(nn.Module):
for the last pointwise conv after swish activation.
depthwise_seperable_out_channel: int
if set different to 0, the number of
depthwise_seperable_out_channel will be used as a
depthwise_seperable_out_channel will be used as a
channel_out of the second conv1d layer.
otherwise, it equal to 0, the second conv1d layer is skipped.
depthwise_multiplier: int
@@ -126,7 +126,7 @@ class ConformerEncoderLayer(nn.Module):
(Multi-Head Attention),
1 = typical Multi-Head Attention,
1 < attn_group_sizes < attention_heads = Grouped-Query Attention
attn_group_sizes = attenion_heads = Multi-Query Attention
attn_group_sizes = attention_heads = Multi-Query Attention
"""
def __init__(
@@ -318,7 +318,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
1 = typical Multi-Head Attention,
1 < attention_group_size < attention_heads = Grouped-Query
Attention
attention_group_size = attenion_heads = Multi-Query Attention
attention_group_size = attention_heads = Multi-Query Attention
"""
def __init__(
@@ -744,7 +744,7 @@ class ConformerEncoder(TransformerEncoderBase):
1 = typical Multi-Head Attention,
1 < attention_group_size < attention_heads = Grouped-Query
Attention
attention_group_size = attenion_heads = Multi-Query Attention
attention_group_size = attention_heads = Multi-Query Attention
"""
extra_multi_layer_output_idxs: list[int]


@@ -147,15 +147,15 @@ class mp(torch.autograd.Function):
grad_at_output = grad_at_output * multiplier
grad_at_scores_expaned = masked_gates * grad_at_output.mul(-1)
grad_at_scores_expaned.scatter_add_(
grad_at_scores_expanded = masked_gates * grad_at_output.mul(-1)
grad_at_scores_expanded.scatter_add_(
dim=-1,
index=selected_experts,
src=grad_at_output,
)
return (
grad_at_scores_expaned,
grad_at_scores_expanded,
None,
None,
None,


@@ -324,7 +324,7 @@ def merge_and_sort_multimodal_metadata(
Returns:
list[str]: List of item modalities in order of their positions in the
input sequence.
list[PlaceholderRange]: Sorted list of all PlaceholdeRanges from
list[PlaceholderRange]: Sorted list of all PlaceholderRanges from
mm_positions.
Optional[list[str]]: Sorted list of all hashes from mm_hashes if given,
None otherwise.


@@ -68,7 +68,7 @@ class OvisProcessor(ProcessorMixin):
"""
attributes = ["image_processor", "tokenizer"]
valid_kwargs = ["chat_template", "image_pad_token", "image_segement_len"]
valid_kwargs = ["chat_template", "image_pad_token", "image_segment_len"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer"


@@ -886,7 +886,7 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
num_decode_tokens=0,
slot_mapping=slot_mapping,
multi_modal_placeholder_index_maps=
None, # FIXME(kzawora): mutli-modality will not work here
None, # FIXME(kzawora): multi-modality will not work here
enable_kv_scales_calculation=False,
)
multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)


@@ -277,7 +277,7 @@ class StatefulModelInput(BroadcastableModelInput):
assert fmi.input_tokens.shape[0] >= self.num_seqs
fmi_new_input_tokens: torch.Tensor = fmi.input_tokens[:self.num_seqs]
# Update frozen_model_input::input_positons.
# Update frozen_model_input::input_positions.
assert fmi.input_positions is not None
assert fmi.input_positions.shape[0] >= self.num_seqs
fmi_new_input_positions: torch.Tensor = fmi.input_positions[:self.


@@ -798,9 +798,9 @@ class ModelWrapper(nn.Module):
"""
batch_size, seq_len = token_ids.shape
# Calculate the positions to sample from.
start_indicies = torch.arange(
start_indices = torch.arange(
batch_size, dtype=torch.int32, device=input_lens.device) * seq_len
logits_indices = start_indicies + input_lens - 1
logits_indices = start_indices + input_lens - 1
attn_metadata = get_forward_context().attn_metadata
# FIXME(woosuk): This is a temporary hack to avoid using the existing
@@ -822,14 +822,14 @@ class ModelWrapper(nn.Module):
num_kv_heads, num_blocks, block_size, _ = kv_caches[0][0].shape
slot_mapping = attn_metadata.slot_mapping
slot_mapping = slot_mapping.flatten()
head_indicies = torch.arange(0,
num_kv_heads,
device=slot_mapping.device,
dtype=slot_mapping.dtype)
head_indicies *= block_size * num_blocks
head_indices = torch.arange(0,
num_kv_heads,
device=slot_mapping.device,
dtype=slot_mapping.dtype)
head_indices *= block_size * num_blocks
slot_mapping = slot_mapping.repeat_interleave(num_kv_heads).view(
-1, num_kv_heads)
slot_mapping = slot_mapping + head_indicies.view(1, -1)
slot_mapping = slot_mapping + head_indices.view(1, -1)
slot_mapping = slot_mapping.flatten()
attn_metadata.slot_mapping = slot_mapping