[CI] change spell checker from codespell to typos (#18711)
Signed-off-by: Andy Xie <andy.xning@gmail.com>

.gitignore
@@ -200,5 +200,5 @@ benchmarks/**/*.json
 actionlint
 shellcheck*/
-# Ingore moe/marlin_moe gen code
+# Ignore moe/marlin_moe gen code
 csrc/moe/marlin_moe_wna16/kernel_*
@@ -20,12 +20,10 @@ repos:
     args: [--output-format, github, --fix]
   - id: ruff-format
     files: ^(.buildkite|benchmarks|examples)/.*
-- repo: https://github.com/codespell-project/codespell
-  rev: v2.4.1
+- repo: https://github.com/crate-ci/typos
+  rev: v1.32.0
   hooks:
-  - id: codespell
-    additional_dependencies: ['tomli']
-    args: ['--toml', 'pyproject.toml']
+  - id: typos
 - repo: https://github.com/PyCQA/isort
   rev: 6.0.1
   hooks:
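For a quick local check of the new hook (a sketch assuming a standard pre-commit installation and the stock typos CLI, neither of which is introduced by this change):

    pre-commit run typos --all-files   # run only the typos hook over the whole repository
    pre-commit run --all-files         # or run the full hook suite, including typos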
@@ -137,8 +137,8 @@ FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
 }
 template <typename T>
-FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data,
-const int size) {
+FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
+const int size) {
 T max = max_data[0];
 for (int i = 1; i < size; ++i) {
 max = max >= max_data[i] ? max : max_data[i];

@@ -634,7 +634,7 @@ struct paged_attention_v2_impl {
 if (partition_num == 1) continue;
-reducePartitonSoftmax(
+reducePartitionSoftmax(
 max_logits + seq_idx * num_heads * max_num_partitions +
 head_idx * max_num_partitions,
 exp_sums + seq_idx * num_heads * max_num_partitions +

@@ -83,7 +83,7 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
 explicit FP16Vec16(const void* ptr)
 : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
-// non-temproal load
+// non-temporal load
 explicit FP16Vec16(bool, void* ptr)
 : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}

@@ -120,7 +120,7 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
 explicit BF16Vec16(const void* ptr)
 : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
-// non-temproal load
+// non-temporal load
 explicit BF16Vec16(bool, void* ptr)
 : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}

@@ -327,7 +327,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 // normal load
 explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}
-// non-temproal load
+// non-temporal load
 explicit FP32Vec16(bool, void* ptr)
 : reg((__m512)_mm512_stream_load_si512(ptr)) {}

@@ -576,7 +576,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
 // normal load
 explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
-// non-temproal load
+// non-temporal load
 explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
 void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }

@@ -587,7 +587,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
 _mm512_mask_storeu_epi8(ptr, mask, reg);
 }
-// non-temproal save
+// non-temporal save
 void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
 };
 #endif
@@ -12,7 +12,7 @@ void moe_permute(
 const torch::Tensor& input, // [n_token, hidden]
 const torch::Tensor& topk_weights, //[n_token, topk]
 torch::Tensor& topk_ids, // [n_token, topk]
-const torch::Tensor& token_expert_indicies, // [n_token, topk]
+const torch::Tensor& token_expert_indices, // [n_token, topk]
 const std::optional<torch::Tensor>& expert_map, // [n_expert]
 int64_t n_expert, int64_t n_local_expert, int64_t topk,
 const std::optional<int64_t>& align_block_size,

@@ -27,15 +27,15 @@ void moe_permute(
 "expert_first_token_offset must be int64");
 TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
 "topk_ids must be int32");
-TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int,
-"token_expert_indicies must be int32");
+TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
+"token_expert_indices must be int32");
 TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
 "src_row_id2dst_row_id_map must be int32");
 TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
 "expert_first_token_offset shape != n_local_expert+1")
 TORCH_CHECK(
-src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(),
-"token_expert_indicies shape must be same as src_row_id2dst_row_id_map");
+src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
+"token_expert_indices shape must be same as src_row_id2dst_row_id_map");
 auto n_token = input.sizes()[0];
 auto n_hidden = input.sizes()[1];
 auto align_block_size_value =

@@ -71,7 +71,7 @@ void moe_permute(
 expert_map_ptr, n_expert, stream);
 }
 // expert sort topk expert id and scan expert id get expert_first_token_offset
-sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indicies),
+sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
 get_ptr<int>(permuted_experts_id),
 get_ptr<int>(dst_row_id2src_row_id_map),
 get_ptr<int64_t>(expert_first_token_offset), n_token,

@@ -190,7 +190,7 @@ void shuffle_rows(const torch::Tensor& input_tensor,
 void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
 torch::Tensor& topk_ids,
-const torch::Tensor& token_expert_indicies,
+const torch::Tensor& token_expert_indices,
 const std::optional<torch::Tensor>& expert_map,
 int64_t n_expert, int64_t n_local_expert, int64_t topk,
 const std::optional<int64_t>& align_block_size,

@@ -203,7 +203,7 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
 void moe_unpermute(const torch::Tensor& input,
 const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
-const torch::Tensor& token_expert_indicies,
+const torch::Tensor& token_expert_indices,
 const std::optional<torch::Tensor>& expert_map,
 int64_t n_expert, int64_t n_local_expert, int64_t topk,
 const std::optional<int64_t>& align_block_size,
@@ -425,7 +425,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
 #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \
 topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>( \
-gating_output, nullptr, topk_weights, topk_indicies, \
+gating_output, nullptr, topk_weights, topk_indices, \
 token_expert_indices, num_tokens, topk, 0, num_experts, \
 stream);

@@ -433,7 +433,7 @@ template <typename IndType>
 void topkGatingSoftmaxKernelLauncher(
 const float* gating_output,
 float* topk_weights,
-IndType* topk_indicies,
+IndType* topk_indices,
 int* token_expert_indices,
 float* softmax_workspace,
 const int num_tokens,

@@ -476,7 +476,7 @@ void topkGatingSoftmaxKernelLauncher(
 moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
 gating_output, nullptr, softmax_workspace, num_experts);
 moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
-softmax_workspace, nullptr, topk_weights, topk_indicies, token_expert_indices,
+softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
 num_experts, topk, 0, num_experts);
 }
 }
@@ -66,7 +66,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
 m.def(
 "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
-"Tensor token_expert_indicies, Tensor? expert_map, int n_expert,"
+"Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
 "int n_local_expert,"
 "int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
 "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
@@ -1003,7 +1003,7 @@ struct MacheteCollectiveMma {
 static constexpr int A_CPY_VEC =
 decltype(max_common_vector(tCsA, tCrA_load)){};
-static constexpr int COVERSION_WIDTH =
+static constexpr int CONVERSION_WIDTH =
 std::min(A_CPY_VEC, int(size<0>(tCrA_mma)));
 auto load_A_to_registers = [&](int read_stage) {

@@ -1026,8 +1026,8 @@ struct MacheteCollectiveMma {
 // PIPELINED MAIN LOOP
 //
-auto convert_A = [&, a_vec = Int<COVERSION_WIDTH>{}](int k_block,
-int read_stage) {
+auto convert_A = [&, a_vec = Int<CONVERSION_WIDTH>{}](int k_block,
+int read_stage) {
 load_extra_info_to_registers(partitioned_extra_info,
 copy_partitions_extra_info, k_block,
 read_stage);
@@ -320,7 +320,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 // Goal is to bring the activation matrix A to the LDS
 // and use it across the lifetime of the work group
 // TODO: When activation matrix is larger than 64 KB
-// then this is not goint to work!
+// then this is not going to work!
 //----------------------------------------------------
 __shared__ scalar_t s[max_lds_len];

@@ -581,7 +581,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 // Goal is to bring the activation matrix A to the LDS
 // and use it across the lifetime of the work group
 // TODO: When activation matrix is larger than 64 KB
-// then this is not goint to work!
+// then this is not going to work!
 //----------------------------------------------------
 __shared__ scalar_t s[max_lds_len];

@@ -601,7 +601,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 // int _WvPrGrp = mindiv(N, CuCount * YTILE, WvPrGrp);
 uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE;
-// Check whether there will be fragmenation!
+// Check whether there will be fragmentation!
 // This will happen only for the last wave!
 if (m < M && (m + YTILE) >= M) {
 uint32_t startColumn = M - YTILE;

@@ -827,7 +827,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 m += CuCount * _WvPrGrp * YTILE;
-// Check whether there will be fragmenation!
+// Check whether there will be fragmentation!
 // This will happen only for the last wave!
 if (m < M && (m + YTILE) >= M) {
 uint32_t startColumn = M - YTILE;

@@ -882,7 +882,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 // Goal is to bring the activation matrix A to the LDS
 // and use it across the lifetime of the work group
 // TODO: When activation matrix is larger than 64 KB
-// then this is not goint to work!
+// then this is not going to work!
 //----------------------------------------------------
 __shared__ scalar_t s[max_lds_len];

@@ -904,7 +904,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 //----------------------------------------------------
 uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE;
-// Check whether there will be fragmenation!
+// Check whether there will be fragmentation!
 // This will happen only for the last wave!
 if (m < M && (m + YTILE) >= M) {
 uint32_t startColumn = M - YTILE;

@@ -1176,7 +1176,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 m += CuCount * _WvPrGrp * YTILE;
 kBase = 0;
-// Check whether there will be fragmenation!
+// Check whether there will be fragmentation!
 // This will happen only for the last wave!
 if (m < M && (m + YTILE) >= M) {
 uint32_t startColumn = M - YTILE;
@@ -277,7 +277,7 @@ CompressorResult cutlass_sparse_compress_sm90(torch::Tensor const& a) {
 uint32_t const m = 1; // Set M to 1 for compression
 uint32_t const n = a.size(1);
-// Note: For correctess, the compressed format must be invariant in:
+// Note: For correctness, the compressed format must be invariant in:
 // - M, the flattened number of tokens
 // - Whether output dtype is fp16 or bf16
 // - CUTLASS epilogues
@@ -137,10 +137,6 @@ exclude = [
 'vllm/attention/ops/.*\.py$'
 ]
-[tool.codespell]
-ignore-words-list = "dout, te, indicies, subtile, ElementE"
-skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora/data/*,build/*,vllm/third_party/*"
 [tool.isort]
 skip_glob = [
 ".buildkite/*",
@@ -223,7 +223,7 @@ def test_async_tp_pass_correctness(
 "VLLM_USE_V1": "1",
 }
-aysnc_tp_args = [
+async_tp_args = [
 *common_args,
 "--tensor-parallel-size",
 str(tp_size),

@@ -242,7 +242,7 @@ def test_async_tp_pass_correctness(
 ]
 compare_two_settings(model_id,
-aysnc_tp_args,
+async_tp_args,
 tp_args,
 async_tp_env,
 tp_env,
@@ -437,8 +437,8 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
 "enable_prefix_caching": True,
 }])
 @pytest.mark.parametrize("seed", [1])
-def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
-test_llm_generator):
+def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
+test_llm_generator):
 """Verify block manager v2 with auto prefix caching could works normal
 even when eviction started.
 With APC enabled, all blocks are held by native block at the beginning.
@@ -33,8 +33,8 @@ BLOCK_SIZE = 16
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
 @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
-def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
-batch_size, seed, backend, monkeypatch):
+def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
+batch_size, seed, backend, monkeypatch):
 """
 The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
 asks for value of one of them (which is outside the sliding window).

@@ -100,7 +100,7 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
 def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
 backend, monkeypatch):
 """
-This is similar to test_sliding_window_retrival, however, it doesn't
+This is similar to test_sliding_window_retrieval, however, it doesn't
 compare against the v1 block manager since v1 doesn't support
 chunked prefill with sliding window.
@@ -594,8 +594,8 @@ def test_decode_schedule_preempted():
 # should be preempted. 1 will also be preempted.
 budget = create_token_budget()
 output = scheduler._schedule_running(budget, curr_loras)
-remainig_running = scheduler.running
-assert len(remainig_running) == 0
+remaining_running = scheduler.running
+assert len(remaining_running) == 0
 assert len(output.decode_seq_groups) == 1
 assert len(output.prefill_seq_groups) == 0
 assert output.decode_seq_groups[0].seq_group.request_id == "0"
@@ -16,7 +16,7 @@ chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
 assert chatml_jinja_path.exists()
 # Define models, templates, and their corresponding expected outputs
-MODEL_TEMPLATE_GENERATON_OUTPUT = [
+MODEL_TEMPLATE_GENERATION_OUTPUT = [
 ("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant

@@ -91,7 +91,7 @@ def test_no_load_chat_template_literallike():
 @pytest.mark.parametrize(
 "model,template,add_generation_prompt,continue_final_message,expected_output",
-MODEL_TEMPLATE_GENERATON_OUTPUT)
+MODEL_TEMPLATE_GENERATION_OUTPUT)
 def test_get_gen_prompt(model, template, add_generation_prompt,
 continue_final_message, expected_output):
 model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
@@ -72,8 +72,8 @@ def test_copy_blocks(
 # destination blocks.
 assert 2 * num_mappings <= num_blocks
 src_blocks = random.sample(range(num_blocks), num_mappings)
-remainig_blocks = list(set(range(num_blocks)) - set(src_blocks))
-dst_blocks = random.sample(remainig_blocks, 2 * num_mappings)
+remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
+dst_blocks = random.sample(remaining_blocks, 2 * num_mappings)
 block_mapping: list[tuple[int, int]] = []
 for i in range(num_mappings):
 src = src_blocks[i]
@@ -189,12 +189,12 @@ def test_reshape_and_cache(
 # Run the reference implementation.
 reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
-block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
-block_indicies_lst = block_indicies.cpu().tolist()
+block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
+block_indices_lst = block_indices.cpu().tolist()
 block_offsets = slot_mapping % block_size
 block_offsets_lst = block_offsets.cpu().tolist()
 for i in range(num_tokens):
-block_idx = block_indicies_lst[i]
+block_idx = block_indices_lst[i]
 block_offset = block_offsets_lst[i]
 cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
 cloned_value_cache[block_idx, :, :, block_offset] = value[i]

@@ -322,12 +322,12 @@ def test_reshape_and_cache_flash(
 kv_dtype=kv_cache_dtype)
 # Run the reference implementation.
-block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
-block_indicies_lst = block_indicies.cpu().tolist()
+block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
+block_indices_lst = block_indices.cpu().tolist()
 block_offsets = slot_mapping % block_size
 block_offsets_lst = block_offsets.cpu().tolist()
 for i in range(num_tokens):
-block_idx = block_indicies_lst[i]
+block_idx = block_indices_lst[i]
 block_offset = block_offsets_lst[i]
 if kv_cache_layout == "NHD":
 cloned_key_cache[block_idx, block_offset, :, :] = key[i]
@@ -46,7 +46,7 @@ CUDA_DEVICE = "cuda:0"
 MAX_DEC_SEQ_LENS = [128]
 MAX_ENC_SEQ_LENS = [128]
-# Narrow teest-cases for unsupported-scenario
+# Narrow test-cases for unsupported-scenario
 # tests
 HEAD_SIZES_FOR_UNSUPP = [HEAD_SIZES[0]]
@@ -39,10 +39,10 @@ def rotary_embedding_opcheck(rot,
 @pytest.mark.parametrize("head_size", [32, 108])
 @pytest.mark.parametrize("seq_len", [11, 1024])
 @pytest.mark.parametrize("use_key", [True, False])
-@pytest.mark.parametrize("head_stride_is_contingous", [True, False])
+@pytest.mark.parametrize("head_stride_is_contiguous", [True, False])
 def test_rotary_embedding_opcheck(dist_init, device, max_position,
 is_neox_style, rotary_dim, head_size,
-seq_len, use_key, head_stride_is_contingous):
+seq_len, use_key, head_stride_is_contiguous):
 batch_size = 1
 base = 10000
 num_heads = 7

@@ -52,7 +52,7 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position,
 positions = torch.randint(0,
 max_position, (batch_size, seq_len),
 device=device)
-head_stride = head_size + (64 if head_stride_is_contingous else 0)
+head_stride = head_size + (64 if head_stride_is_contiguous else 0)
 query = torch.randn(batch_size,
 seq_len,

@@ -72,7 +72,7 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position,
 # if we have a contiguous head stride, test the alternate
 # [..., num_heads * head_dim] shape/layout
-if head_stride_is_contingous:
+if head_stride_is_contiguous:
 rotary_embedding_opcheck(
 rot, positions, query.flatten(start_dim=-2),
 key.flatten(start_dim=-2) if use_key else None)
@@ -107,15 +107,15 @@ def generate_random_inputs(batch_size,
 return A, dt, X, B, C
-def generate_continous_batched_examples(example_lens_by_batch,
-num_examples,
-full_length,
-last_taken,
-exhausted,
-n_heads,
-d_head,
-itype,
-device='cuda'):
+def generate_continuous_batched_examples(example_lens_by_batch,
+num_examples,
+full_length,
+last_taken,
+exhausted,
+n_heads,
+d_head,
+itype,
+device='cuda'):
 # this function generates a random examples of certain length
 # and then cut according to "example_lens_by_batch" and feed

@@ -269,11 +269,10 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
 exhausted: dict = {} # map: eg -> boolean indicating example is exhausted
 states = None
-for Y_min, cu_seqlens, seq_idx, (A, dt, X, B,
-C) in generate_continous_batched_examples(
-cases, num_examples, seqlen,
-last_taken, exhausted, n_heads,
-d_head, itype):
+for Y_min, cu_seqlens, seq_idx, (
+A, dt, X, B, C) in generate_continuous_batched_examples(
+cases, num_examples, seqlen, last_taken, exhausted, n_heads,
+d_head, itype):
 chunk_indices, chunk_offsets = \
 _query_start_loc_to_chunk_indices_offsets(
@@ -118,7 +118,7 @@ def run_test(
 # default to enforce_eager=True if enforce_eager
 # is left unspecified. However, the
 # VllmRunner test fixture (which wraps around the LLM class) defaults to
-# enforce_eager=False (a behavior which a number of already-exisitng
+# enforce_eager=False (a behavior which a number of already-existing
 # decoder-only unit tests expect), so when testing an encoder/decoder
 # model we must explicitly specify enforce_eager=True in the VllmRunner
 # constructor.
@@ -248,7 +248,7 @@ def test_temperature_zero_target_distribution(seed: int, device: str):
 size=(batch_size, 1),
 dtype=torch.int64)
 # The target probaility distribution is a temperature zero distribution
-# with zero entroy. Since our draft token ids don't match the probability
+# with zero entropy. Since our draft token ids don't match the probability
 # 1.0 tokens in the target distribution we will reject all of them and
 # fallback to the greedy sampling for selecting 1 token for each sequence.
 # Verify the same.
@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
 * Test greedy equality under various number of speculative tokens.
 With those tests, we can say at least, EAGLE would not break the
-correctess for the target model outputs.
+correctness for the target model outputs.
 """
 import pytest

@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
 * Test greedy equality under various number of speculative tokens.
 With those tests, we can say at least, Medusa would not break the
-correctess for the target model outputs.
+correctness for the target model outputs.
 """
 import pytest

@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
 * Test greedy equality under various number of speculative tokens.
 With those tests, we can say at least, mtp would not break the
-correctess for the target model outputs.
+correctness for the target model outputs.
 """
 import pytest

@@ -22,8 +22,8 @@ However, we still need to verify below scenario could be passed:
 * Test greedy equality under preemption
 * Test greedy equality under various ngram sizes / speculative sizes
-With those tests, we can say at least, ngram spec would not break the correctess
-for the target model outputs.
+With those tests, we can say at least, ngram spec would not break the
+correctness for the target model outputs.
 """
 import pytest
@@ -30,7 +30,7 @@ model_config = {
 ])
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
-def test_sliding_window_retrival(monkeypatch, model, batch_size, seed):
+def test_sliding_window_retrieval(monkeypatch, model, batch_size, seed):
 """
 The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
 asks for value of one of them (which is outside the sliding window).
@@ -7,7 +7,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
 from .utils import create_request, create_scheduler, create_vllm_config
-def test_basic_inferface():
+def test_basic_interface():
 """Unit test for basic NixlConnector interface functionality."""
 vllm_config = create_vllm_config()

@@ -25,7 +25,7 @@ def test_basic_inferface():
 scheduler.add_request(request)
-# Remote Prefill, triggers NixlConnectorMetdata.
+# Remote Prefill, triggers NixlConnectorMetadata.
 scheduler_output = scheduler.schedule()
 kv_connector_metadata = scheduler_output.kv_connector_metadata
 assert kv_connector_metadata is not None
@@ -32,7 +32,7 @@ def test_prompt_logprobs_e2e():
 ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
-def test_promt_logprobs_e2e_server():
+def test_prompt_logprobs_e2e_server():
 with RemoteOpenAIServer(MODEL, SERVER_ARGS) as remote_server:
 url = f"{remote_server.url_for('v1')}/completions"
@@ -209,32 +209,32 @@ def test_multi_step_model_runner_input():
 received_model_input = (StatefulModelInput.from_broadcasted_tensor_dict(
 tensor_dict, attn_backend=attn_backend))
-receieved_frozen_input = received_model_input.frozen_model_input
+received_frozen_input = received_model_input.frozen_model_input
 # Check that received copy has correct values.
 assert isinstance(received_model_input, StatefulModelInput)
-assert receieved_frozen_input.input_tokens is not None
-assert (receieved_frozen_input.input_tokens ==
+assert received_frozen_input.input_tokens is not None
+assert (received_frozen_input.input_tokens ==
 frozen_model_input.input_tokens).all()
-assert receieved_frozen_input.input_positions is not None
-assert (receieved_frozen_input.input_positions ==
+assert received_frozen_input.input_positions is not None
+assert (received_frozen_input.input_positions ==
 frozen_model_input.input_positions).all()
-assert receieved_frozen_input.multi_modal_kwargs is None
+assert received_frozen_input.multi_modal_kwargs is None
 assert (frozen_model_input.multi_modal_kwargs ==
 frozen_model_input.multi_modal_kwargs)
-assert receieved_frozen_input.lora_requests is None
-assert (receieved_frozen_input.lora_requests ==
+assert received_frozen_input.lora_requests is None
+assert (received_frozen_input.lora_requests ==
 frozen_model_input.lora_requests)
-assert receieved_frozen_input.lora_mapping is None
+assert received_frozen_input.lora_mapping is None
 assert (
-receieved_frozen_input.lora_mapping == frozen_model_input.lora_mapping)
+received_frozen_input.lora_mapping == frozen_model_input.lora_mapping)
 for field in dataclasses.fields(AttentionMetadata):
-assert getattr(receieved_frozen_input.attn_metadata, field.name,
+assert getattr(received_frozen_input.attn_metadata, field.name,
 None) == getattr(attn_metadata, field.name, None)
 # For sampling metadata, only selected_token_indices is copied.
-assert (receieved_frozen_input.sampling_metadata.selected_token_indices ==
+assert (received_frozen_input.sampling_metadata.selected_token_indices ==
 sampling_metadata.selected_token_indices)
-assert receieved_frozen_input.sampling_metadata.seq_groups is None
+assert received_frozen_input.sampling_metadata.seq_groups is None
 # check non frozen fields
 assert received_model_input.is_last_step == model_input.is_last_step
@@ -116,7 +116,7 @@ def ReadTargets(log, show_all):
 # If ninja.exe is rudely halted then the .ninja_log file may be
 # corrupt. Silently continue.
 continue
-start, end, _, name, cmdhash = parts # Ignore restat.
+start, end, _, name, cmdhash = parts # Ignore restart.
 # Convert from integral milliseconds to float seconds.
 start = int(start) / 1000.0
 end = int(end) / 1000.0
typos.toml (new file, 179 lines)

@@ -0,0 +1,179 @@
[files]
# these files may be written in non english words
extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
    "benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
    "vllm/third_party/*"]
ignore-hidden = true
ignore-files = true
ignore-dot = true
ignore-vcs = true
ignore-global = true
ignore-parent = true

[default]
binary = false
check-filename = false
check-file = true
unicode = true
ignore-hex = true
identifier-leading-digits = false
locale = "en"
extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
    ".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*", ".*fo.*", ".*ba.*",
    ".*ot.*", ".*[Tt]h[rR].*"]
extend-ignore-words-re = []
extend-ignore-re = []

[default.extend-identifiers]
bbc5b7ede = "bbc5b7ede"
womens_doubles = "womens_doubles"
v_2nd = "v_2nd"
splitted_input = "splitted_input"
NOOPs = "NOOPs"
typ = "typ"
nin_shortcut = "nin_shortcut"
UperNetDecoder = "UperNetDecoder"
subtile = "subtile"
cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
SFOuput = "SFOuput"
# huggingface transformers repo uses these words
depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
depthwise_seperable_CNN = "depthwise_seperable_CNN"

[default.extend-words]
iy = "iy"
tendencias = "tendencias"
# intel cpu features
tme = "tme"
dout = "dout"
Pn = "Pn"
arange = "arange"

[type.py]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.py.extend-identifiers]
arange = "arange"
NDArray = "NDArray"
EOFError = "EOFError"

[type.py.extend-words]

[type.cpp]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.cpp.extend-identifiers]
countr_one = "countr_one"

[type.cpp.extend-words]

[type.rust]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.rust.extend-identifiers]
flate2 = "flate2"

[type.rust.extend-words]
ser = "ser"

[type.lock]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.lock.extend-identifiers]

[type.lock.extend-words]

[type.jl]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.jl.extend-identifiers]

[type.jl.extend-words]
modul = "modul"
egals = "egals"
usig = "usig"
egal = "egal"

[type.go]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.go.extend-identifiers]
flate = "flate"

[type.go.extend-words]

[type.css]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.css.extend-identifiers]
nd = "nd"

[type.css.extend-words]

[type.man]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.man.extend-identifiers]
Nd = "Nd"

[type.man.extend-words]

[type.cert]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.cert.extend-identifiers]

[type.cert.extend-words]

[type.sh]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.sh.extend-identifiers]
stap = "stap"
ot = "ot"

[type.sh.extend-words]

[type.vimscript]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.vimscript.extend-identifiers]
windo = "windo"

[type.vimscript.extend-words]
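The same checks can be run outside pre-commit by invoking the typos CLI from the repository root, where it picks up typos.toml automatically (a usage sketch based on the stock typos tool, not something added by this commit):

    typos                    # report spelling issues using the configuration above
    typos --write-changes    # optionally apply the suggested fixes in place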
@@ -1550,10 +1550,10 @@ def moe_wna16_gemm(input: torch.Tensor, output: torch.Tensor,
 def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
-token_expert_indicies: torch.Tensor,
+token_expert_indices: torch.Tensor,
 gating_output: torch.Tensor) -> None:
-torch.ops._moe_C.topk_softmax(topk_weights, topk_ids,
-token_expert_indicies, gating_output)
+torch.ops._moe_C.topk_softmax(topk_weights, topk_ids, token_expert_indices,
+gating_output)
 def moe_wna16_marlin_gemm(input: torch.Tensor, output: Optional[torch.Tensor],
@@ -373,7 +373,7 @@ class CommonAttentionState(AttentionState):
 f"Expected attn_backend name to be either 'XFORMERS'," \
 f"'ROCM_FLASH', or 'FLASH_ATTN', but " \
 f"got '{self.runner.attn_backend.get_name()}'"
-self._add_additonal_input_buffers_for_enc_dec_model(
+self._add_additional_input_buffers_for_enc_dec_model(
 attn_metadata=attn_metadata, input_buffers=input_buffers)
 return input_buffers

@@ -427,7 +427,7 @@ class CommonAttentionState(AttentionState):
 attn_metadata.max_encoder_seq_len = self.runner.max_seq_len_to_capture
 attn_metadata.num_encoder_tokens = 0
-def _add_additonal_input_buffers_for_enc_dec_model(
+def _add_additional_input_buffers_for_enc_dec_model(
 self, attn_metadata, input_buffers: Dict[str, Any]):
 """
 Saves additional input buffers specific to the encoder-decoder model
@@ -40,7 +40,7 @@ class Internlm2ToolParser(ToolParser):
 request.skip_special_tokens = False
 return request
-def get_argments(self, obj):
+def get_arguments(self, obj):
 if "parameters" in obj:
 return obj.get("parameters")
 elif "arguments" in obj:

@@ -119,9 +119,9 @@ class Internlm2ToolParser(ToolParser):
 # now we know we're on the same tool call and we're streaming
 # arguments
 else:
-prev_arguments = self.get_argments(
+prev_arguments = self.get_arguments(
 self.prev_tool_call_arr[self.current_tool_id])
-cur_arguments = self.get_argments(tool_call_arr)
+cur_arguments = self.get_arguments(tool_call_arr)
 # not arguments generated
 if not cur_arguments and not prev_arguments:

@@ -170,7 +170,7 @@ class Internlm2ToolParser(ToolParser):
 # check to see if the name is defined and has been sent. if so,
 # stream the name - otherwise keep waiting
 # finish by setting old and returning None as base case
-tool_call_arr["arguments"] = self.get_argments(tool_call_arr)
+tool_call_arr["arguments"] = self.get_arguments(tool_call_arr)
 self.prev_tool_call_arr = [tool_call_arr]
 return delta
 except Exception:
@@ -1202,7 +1202,7 @@ class LinearScalingRotaryEmbeddingWithLoRA(BaseLayerWithLoRA):
 multiple LoRA adapters with a specialized kernel.
 Replace LinearScalingRotaryEmbedding with MultiLinearScalingRotaryEmbedding
-which can handle multi lora adapters in a specialied kernel.
+which can handle multi lora adapters in a specialized kernel.
 """
 def __init__(self, base_layer: RotaryEmbedding) -> None:
@@ -68,11 +68,11 @@ def convert_mapping(
 LoRA indices.
 sampler_indices: Tensor of shape [batch_size] mapping requests to
 LoRA indices for sampler. For generation, this will be the
-same as base_indicies. For prefill, this will map requests
+same as base_indices. For prefill, this will map requests
 to LoRA indices.
 sampler_indices_padded: Tensor of shape [batch_size] mapping
 requests to LoRA indices for sampler with padding.
-Same as sampler_indicies, but -1 is replaced with
+Same as sampler_indices, but -1 is replaced with
 max_loras.
 embeddings_indices: Tensor of shape [2, batch_size] mapping
 requests to embedding indices. First row is for embeddings
@@ -319,7 +319,7 @@ class MambaMixer2(CustomOp):
 n_groups == 1, # if there was only one group
 )
 intermediate_settings = (intermediate_size, 0, False)
-head_setings = (self.num_heads, 0, False)
+head_settings = (self.num_heads, 0, False)
 # - the weight already has a "weight_loader" attribute
 # which set_weight_attrs will raise if we do not

@@ -372,7 +372,7 @@ class MambaMixer2(CustomOp):
 intermediate_settings,
 group_shard_settings,
 group_shard_settings,
-head_setings, # for dt
+head_settings, # for dt
 ],
 self.tp_size,
 tp_rank,
@@ -516,7 +516,7 @@ def _chunk_state_varlen_kernel(
 offs_n[None, :] * stride_chunk_states_dstate)
 else:
-# - this seems repetitve, buts its to help the compiler
+# - this seems repetitive, buts its to help the compiler
 if start_idx < pid_c * chunk_size:
 past_states_ptrs = chunk_states_ptr + (
 offs_m[:, None] * stride_chunk_states_hdim +
@@ -219,7 +219,7 @@ def per_token_group_quant_int8(
 quantized tensor along with the scaling factor used for quantization.
 Args:
-x: The input tenosr with ndim >= 2.
+x: The input tensor with ndim >= 2.
 group_size: The group size used for quantization.
 eps: The minimum to avoid dividing zero.
 dtype: The dype of output tensor. Note that only `torch.int8`
@@ -401,7 +401,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
 self.target_modules.append(
 name.replace(rep_name, sub_name))
 # Add original module name even if the module has stacked map,
-# in case model has a mixture of disk-merged and disk-splitted
+# in case model has a mixture of disk-merged and disk-split
 # weights with same last name.
 self.target_modules.append(name)
@@ -131,7 +131,7 @@ class BaiChuanAttention(nn.Module):
 self.num_heads = (self.total_num_heads //
 tensor_model_parallel_world_size)
 self.head_dim = hidden_size // self.total_num_heads
-self.postion_embedding = position_embedding
+self.position_embedding = position_embedding
 self.rope_theta = rope_theta
 self.max_position_embeddings = max_position_embeddings

@@ -151,7 +151,7 @@ class BaiChuanAttention(nn.Module):
 quant_config=quant_config,
 )
 # Create the alibi slopes and slice them.
-if self.postion_embedding == "ALIBI":
+if self.position_embedding == "ALIBI":
 tp_rank = get_tensor_model_parallel_rank()
 head_start = tp_rank * self.num_heads
 head_end = (tp_rank + 1) * self.num_heads

@@ -187,7 +187,7 @@ class BaiChuanAttention(nn.Module):
 ) -> torch.Tensor:
 qkv, _ = self.W_pack(hidden_states)
 q, k, v = qkv.chunk(chunks=3, dim=-1)
-if self.postion_embedding != "ALIBI":
+if self.position_embedding != "ALIBI":
 q, k = self.rotary_emb(positions, q, k)
 attn_output = self.attn(q, k, v)
 output, _ = self.o_proj(attn_output)
@@ -344,7 +344,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 self.image_newline = nn.Parameter(
 torch.randn(self.projector_config.n_embed) * embed_std)
 # This is a typo in original implementation
-self.view_seperator = nn.Parameter(
+self.view_separator = nn.Parameter(
 torch.randn(self.projector_config.n_embed) * embed_std)
 else:
 raise ValueError(

@@ -549,13 +549,13 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 if self.global_view_pos == "head":
 global_local_features = torch.cat([
 global_features,
-self.view_seperator[None, :],
+self.view_separator[None, :],
 local_features,
 ])
 else:
 global_local_features = torch.cat([
 local_features,
-self.view_seperator[None, :],
+self.view_separator[None, :],
 global_features,
 ])
@@ -197,7 +197,7 @@ class EAGLE(nn.Module):
 return logits
 def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-# This implementation is incompitable with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B
+# This implementation is incompatible with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B
 # due to missing lm_head weights and its config being that of a
 # Llama model. Here's a compatible version with the same weights:
 # https://huggingface.co/abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm
@@ -634,13 +634,13 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
 kwargs["has_images"] = True
 # NOTE(woosuk): Here, we distinguish the sequences by the position id 0.
 # This is a HACK. Fix this.
-start_idices = (positions == 0).cpu().nonzero()
-num_seqs = len(start_idices)
+start_indices = (positions == 0).cpu().nonzero()
+num_seqs = len(start_indices)
 seq_lens = []
 for i in range(num_seqs):
-start_idx = start_idices[i].item()
+start_idx = start_indices[i].item()
 if i < num_seqs - 1:
-end_idx = start_idices[i + 1].item()
+end_idx = start_indices[i + 1].item()
 else:
 end_idx = len(input_ids)
 seq_lens.append(end_idx - start_idx)
@@ -52,7 +52,7 @@ class Llama4MoE(nn.Module):
 renormalize: bool,
 ) -> tuple[torch.Tensor, torch.Tensor]:
 router_scores, router_indices = fast_topk(gating_output, topk, dim=-1)
-# psuedo-standard is that the router scores are floats
+# pseudo-standard is that the router scores are floats
 router_scores = torch.sigmoid(router_scores.float())
 return (router_scores, router_indices.to(torch.int32))
@@ -114,9 +114,9 @@ class MixtralMoE(nn.Module):
 f"Tensor parallel size {self.tp_size} is greater than "
 f"the number of experts {self.num_total_experts}.")
 # Split experts equally between ranks
-self.expert_indicies = np.array_split(range(
-self.num_total_experts), self.tp_size)[self.rank].tolist()
-if not self.expert_indicies:
+self.expert_indices = np.array_split(range(self.num_total_experts),
+self.tp_size)[self.rank].tolist()
+if not self.expert_indices:
 raise ValueError(
 f"Rank {self.rank} has no experts assigned to it.")

@@ -125,7 +125,7 @@ class MixtralMoE(nn.Module):
 config.hidden_size,
 config.intermediate_size,
 quant_config=quant_config)
-if idx in self.expert_indicies else None
+if idx in self.expert_indices else None
 for idx in range(self.num_total_experts)
 ])
 self.gate = ReplicatedLinear(config.hidden_size,

@@ -146,7 +146,7 @@ class MixtralMoE(nn.Module):
 routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
 final_hidden_states = None
-for expert_idx in self.expert_indicies:
+for expert_idx in self.expert_indices:
 expert_layer = self.experts[expert_idx]
 expert_mask = (selected_experts == expert_idx)
 expert_weights = (routing_weights * expert_mask).sum(dim=-1,
@@ -283,7 +283,7 @@ class OvisProcessingInfo(BaseProcessingInfo):
 def get_image_size_with_most_features(self) -> ImageSize:
 height, width = self.get_hf_processor().get_image_size()
 hs = self.get_hf_config().visual_tokenizer_config.hidden_stride
-# NOTE(Isotr0py): 9 is `max_partion` hardcoded in original code
+# NOTE(Isotr0py): 9 is `max_partition` hardcoded in original code
 # https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/modeling_ovis.py#L96
 return ImageSize(width=width * hs * 9, height=height * hs * 9)
@@ -145,7 +145,7 @@ class Phi3SmallSelfAttention(nn.Module):
 self.num_q_per_kv = self.num_heads // self.num_key_value_heads
 if self.tp_size > 1:
 assert self.num_key_value_heads % self.tp_size == 0
-self.num_kv_heads_per_partion = max(
+self.num_kv_heads_per_partition = max(
 1, self.num_key_value_heads // self.tp_size)
 self.num_heads_per_partition = self.num_heads // self.tp_size

@@ -212,7 +212,7 @@ class Phi3SmallSelfAttention(nn.Module):
 bs_params = {
 'max_seqlen': self.max_position_embeddings,
 'num_heads': self.num_heads_per_partition,
-"num_kv_heads": self.num_kv_heads_per_partion,
+"num_kv_heads": self.num_kv_heads_per_partition,
 "block_size": self.sparse_block_size,
 "local_blocks": self.local_blocks,
 "vert_stride": self.vert_stride,

@@ -222,7 +222,7 @@ class Phi3SmallSelfAttention(nn.Module):
 self.attn = Attention(self.num_heads_per_partition,
 self.head_dim,
 self.scale,
-num_kv_heads=self.num_kv_heads_per_partion,
+num_kv_heads=self.num_kv_heads_per_partition,
 cache_config=cache_config,
 quant_config=quant_config,
 blocksparse_params=bs_params,

@@ -243,8 +243,8 @@ class Phi3SmallSelfAttention(nn.Module):
 # NOTE: this is required by RotaryEmbed, which indeed does not have to
 # TODO: allow 3D QK for rotary forward
 q = q.reshape(-1, self.head_dim * self.num_heads_per_partition)
-k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partion)
-v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partion)
+k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partition)
+v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partition)
 q, k = self.rotary_emb(positions, q, k)
 attn_output = self.attn(q, k, v)
@@ -41,7 +41,7 @@ class ConformerEncoderLayer(nn.Module):
 for the last pointwise conv after swish activation.
 depthwise_seperable_out_channel: int
 if set different to 0, the number of
-depthwise_seperable_out_channel will be used as a
+depthwise_seperable_out_channel will be used as a
 channel_out of the second conv1d layer.
 otherwise, it equal to 0, the second conv1d layer is skipped.
 depthwise_multiplier: int

@@ -126,7 +126,7 @@ class ConformerEncoderLayer(nn.Module):
 (Multi-Head Attention),
 1 = typical Multi-Head Attention,
 1 < attn_group_sizes < attention_heads = Grouped-Query Attention
-attn_group_sizes = attenion_heads = Multi-Query Attention
+attn_group_sizes = attention_heads = Multi-Query Attention
 """
 def __init__(

@@ -318,7 +318,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
 1 = typical Multi-Head Attention,
 1 < attention_group_size < attention_heads = Grouped-Query
 Attention
-attention_group_size = attenion_heads = Multi-Query Attention
+attention_group_size = attention_heads = Multi-Query Attention
 """
 def __init__(

@@ -744,7 +744,7 @@ class ConformerEncoder(TransformerEncoderBase):
 1 = typical Multi-Head Attention,
 1 < attention_group_size < attention_heads = Grouped-Query
 Attention
-attention_group_size = attenion_heads = Multi-Query Attention
+attention_group_size = attention_heads = Multi-Query Attention
 """
 extra_multi_layer_output_idxs: list[int]
@@ -147,15 +147,15 @@ class mp(torch.autograd.Function):
 grad_at_output = grad_at_output * multiplier
-grad_at_scores_expaned = masked_gates * grad_at_output.mul(-1)
-grad_at_scores_expaned.scatter_add_(
+grad_at_scores_expanded = masked_gates * grad_at_output.mul(-1)
+grad_at_scores_expanded.scatter_add_(
 dim=-1,
 index=selected_experts,
 src=grad_at_output,
 )
 return (
-grad_at_scores_expaned,
+grad_at_scores_expanded,
 None,
 None,
 None,
@@ -324,7 +324,7 @@ def merge_and_sort_multimodal_metadata(
 Returns:
 list[str]: List of item modalities in order of their positions in the
 input sequence.
-list[PlaceholderRange]: Sorted list of all PlaceholdeRanges from
+list[PlaceholderRange]: Sorted list of all PlaceholderRanges from
 mm_positions.
 Optional[list[str]]: Sorted list of all hashes from mm_hashes if given,
 None otherwise.
@@ -68,7 +68,7 @@ class OvisProcessor(ProcessorMixin):
 """
 attributes = ["image_processor", "tokenizer"]
-valid_kwargs = ["chat_template", "image_pad_token", "image_segement_len"]
+valid_kwargs = ["chat_template", "image_pad_token", "image_segment_len"]
 image_processor_class = "AutoImageProcessor"
 tokenizer_class = "AutoTokenizer"
@@ -886,7 +886,7 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
 num_decode_tokens=0,
 slot_mapping=slot_mapping,
 multi_modal_placeholder_index_maps=
-None, # FIXME(kzawora): mutli-modality will not work here
+None, # FIXME(kzawora): multi-modality will not work here
 enable_kv_scales_calculation=False,
 )
 multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
@@ -277,7 +277,7 @@ class StatefulModelInput(BroadcastableModelInput):
 assert fmi.input_tokens.shape[0] >= self.num_seqs
 fmi_new_input_tokens: torch.Tensor = fmi.input_tokens[:self.num_seqs]
-# Update frozen_model_input::input_positons.
+# Update frozen_model_input::input_positions.
 assert fmi.input_positions is not None
 assert fmi.input_positions.shape[0] >= self.num_seqs
 fmi_new_input_positions: torch.Tensor = fmi.input_positions[:self.
@@ -798,9 +798,9 @@ class ModelWrapper(nn.Module):
 """
 batch_size, seq_len = token_ids.shape
 # Calculate the positions to sample from.
-start_indicies = torch.arange(
+start_indices = torch.arange(
 batch_size, dtype=torch.int32, device=input_lens.device) * seq_len
-logits_indices = start_indicies + input_lens - 1
+logits_indices = start_indices + input_lens - 1
 attn_metadata = get_forward_context().attn_metadata
 # FIXME(woosuk): This is a temporary hack to avoid using the existing

@@ -822,14 +822,14 @@ class ModelWrapper(nn.Module):
 num_kv_heads, num_blocks, block_size, _ = kv_caches[0][0].shape
 slot_mapping = attn_metadata.slot_mapping
 slot_mapping = slot_mapping.flatten()
-head_indicies = torch.arange(0,
-num_kv_heads,
-device=slot_mapping.device,
-dtype=slot_mapping.dtype)
-head_indicies *= block_size * num_blocks
+head_indices = torch.arange(0,
+num_kv_heads,
+device=slot_mapping.device,
+dtype=slot_mapping.dtype)
+head_indices *= block_size * num_blocks
 slot_mapping = slot_mapping.repeat_interleave(num_kv_heads).view(
 -1, num_kv_heads)
-slot_mapping = slot_mapping + head_indicies.view(1, -1)
+slot_mapping = slot_mapping + head_indices.view(1, -1)
 slot_mapping = slot_mapping.flatten()
 attn_metadata.slot_mapping = slot_mapping