Fix per-file ruff ignores related to line length (#26262)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Harry Mellor
2025-10-06 06:12:40 +01:00
committed by GitHub
parent 91ac7f764d
commit 6c04638214
65 changed files with 301 additions and 291 deletions
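The recurring pattern in this diff: wherever a line exceeded the length limit only because of a long string literal, a trailing comment, or a verbose expression, the line is reflowed (implicit string concatenation, the comment moved above the argument, or an intermediate variable) and the trailing `# noqa: E501` is dropped. Suppressions are kept only for lines that cannot reasonably be split, such as Cutlass kernel names, fully qualified backend paths, and base64 data URLs, and in those cases the `# noqa: E501` now sits on the individual offending line instead of a blanket per-file ignore in `pyproject.toml`. Below is a minimal, hypothetical sketch of the reflow pattern; it reuses the `--tokenizer` argument from one of the benchmark hunks further down, but the exact split shown here is illustrative rather than taken from the commit.

# Hypothetical sketch (not part of the commit): instead of suppressing E501
# with a trailing "# noqa: E501", the long string literal is split with
# implicit concatenation so each physical line fits within the limit.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--tokenizer",
    type=str,
    # One logical help string, split across two lines to stay under the limit.
    help="Name or path of the tokenizer, "
    "if not using the default tokenizer.",
)
args = parser.parse_args([])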

View File

@ -164,7 +164,7 @@ def invoke_main() -> None:
)
parser.add_argument(
"--batched", action="store_true", help="consider time to prepare batch"
) # noqa: E501
)
parser.add_argument(
"--num-iteration",
type=int,

View File

@ -909,13 +909,13 @@ def create_argument_parser():
parser.add_argument(
"--tokenizer",
type=str,
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
help="Name or path of the tokenizer, if not using the default tokenizer.",
)
parser.add_argument(
"--tokenizer-mode",
type=str,
default="auto",
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
help="Name or path of the tokenizer, if not using the default tokenizer.",
)
parser.add_argument(
"--num-prompts",

View File

@ -72,8 +72,8 @@ VLLMKernelScheduleTag: dict[
] = {
**KernelScheduleTag, # type: ignore
**{
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",
MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong",
MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative",
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501
MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong", # noqa: E501
MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative", # noqa: E501
},
}

View File

@ -113,7 +113,7 @@ def run_e5_v(query: Query) -> ModelRequestData:
def _get_vlm2vec_prompt_image(query: Query, image_token: str):
if query["modality"] == "text":
text = query["text"]
prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501
prompt = f"Find me an everyday image that matches the given caption: {text}"
image = None
elif query["modality"] == "image":
prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501

View File

@ -203,9 +203,9 @@ class Proxy:
async with session.post(
url=url, json=data, headers=headers
) as response:
if 200 <= response.status < 300 or 400 <= response.status < 500: # noqa: E501
if 200 <= response.status < 300 or 400 <= response.status < 500:
if use_chunked:
async for chunk_bytes in response.content.iter_chunked( # noqa: E501
async for chunk_bytes in response.content.iter_chunked(
1024
):
yield chunk_bytes

View File

@ -56,52 +56,6 @@ include = ["vllm*"]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
# TEMPORARY! These ignores will be fixed forward
## Line length violations
"csrc/cutlass_extensions/vllm_cutlass_library_extension.py" = ["E501"]
"tests/compile/piecewise/test_simple.py" = ["E501"]
"tests/compile/piecewise/test_toy_llama.py" = ["E501", "B023"]
"tests/entrypoints/conftest.py" = ["E501"]
"tests/entrypoints/openai/test_audio.py" = ["E501"]
"tests/entrypoints/openai/test_chat.py" = ["E501"]
"tests/entrypoints/openai/test_chat_template.py" = ["E501"]
"tests/entrypoints/openai/test_chat_with_tool_reasoning.py" = ["E501"]
"tests/entrypoints/openai/test_completion_with_function_calling.py" = ["E501"]
"tests/entrypoints/openai/test_video.py" = ["E501"]
"tests/entrypoints/openai/test_vision.py" = ["E501"]
"tests/entrypoints/test_chat_utils.py" = ["E501"]
"tests/kernels/moe/modular_kernel_tools/common.py" = ["E501"]
"tests/models/language/generation/test_gemma.py" = ["E501"]
"tests/models/language/generation/test_mistral.py" = ["E501"]
"tests/models/multimodal/generation/test_ultravox.py" = ["E501"]
"tests/models/multimodal/generation/test_voxtral.py" = ["E501"]
"tests/models/multimodal/generation/vlm_utils/custom_inputs.py" = ["E501"]
"tests/tool_use/test_tool_choice_required.py" = ["E501"]
"tests/v1/attention/utils.py" = ["E501"]
"tests/v1/entrypoints/openai/responses/test_image.py" = ["E501"]
"tests/v1/kv_connector/nixl_integration/test_accuracy.py" = ["E501"]
"tests/v1/kv_connector/unit/test_offloading_connector.py" = ["E501"]
"tests/v1/logits_processors/test_custom_offline.py" = ["E501"]
"vllm/attention/ops/pallas_kv_cache_update.py" = ["E501"]
"vllm/compilation/collective_fusion.py" = ["E501"]
"vllm/compilation/wrapper.py" = ["E501"]
"vllm/config/vllm.py" = ["E501"]
"vllm/distributed/device_communicators/all2all.py" = ["E501"]
"vllm/entrypoints/openai/protocol.py" = ["E501"]
"vllm/lora/layers/vocal_parallel_embedding.py" = ["E501"]
"vllm/model_executor/model_loader/bitsandbytes_loader.py" = ["E501"]
"vllm/model_executor/models/bailing_moe.py" = ["E501"]
"vllm/model_executor/models/hyperclovax_vision.py" = ["E501"]
"vllm/model_executor/models/llama4_eagle.py" = ["E501"]
"vllm/model_executor/models/longcat_flash_mtp.py" = ["E501"]
"vllm/model_executor/models/phi4mm.py" = ["E501"]
"vllm/model_executor/models/qwen3_next.py" = ["E501"]
"vllm/model_executor/layers/quantization/ptpc_fp8.py" = ["E501"]
"vllm/v1/attention/backends/mla/common.py" = ["E501"]
"vllm/v1/engine/utils.py" = ["E501"]
"vllm/v1/utils.py" = ["E501"]
"vllm/v1/worker/gpu_model_runner.py" = ["E501"]
# End of temporary ignores
[tool.ruff.lint]
select = [

View File

@ -132,10 +132,14 @@ def test_simple_piecewise_compile(use_inductor):
splitting_ops=["silly.attention"],
use_inductor_graph_partition=False,
use_inductor=use_inductor,
expected_num_piecewise_graphs_seen=5, # 2 * num_layers + 1
expected_num_piecewise_capturable_graphs_seen=3, # 1 + num_layers
expected_num_backend_compilations=3, # num_piecewise_capturable_graphs_seen
expected_num_cudagraph_captured=6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
# 2 * num_layers + 1
expected_num_piecewise_graphs_seen=5,
# 1 + num_layers
expected_num_piecewise_capturable_graphs_seen=3,
# num_piecewise_capturable_graphs_seen
expected_num_backend_compilations=3,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
expected_num_cudagraph_captured=6,
)
@ -147,14 +151,16 @@ def test_simple_inductor_graph_partition(splitting_ops):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
_run_simple_model(
# inductor graph partition automatically resets splitting_ops
# to be an empty list
# Inductor graph partition automatically resets splitting_ops to an empty list
splitting_ops=splitting_ops,
use_inductor_graph_partition=True,
use_inductor=True,
expected_num_piecewise_graphs_seen=1, # since not splitting at fx graph level
expected_num_piecewise_capturable_graphs_seen=1, # since not splitting at fx graph level
expected_num_backend_compilations=1, # since not splitting at fx graph level
expected_num_cudagraph_captured=6, # inductor graph partition still captures 6
# graph, same as fx graph partition.
# Since not splitting at fx graph level
expected_num_piecewise_graphs_seen=1,
# Since not splitting at fx graph level
expected_num_piecewise_capturable_graphs_seen=1,
# Since not splitting at fx graph level
expected_num_backend_compilations=1,
# Inductor graph partition still captures 6 graph, same as fx graph partition
expected_num_cudagraph_captured=6,
)

View File

@ -367,11 +367,14 @@ def test_toy_llama(use_inductor: bool):
kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
with compilation_counter.expect(
num_graphs_seen=1, # one graph for the model
# One graph for the model
num_graphs_seen=1,
num_piecewise_graphs_seen=1,
num_piecewise_capturable_graphs_seen=1,
num_backend_compilations=1, # num_piecewise_capturable_graphs_seen
num_cudagraph_captured=2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
# num_piecewise_capturable_graphs_seen
num_backend_compilations=1,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
num_cudagraph_captured=2,
**kwargs,
):
outputs.append(
@ -478,9 +481,10 @@ def benchmark():
# it is fine here, because we only use the lambda function once.
runtime = do_bench(
lambda: graphs[b][0]( # noqa
input_ids[:b], positions[:b]
input_ids[:b], # noqa
positions[:b], # noqa
)
) # noqa
)
piecewise_cudagraph_time[b] = runtime
else:
runtime = do_bench(lambda: graphs[b][0].replay()) # noqa

View File

@ -243,7 +243,7 @@ def test_fix_functionalization(model_class: torch.nn.Module, do_fusion: bool):
# check if the functionalization pass is applied
for op in model.ops_in_model(do_fusion):
find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None # noqa: E501
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None
# make sure the ops were all de-functionalized
found = dict()

View File

@ -565,7 +565,7 @@ def test_attention_quant_pattern(
elif quant_key.dtype == FP4_DTYPE:
assert attn_nodes_post[0].kwargs.get("output_block_scale") is not None, (
"Attention should have output_block_scale after FP4 fusion"
) # noqa: E501
)
# Check that results are close
torch.testing.assert_close(result_unfused, result_fused_1, atol=1e-2, rtol=1e-2)

View File

@ -186,7 +186,7 @@ class TestQuantModel(torch.nn.Module):
):
# If fusion happens, the fused op is the one
# we check for (de)functionalization
return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default] # noqa: E501
return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default]
else:
# If no fusion, the original ops are checked
return [
@ -322,7 +322,7 @@ def sequence_parallelism_pass_on_test_model(
# check if the functionalization pass is applied
for op in model.ops_in_model():
find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None # noqa: E501
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None
# make sure the ops were all de-functionalized
found = dict()

View File

@ -104,7 +104,7 @@ TEXT_GENERATION_MODELS = {
# [Decoder-only]
# Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
"Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"), # noqa: E501
"Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"),
"baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
"baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
"bigscience/bloomz-1b1": PPTestSettings.fast(),
@ -138,7 +138,7 @@ TEXT_GENERATION_MODELS = {
# Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
"state-spaces/mamba-130m-hf": PPTestSettings.fast(),
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"), # noqa: E501
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"),
"mosaicml/mpt-7b": PPTestSettings.fast(),
"nvidia/Minitron-8B-Base": PPTestSettings.fast(),
"allenai/OLMo-1B-hf": PPTestSettings.fast(),
@ -151,13 +151,13 @@ TEXT_GENERATION_MODELS = {
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(
multi_node_only=True, load_format="dummy"
), # noqa: E501
),
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
"Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
"stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
"bigcode/starcoder2-3b": PPTestSettings.fast(),
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"), # noqa: E501
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"),
# FIXME: Cannot load tokenizer in latest transformers version.
# Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),

View File

@ -83,7 +83,8 @@ def sample_complex_json_schema():
"type": "array",
"items": {
"type": "string",
"pattern": "^[a-z]{1,10}$", # Combining length and pattern restrictions
# Combining length and pattern restrictions
"pattern": "^[a-z]{1,10}$",
},
},
},

View File

@ -145,7 +145,7 @@ async def test_single_chat_session_audio_base64encoded(
{
"type": "audio_url",
"audio_url": {
"url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
"url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" # noqa: E501
},
},
{"type": "text", "text": "What's happening in this audio?"},

View File

@ -835,17 +835,18 @@ async def test_extra_fields_allowed(client: openai.AsyncOpenAI):
@pytest.mark.asyncio
async def test_complex_message_content(client: openai.AsyncOpenAI):
content = [
{
"type": "text",
"text": "what is 1+1? please provide the result without any other text.",
}
]
resp = await client.chat.completions.create(
model=MODEL_NAME,
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "what is 1+1? please provide the result without any other text.",
}
],
"content": content,
}
],
temperature=0,

View File

@ -76,8 +76,8 @@ def test_load_chat_template():
assert (
template_content
== """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""
) # noqa: E501
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" # noqa: E501
)
def test_no_load_chat_template_filelike():

View File

@ -45,12 +45,13 @@ TOOLS = [
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'",
"description": "The city to find the weather for, e.g. "
"'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'",
"description": "the two-letter abbreviation for the state that "
"the city is in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
@ -69,7 +70,8 @@ MESSAGES = [
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
{
"role": "user",
"content": "Can you tell me what the temperate will be in Dallas, in fahrenheit?",
"content": "Can you tell me what the temperate will be in Dallas, "
"in fahrenheit?",
},
]

View File

@ -25,12 +25,14 @@ tools = [
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'Vienna'",
"description": "The city to find the weather for, e.g. "
"'Vienna'",
"default": "Vienna",
},
"country": {
"type": "string",
"description": "The country that the city is in, e.g. 'Austria'",
"description": "The country that the city is in, e.g. "
"'Austria'",
},
"unit": {
"type": "string",
@ -85,12 +87,14 @@ tools = [
"properties": {
"city": {
"type": "string",
"description": "The city to get the forecast for, e.g. 'Vienna'",
"description": "The city to get the forecast for, e.g. "
"'Vienna'",
"default": "Vienna",
},
"country": {
"type": "string",
"description": "The country that the city is in, e.g. 'Austria'",
"description": "The country that the city is in, e.g. "
"'Austria'",
},
"days": {
"type": "integer",

View File

@ -179,7 +179,7 @@ async def test_single_chat_session_video_base64encoded(
{
"type": "video_url",
"video_url": {
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501
},
},
{"type": "text", "text": "What's in this video?"},
@ -238,7 +238,7 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
{
"type": "video_url",
"video_url": {
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501
},
},
{"type": "text", "text": "What's in this video?"},

View File

@ -233,7 +233,7 @@ async def test_single_chat_session_image_base64encoded(
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
"url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501
},
},
{"type": "text", "text": content_text},
@ -300,7 +300,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
"url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501
},
},
{"type": "text", "text": "What's in this image?"},

View File

@ -947,7 +947,8 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
{"type": "image_url", "image_url": {"url": image_url}},
{
"type": "text",
"text": "What's in <|image_1|> and how does it compare to the other one?", # noqa: E501
"text": "What's in <|image_1|> and how does it compare to "
"the other one?",
},
],
}
@ -960,8 +961,8 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
assert conversation == [
{
"role": "user",
"content": "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the "
"other one?",
"content": "<|image_2|>\nWhat's in <|image_1|> and how does it compare to "
"the other one?",
}
]
_assert_mm_data_is_image_input(mm_data, 2)
@ -1364,7 +1365,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave( # noqa: E501
def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave(
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
image_url,
@ -1451,14 +1452,14 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
assert conversation == [
{
"role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
},
{"role": "assistant", "content": "Some stuff."},
{
"role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
},
]
@ -1468,7 +1469,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave( # noqa: E501
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave(
qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer,
image_url,
@ -1521,14 +1522,14 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
assert conversation == [
{
"role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
},
{"role": "assistant", "content": "Some stuff."},
{
"role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
},
]
@ -1593,14 +1594,14 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
assert conversation == [
{
"role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
},
{"role": "assistant", "content": "Some stuff."},
{
"role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
},
]
@ -1661,14 +1662,14 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
assert conversation == [
{
"role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
},
{"role": "assistant", "content": "Some stuff."},
{
"role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
},
]
@ -2193,7 +2194,8 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
assert conversation == [
{
"role": "user",
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?",
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the "
"audio say?",
}
]
_assert_mm_data_inputs(mm_data, {"audio": 1})
@ -2228,7 +2230,8 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
assert conversation == [
{
"role": "user",
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?",
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the "
"audio say?",
}
]
_assert_mm_data_inputs(await mm_future, {"audio": 1})

View File

@ -165,7 +165,7 @@ def test_env(
# FlashMLA only supports block_size == 64
pytest.skip("FlashMLA only supports block_size 64")
else:
from vllm.v1.attention.backends.mla.flashmla import ( # noqa: E501
from vllm.v1.attention.backends.mla.flashmla import (
is_flashmla_supported,
)

View File

@ -331,7 +331,8 @@ class WeightTensors:
in_dtype=config.dtype,
quant_dtype=config.quant_dtype,
block_shape=config.quant_block_shape,
per_out_ch_quant=config.is_per_act_token_quant, # or config.is_per_out_ch_quant
# or config.is_per_out_ch_quant
per_out_ch_quant=config.is_per_act_token_quant,
)
return WeightTensors(
w1=w1, w2=w2, w1_scale=w1_scale, w2_scale=w2_scale, w1_gs=w1_gs, w2_gs=w2_gs

View File

@ -124,7 +124,7 @@ def make_feature_matrix(csv_file_path: str):
results_df: Optional[pd.DataFrame] = None
for m, k, n, e, topks, dtype, pf_type, experts_type, quant_config in tqdm(
combinations
): # noqa: E501
):
config = Config(
Ms=[m],
K=k,

View File

@ -10,7 +10,7 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
BatchedDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501
from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (
BatchedTritonOrDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.config import (
@ -196,10 +196,10 @@ register_experts(
# Disable on blackwell for now
if has_deep_ep() and not current_platform.has_device_capability(100):
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
DeepEPHTPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
DeepEPLLPrepareAndFinalize,
)
@ -233,7 +233,7 @@ if has_pplx():
)
if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
FlashInferExperts,
)
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501

View File

@ -17,10 +17,10 @@ from typing_extensions import Concatenate, ParamSpec
from vllm.utils import get_open_port, has_deep_ep
if has_deep_ep():
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
DeepEPHTPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
DeepEPLLPrepareAndFinalize,
)

View File

@ -30,10 +30,10 @@ from .parallel_utils import ProcessGroupInfo, parallel_launch
from .utils import make_test_weights
if has_deep_ep():
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
DeepEPHTPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
DeepEPLLPrepareAndFinalize,
)

View File

@ -28,10 +28,10 @@ from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch
if has_deep_ep():
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
DeepEPHTPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
DeepEPLLPrepareAndFinalize,
)

View File

@ -271,7 +271,7 @@ if __name__ == "__main__":
parser = make_config_arg_parser(
description=(
"Run single prepare-finalize & fused-experts combination test"
"Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations " # noqa: E501
"Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations "
"--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
)
)

View File

@ -483,8 +483,8 @@ def test_mixtral_moe(
}
if use_rocm_aiter:
# The values of rtol and atol are set based on the tests in ROCM AITER package. # noqa: E501
# https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174 # noqa: E501
# The values of rtol and atol are set based on the tests in ROCM AITER package.
# https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174
torch.testing.assert_close(
hf_states.flatten(0, 1), vllm_states, rtol=0.01, atol=100
)

View File

@ -10,11 +10,11 @@ import pytest
import torch
from packaging import version
from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501
from vllm.model_executor.layers.quantization.quark.quark import (
QuarkLinearMethod,
QuarkW4A4MXFP4,
)
from vllm.model_executor.layers.quantization.quark.quark_moe import ( # noqa: E501
from vllm.model_executor.layers.quantization.quark.quark_moe import (
QuarkW4A4MXFp4MoEMethod,
)
from vllm.platforms import current_platform

View File

@ -12,7 +12,7 @@ PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example
EXPECTED_LORA_OUTPUT = [
"SELECT count(*) FROM singer",
"SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", # noqa: E501
"SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'",
"SELECT name , country , age FROM singer ORDER BY age",
]
@ -21,10 +21,16 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [
PROMPT_TEMPLATE.format(query="How many singers do we have?"),
PROMPT_TEMPLATE.format(
query="What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
query=(
"What is the average, minimum, and maximum "
"age of all singers from France?"
)
),
PROMPT_TEMPLATE.format(
query="Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501
query=(
"Show name, country, age for all singers ordered "
"by age from the oldest to the youngest."
)
),
]
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)

View File

@ -15,10 +15,10 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf"
EXPECTED_LORA_OUTPUT = [
" SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501
" SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501
" SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",
" SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501
" SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501
" SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501
" SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",
" SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ", # noqa: E501
]

View File

@ -26,7 +26,7 @@ LORA_RANK = 8
LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"]
LORA_TEST_EXPECTED = [
"GitHub is an open-source platform that provides a way to manage and develop software projects. It allows developers to store and manage code, collaborate on projects, and automate tasks.", # noqa: E501
"I am Alice, an AI assistant developed by GitHub/Charent.", # noqa: E501
"I am Alice, an AI assistant developed by GitHub/Charent.",
]

View File

@ -16,7 +16,7 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
) as llm:
if model == "google/gemma-3-4b-it":
normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item()
lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item() # noqa: E501
)
config = llm.llm.llm_engine.model_config.hf_config.text_config
else:

View File

@ -46,12 +46,13 @@ TOOLS = [
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'",
"description": "The city to find the weather for, e.g. "
"'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'",
"description": "the two-letter abbreviation for the state that "
"the city is in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
@ -85,7 +86,8 @@ MSGS = [
{"role": "system", "content": "You are an assistant."},
{
"role": "user",
"content": "Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors.", # noqa
"content": "Could you please rewrite the below article? \n\n My English needs "
"improvving, maybe I make errors.",
},
{
"role": "assistant",
@ -96,14 +98,16 @@ MSGS = [
"type": "function",
"function": {
"name": "rewrite",
"arguments": '{"text":"My English needs improvving, maybe I make errors."}', # noqa
"arguments": '{"text":"My English needs improvving, maybe '
'I make errors."}',
},
}
],
},
{
"role": "tool",
"content": '{"action":"rewrite","outcome":"My English needs improving, maybe I make errors."}', # noqa
"content": '{"action":"rewrite","outcome":"My English needs improving, maybe '
'I make errors."}',
"tool_call_id": "bbc5b7ede",
"name": "rewrite",
},

View File

@ -130,14 +130,14 @@ VLM_TEST_SETTINGS = {
dtype="bfloat16",
marks=[
pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
], # noqa: E501
],
),
"qwen2_5_vl": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
@ -149,8 +149,8 @@ VLM_TEST_SETTINGS = {
models=["Qwen/Qwen2.5-Omni-3B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>",
video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>",
max_model_len=4096,
max_num_seqs=2,
num_logprobs=6 if current_platform.is_cpu() else 5,
@ -181,7 +181,7 @@ VLM_TEST_SETTINGS = {
max_model_len=16384,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
), # noqa: E501
),
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
image_size_factors=[(0.25, 0.5, 1.0)],
@ -213,7 +213,7 @@ VLM_TEST_SETTINGS = {
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
@ -237,10 +237,10 @@ VLM_TEST_SETTINGS = {
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<vlm_image>Please describe the image shortly.",
"cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
"cherry_blossom": "<vlm_image>Please infer the season with reason.",
}
),
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",
stop_str=["<|im_end|>"],
image_size_factors=[(0.10, 0.15)],
max_tokens=64,
@ -252,11 +252,11 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>What is the season?", # noqa: E501
"stop_sign": "<image>What's the content in the center of the image?",
"cherry_blossom": "<image>What is the season?",
}
),
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
multi_image_prompt="<image><image>Describe the two images in detail.",
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
@ -268,11 +268,11 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>What is the season?", # noqa: E501
"stop_sign": "<image>What's the content in the center of the image?",
"cherry_blossom": "<image>What is the season?",
}
),
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
multi_image_prompt="<image><image>Describe the two images in detail.",
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
@ -311,14 +311,14 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2,
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
"stop_sign": "<image>\nWhat's the content in the center of the image?",
"cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501
}
),
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
stop_str=["<end▁of▁sentence>", "<begin▁of▁sentence>"], # noqa: E501
stop_str=["<end▁of▁sentence>", "<begin▁of▁sentence>"],
image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
),
"fuyu": VLMTestInfo(
@ -342,7 +342,7 @@ VLM_TEST_SETTINGS = {
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<start_of_image>What is the season?", # noqa: E501
"cherry_blossom": "<start_of_image>What is the season?",
}
),
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
@ -356,7 +356,7 @@ VLM_TEST_SETTINGS = {
"glm4v": VLMTestInfo(
models=["zai-org/glm-4v-9b"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
@ -377,9 +377,9 @@ VLM_TEST_SETTINGS = {
"glm4_1v": VLMTestInfo(
models=["zai-org/GLM-4.1V-9B-Thinking"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
max_model_len=2048,
max_num_seqs=2,
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
@ -410,10 +410,10 @@ VLM_TEST_SETTINGS = {
"h2oai/h2ovl-mississippi-2b",
],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>",
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
"stop_sign": "<image>\nWhat's the content in the center of the image?",
"cherry_blossom": "<image>\nWhat is the season?",
}
),
@ -444,7 +444,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
"stop_sign": "<image>\nWhat's the content in the center of the image?",
"cherry_blossom": "<image>\nWhat is the season?",
}
),
@ -529,7 +529,7 @@ VLM_TEST_SETTINGS = {
max_model_len=16384,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
), # noqa: E501
),
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[
@ -583,7 +583,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
["<|im_end|>", "<|endoftext|>"]
), # noqa: E501
),
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
# FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
@ -598,7 +598,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
["<|im_end|>", "<|endoftext|>"]
), # noqa: E501
),
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
),
@ -627,7 +627,7 @@ VLM_TEST_SETTINGS = {
models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n",
max_model_len=4096,
max_num_seqs=2,
dtype="half",
@ -640,7 +640,7 @@ VLM_TEST_SETTINGS = {
models=["AIDC-AI/Ovis2-1B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n",
max_model_len=4096,
max_num_seqs=2,
dtype="half",
@ -652,7 +652,7 @@ VLM_TEST_SETTINGS = {
models=["AIDC-AI/Ovis2.5-2B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n",
video_idx_to_prompt=lambda idx: "<video>\n",
max_model_len=4096,
max_num_seqs=2,
@ -701,8 +701,8 @@ VLM_TEST_SETTINGS = {
models=["Qwen/Qwen2-VL-2B-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
@ -717,11 +717,11 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"<begin▁of▁sentence><User>\n{img_prompt}<Assistant><think>\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
"stop_sign": "<image>\nWhat's the content in the center of the image?",
"cherry_blossom": "<image>\nWhat is the season?",
}
),
multi_image_prompt="<image>\n<image>\nDescribe the two images in short.", # noqa: E501
multi_image_prompt="<image>\n<image>\nDescribe the two images in short.",
max_model_len=4096,
use_tokenizer_eos=True,
patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
@ -754,8 +754,8 @@ VLM_TEST_SETTINGS = {
VLMTestType.VIDEO,
),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
@ -816,7 +816,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForImageTextToText,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
), # noqa: E501
),
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[
CustomTestOptions(

View File

@ -170,7 +170,7 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
],
{
"type": "text",
"text": f"What's happening in these {len(audio_assets)} audio clips?",
"text": f"What's happening in these {len(audio_assets)} audio clips?", # noqa: E501
},
],
}

View File

@ -101,16 +101,11 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
return audio_dict
audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
text = f"What's happening in these {len(audio_assets)} audio clips?"
messages = [
{
"role": "user",
"content": [
*audio_chunks,
{
"type": "text",
"text": f"What's happening in these {len(audio_assets)} audio clips?",
},
],
"content": [*audio_chunks, {"type": "text", "text": text}],
}
]

View File

@ -102,8 +102,8 @@ def multi_video_multi_aspect_ratio_inputs(
def different_patch_input_cases_internvl():
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
formatter = (
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"
) # noqa: E501
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
)
single_img_prompts = [
"<image>\nWhat's the content in the center of the image?",
"<image>\nWhat is the season?",

View File

@ -47,7 +47,8 @@ EXAMPLE_TOOLS = [
"properties": {
"city": {
"type": "string",
"description": "The city to get the forecast for, e.g. 'New York'",
"description": "The city to get the forecast for, e.g. "
"'New York'",
},
"days": {
"type": "integer",

View File

@ -134,15 +134,15 @@ def get_attention_backend(backend_name: _Backend):
else "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
),
_Backend.FLASHINFER: "vllm.v1.attention.backends.flashinfer.FlashInferBackend",
_Backend.FLEX_ATTENTION: "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend",
_Backend.TRITON_ATTN: "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend",
_Backend.FLEX_ATTENTION: "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend", # noqa: E501
_Backend.TRITON_ATTN: "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend", # noqa: E501
_Backend.TREE_ATTN: "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend",
_Backend.XFORMERS: "vllm.v1.attention.backends.xformers.XFormersAttentionBackend",
_Backend.CUTLASS_MLA: "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend",
_Backend.XFORMERS: "vllm.v1.attention.backends.xformers.XFormersAttentionBackend", # noqa: E501
_Backend.CUTLASS_MLA: "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend", # noqa: E501
_Backend.FLASHMLA: "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend",
_Backend.FLASH_ATTN_MLA: "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend",
_Backend.FLASHINFER_MLA: "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend",
_Backend.TRITON_MLA: "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend",
_Backend.FLASH_ATTN_MLA: "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend", # noqa: E501
_Backend.FLASHINFER_MLA: "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend", # noqa: E501
_Backend.TRITON_MLA: "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend", # noqa: E501
}
if backend_name not in backend_map:

View File

@ -104,7 +104,7 @@ async def test_single_chat_session_image_base64encoded(
"content": [
{
"type": "input_image",
"image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
"image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", # noqa: E501
"detail": "auto",
},
{"type": "input_text", "text": content_text},

View File

@ -15,8 +15,9 @@ RTOL = 0.03
EXPECTED_VALUES = {"Qwen/Qwen3-0.6B": 0.41, "deepseek-ai/deepseek-vl2-small": 0.59}
SIMPLE_PROMPT = (
"The best part about working on vLLM is that I got to meet so many people across various different organizations like UCB, Google, and Meta which means",
) # noqa: E501
"The best part about working on vLLM is that I got to meet so many people across "
"various different organizations like UCB, Google, and Meta which means",
)
# Get model name from environment variable
MODEL_NAME = os.environ.get("TEST_MODEL", "Qwen/Qwen3-0.6B")

View File

@ -127,7 +127,7 @@ class RequestRunner:
kv_role="kv_both",
kv_connector_extra_config={
"spec_name": "MockOffloadingSpec",
"spec_module_path": "tests.v1.kv_connector.unit.test_offloading_connector",
"spec_module_path": "tests.v1.kv_connector.unit.test_offloading_connector", # noqa: E501
"block_size": offloaded_block_size,
},
)

View File

@ -260,15 +260,8 @@ def test_pooling_rejects_custom_logitsprocs(
gpu_memory_utilization=0.1,
)
# Require that no logitsprocs have been loaded
assert (
sum(
[
1
for _ in llm.llm_engine.model_executor.driver_worker.worker.model_runner.input_batch.logitsprocs.all
]
)
== 0
)
worker = llm.llm_engine.model_executor.driver_worker.worker
assert sum([1 for _ in worker.model_runner.input_batch.logitsprocs.all]) == 0
return
kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {}

View File

@ -76,10 +76,14 @@ def _kv_cache_update_kernel(
static_argnames=["page_size", "num_slices_per_block"],
)
def kv_cache_update(
new_kv: jax.Array, # [total_num_token, num_combined_kv_heads, head_dim]
slices: jax.Array, # [3, slices], list of (kv_cache_start, new_kv_start, slice_len)
kv_cache: jax.Array, # [total_num_pages * page_size, num_combined_kv_heads, head_dim]
num_kv_update_slices: jax.Array, # [1]
# [total_num_token, num_combined_kv_heads, head_dim]
new_kv: jax.Array,
# [3, slices], list of (kv_cache_start, new_kv_start, slice_len)
slices: jax.Array,
# [total_num_pages * page_size, num_combined_kv_heads, head_dim]
kv_cache: jax.Array,
# [1]
num_kv_update_slices: jax.Array,
*,
page_size: int = 32,
num_slices_per_block: int = 8,

View File

@ -834,7 +834,10 @@ class AllReduceFusedRMSNormStaticQuantFP8Pattern(BasePattern):
scale_out=None,
rms_gamma=weight,
rms_eps=self.epsilon,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant, # we don't use norm_out afterwards
# We don't use norm_out afterwards
pattern_code=(
flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant
),
scale_factor=scale,
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
)
@ -928,11 +931,14 @@ class AllReduceFusedAddRMSNormStaticQuantFP8Pattern(BasePattern):
scale_out=None,
rms_gamma=weight,
rms_eps=self.epsilon,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant, # we don't use norm_out afterwards
# We don't use norm_out afterwards
pattern_code=(
flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant
),
scale_factor=scale,
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
)
# # quant_out, rms_norm_residual
# quant_out, rms_norm_residual
return allreduce[4], allreduce[2]
pm.register_replacement(
@ -1028,7 +1034,10 @@ class AllReduceFusedRMSNormStaticQuantNVFP4Pattern(BasePattern):
scale_out=output_scale,
rms_gamma=weight,
rms_eps=self.epsilon,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant, # we don't use norm_out afterwards
# We don't use norm_out afterwards
pattern_code=(
flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant
),
scale_factor=input_global_scale,
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
)
@ -1130,7 +1139,10 @@ class AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(BasePattern):
scale_out=output_scale,
rms_gamma=weight,
rms_eps=self.epsilon,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant, # we don't use norm_out afterwards
# We don't use norm_out afterwards
pattern_code=(
flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant
),
scale_factor=input_global_scale,
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
)

View File

@ -119,9 +119,12 @@ class TorchCompileWrapperWithCustomDispatcher:
src = depyf.decompile(new_code)
msg = (
"Assigning / modifying buffers of nn.Module during forward pass is not allowed when using cudagraph inside the compiler because it will cause silent errors. Please use eager mode or fix the code. The following code contains clues about which buffer is being modified (please search for the usage of the function `update`):\n"
+ src
) # noqa
"Assigning / modifying buffers of nn.Module during forward pass is not "
"allowed when using cudagraph inside the compiler because it will "
"cause silent errors. Please use eager mode or fix the code. The "
"following code contains clues about which buffer is being modified "
f"(please search for the usage of the function `update`):\n{src}"
)
raise RuntimeError(msg)
@contextmanager
@ -132,8 +135,9 @@ class TorchCompileWrapperWithCustomDispatcher:
variables as the original code. Therefore we can directly switch
the code object in the function and call it.
See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7 for more details.
""" # noqa
See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7
for more details.
"""
self.__class__.forward.__code__ = self.compiled_codes[index]
yield
self.__class__.forward.__code__ = self.original_code_object

View File

@ -472,7 +472,7 @@ class VllmConfig:
self.compilation_config.cudagraph_mode.has_full_cudagraphs()
and self.model_config is not None
and not self.model_config.disable_cascade_attn
and not self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs()
and not self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs() # noqa: E501
):
logger.warning_once(
"No piecewise cudagraph for executing cascade attention."

View File

@ -147,8 +147,9 @@ class PPLXAll2AllManager(All2AllManagerBase):
def __init__(self, cpu_group):
assert has_pplx(), (
"pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels."
) # noqa
"pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
" to install pplx_kernels."
)
super().__init__(cpu_group)
if self.internode:
@ -220,7 +221,8 @@ class DeepEPAll2AllManagerBase(All2AllManagerBase):
def __init__(self, cpu_group):
assert has_deep_ep(), (
"DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install DeepEP kernels."
"DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
" to install DeepEP kernels."
) # noqa
super().__init__(cpu_group)
self.handle_cache = Cache()

View File

@ -471,7 +471,8 @@ class ChatCompletionRequest(OpenAIBaseModel):
top_logprobs: Optional[int] = 0
max_tokens: Optional[int] = Field(
default=None,
deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
deprecated="max_tokens is deprecated in favor of "
"the max_completion_tokens field",
)
max_completion_tokens: Optional[int] = None
n: Optional[int] = 1

View File

@ -31,7 +31,7 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
if self.base_layer.num_added_embeddings_per_partition > 0:
# We can start adding lora weights
self.embeddings_weights = self.base_layer.weight.data[
self.base_layer.num_org_embeddings_per_partition : self.base_layer.num_org_embeddings_per_partition
self.base_layer.num_org_embeddings_per_partition : self.base_layer.num_org_embeddings_per_partition # noqa: E501
+ self.base_layer.num_added_embeddings_per_partition
]
self.embeddings_slice = (

View File

@ -107,8 +107,8 @@ class PTPCFp8LinearMethod(Fp8LinearMethod):
layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
assert layer.weight.data.dtype == torch.bfloat16, (
f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified."
) # noqa: E501
f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified." # noqa: E501
)
# Quantize the weights.
qweight, weight_scale = ops.scaled_fp8_quant(
layer.weight, scale=None, use_per_token_if_dynamic=True

View File

@ -391,7 +391,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
total_shard_sizes = next(
(
sizes
for module, sizes in self.maybe_fused_weights_modules.items()
for module, sizes in self.maybe_fused_weights_modules.items() # noqa: E501
if check_match(mapped_weight_name, module)
)
)

View File

@@ -270,8 +270,8 @@ class BailingMoE(nn.Module):
) or (
self.score_function == "sigmoid" and self.correction_bias is not None
), (
"score_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)"
) # noqa: E501
"score_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)" # noqa: E501
)
else:
# default value for scoring_func
self.score_function = "softmax"

View File

@@ -825,10 +825,10 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
# Run MM-Projector
# len(num_grids) == len(num_queries_vis_abstractors) + 1
grid_idx = 0
num_grids = [
grid_idx
] # e.g. [0, 9, 18, 19, 27, 28, 36, 37, 45, 46, 54, 55, 56]
num_queries_vis_abstractors = [] # e.g. [81, 81, 81, 9, 81, 9, 81, 9, 81, 9, 81, 9]
# e.g. [0, 9, 18, 19, 27, 28, 36, 37, 45, 46, 54, 55, 56]
num_grids = [grid_idx]
# e.g. [81, 81, 81, 9, 81, 9, 81, 9, 81, 9, 81, 9]
num_queries_vis_abstractors = []
len_total_frames = video_forward_outs.shape[0]
if self.config.first_last_frames_slow:
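
The hunk above shortens the lines by moving the example-value comments onto their own lines above the assignments; the same move works for trailing comments on keyword arguments, as in the MLA hunks further down. A small hypothetical sketch:

def run(*, return_lse: bool, is_cuda_graph_compatible: bool) -> str:
    return f"lse={return_lse}, graph_ok={is_cuda_graph_compatible}"

out = run(
    # Do not support False for now.
    return_lse=True,
    # Indicates actual_seq_lens are on GPU or CPU.
    is_cuda_graph_compatible=True,
)
print(out)   # lse=True, graph_ok=True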

View File

@@ -154,9 +154,10 @@ class LlamaModel(nn.Module):
str(layer_index), str(layer_index + start_layer_id)
)
quant_config.torchao_config.module_fqn_to_config = {
torchao_config = quant_config.torchao_config
torchao_config.module_fqn_to_config = {
pad_layer_name(layer): quantization
for layer, quantization in quant_config.torchao_config.module_fqn_to_config.items()
for layer, quantization in torchao_config.module_fqn_to_config.items()
}
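
A runnable toy version of the refactor above; `SimpleNamespace` stands in for the real config objects and `pad_layer_name` is simplified:

from types import SimpleNamespace

quant_config = SimpleNamespace(
    torchao_config=SimpleNamespace(
        module_fqn_to_config={"layers.0.mlp": "int8", "layers.1.mlp": "fp8"}
    )
)

def pad_layer_name(name: str) -> str:
    return f"model.{name}"

# Binding the nested attribute to a short local avoids repeating the long
# chain inside the comprehension; the right-hand side is fully evaluated
# before the reassignment, so reading and writing the same dict is safe.
torchao_config = quant_config.torchao_config
torchao_config.module_fqn_to_config = {
    pad_layer_name(layer): quantization
    for layer, quantization in torchao_config.module_fqn_to_config.items()
}
print(torchao_config.module_fqn_to_config)
# {'model.layers.0.mlp': 'int8', 'model.layers.1.mlp': 'fp8'}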

View File

@@ -186,26 +186,26 @@ class LongCatFlashMTP(nn.Module, SupportsPP):
"model.mtp.layers.0.eh_proj.weight_scale_inv": "eh_proj.weight_scale_inv",
"model.mtp.layers.0.enorm.m.weight": "enorm.weight",
"model.mtp.layers.0.hnorm.m.weight": "hnorm.weight",
"model.mtp.layers.0.input_layernorm.weight": "model.layers.0.input_layernorm.weight",
"model.mtp.layers.0.post_attention_layernorm.weight": "model.layers.0.post_attention_layernorm.weight",
"model.mtp.layers.0.self_attn.kv_a_layernorm.weight": "model.layers.0.self_attn.kv_a_layernorm.weight",
"model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight",
"model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv",
"model.mtp.layers.0.self_attn.kv_b_proj.weight": "model.layers.0.self_attn.kv_b_proj.weight",
"model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_inv": "model.layers.0.self_attn.kv_b_proj.weight_scale_inv",
"model.mtp.layers.0.self_attn.o_proj.weight": "model.layers.0.self_attn.o_proj.weight",
"model.mtp.layers.0.self_attn.o_proj.weight_scale_inv": "model.layers.0.self_attn.o_proj.weight_scale_inv",
"model.mtp.layers.0.self_attn.q_a_layernorm.weight": "model.layers.0.self_attn.q_a_layernorm.weight",
"model.mtp.layers.0.self_attn.q_a_proj.weight": "model.layers.0.self_attn.q_a_proj.weight",
"model.mtp.layers.0.self_attn.q_a_proj.weight_scale_inv": "model.layers.0.self_attn.q_a_proj.weight_scale_inv",
"model.mtp.layers.0.self_attn.q_b_proj.weight": "model.layers.0.self_attn.q_b_proj.weight",
"model.mtp.layers.0.self_attn.q_b_proj.weight_scale_inv": "model.layers.0.self_attn.q_b_proj.weight_scale_inv",
"model.mtp.layers.0.transformer_layer.mlp.down_proj.weight": "model.layers.0.mlp.down_proj.weight",
"model.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_inv": "model.layers.0.mlp.down_proj.weight_scale_inv",
"model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight": "model.layers.0.mlp.gate_proj.weight",
"model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv": "model.layers.0.mlp.gate_proj.weight_scale_inv",
"model.mtp.layers.0.transformer_layer.mlp.up_proj.weight": "model.layers.0.mlp.up_proj.weight",
"model.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_inv": "model.layers.0.mlp.up_proj.weight_scale_inv",
"model.mtp.layers.0.input_layernorm.weight": "model.layers.0.input_layernorm.weight", # noqa: E501
"model.mtp.layers.0.post_attention_layernorm.weight": "model.layers.0.post_attention_layernorm.weight", # noqa: E501
"model.mtp.layers.0.self_attn.kv_a_layernorm.weight": "model.layers.0.self_attn.kv_a_layernorm.weight", # noqa: E501
"model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight", # noqa: E501
"model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.self_attn.kv_b_proj.weight": "model.layers.0.self_attn.kv_b_proj.weight", # noqa: E501
"model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_inv": "model.layers.0.self_attn.kv_b_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.self_attn.o_proj.weight": "model.layers.0.self_attn.o_proj.weight", # noqa: E501
"model.mtp.layers.0.self_attn.o_proj.weight_scale_inv": "model.layers.0.self_attn.o_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.self_attn.q_a_layernorm.weight": "model.layers.0.self_attn.q_a_layernorm.weight", # noqa: E501
"model.mtp.layers.0.self_attn.q_a_proj.weight": "model.layers.0.self_attn.q_a_proj.weight", # noqa: E501
"model.mtp.layers.0.self_attn.q_a_proj.weight_scale_inv": "model.layers.0.self_attn.q_a_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.self_attn.q_b_proj.weight": "model.layers.0.self_attn.q_b_proj.weight", # noqa: E501
"model.mtp.layers.0.self_attn.q_b_proj.weight_scale_inv": "model.layers.0.self_attn.q_b_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.down_proj.weight": "model.layers.0.mlp.down_proj.weight", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_inv": "model.layers.0.mlp.down_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight": "model.layers.0.mlp.gate_proj.weight", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv": "model.layers.0.mlp.gate_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.up_proj.weight": "model.layers.0.mlp.up_proj.weight", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_inv": "model.layers.0.mlp.up_proj.weight_scale_inv", # noqa: E501
"model.mtp.norm.weight": "final_layernorm.weight",
}
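
Checkpoint weight names like these cannot be wrapped, so a per-line `# noqa: E501` scopes the exemption to exactly the lines that need it. A minimal example: both entries below appear in the mapping above, and only the second needs the marker:

CKPT_TO_MODEL = {
    "model.mtp.layers.0.enorm.m.weight": "enorm.weight",
    "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv": "model.layers.0.mlp.gate_proj.weight_scale_inv",  # noqa: E501
}
print(len(CKPT_TO_MODEL))   # 2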

View File

@@ -1000,8 +1000,8 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
"base_layer.": "",
},
orig_to_new_prefix={
"model.embed_tokens_extend.audio_embed.audio_projection.vision.": "embed_tokens_extend.audio_projection_for_vision.",
"model.embed_tokens_extend.audio_embed.audio_projection.speech.": "embed_tokens_extend.audio_projection.",
"model.embed_tokens_extend.audio_embed.audio_projection.vision.": "embed_tokens_extend.audio_projection_for_vision.", # noqa: E501
"model.embed_tokens_extend.audio_embed.audio_projection.speech.": "embed_tokens_extend.audio_projection.", # noqa: E501
"model.embed_tokens_extend.audio_embed.": "embed_tokens_extend.",
"model.embed_tokens_extend.image_embed.": "vision_encoder.",
},

View File

@@ -916,8 +916,9 @@ class Qwen3NextDecoderLayer(nn.Module):
)
else:
assert len(hidden_states.shape) == len(self.ffn_layer_scale.shape), (
f"shape must be the same {len(hidden_states.shape)}, {len(self.ffn_layer_scale.shape)}"
) # noqa: E501
f"shape must be the same {len(hidden_states.shape)}, "
f"{len(self.ffn_layer_scale.shape)}"
)
hidden_states = hidden_states * (
self.ffn_layer_scale.to(hidden_states.dtype) + 1
)

View File

@@ -255,8 +255,8 @@ def is_rocm_aiter_fp8bmm_enabled() -> bool:
if is_rocm_aiter_fp8bmm_enabled():
from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import ( # noqa: E501 # isort: skip
batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_triton_fp8_bmm,
from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import ( # noqa: E501
batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_triton_fp8_bmm, # noqa: E501
)
def dynamic_per_batched_tensor_quant(
@@ -1284,8 +1284,10 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
actual_seq_lens_q=prefill.query_seq_lens.view(-1, 1, 1, 1),
actual_seq_lens_kv=prefill.query_seq_lens.view(-1, 1, 1, 1),
causal=True,
return_lse=True, # do not support False for now
is_cuda_graph_compatible=True, # Indicates actual_seq_lens are on GPU or CPU.
# Do not support False for now
return_lse=True,
# Indicates actual_seq_lens are on GPU or CPU.
is_cuda_graph_compatible=True,
)
if return_softmax_lse:
return output, lse
@@ -1342,7 +1344,8 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
),
causal=False,
return_lse=True,
is_cuda_graph_compatible=True, # Indicates actual_seq_lens are on GPU or CPU.
# Indicates actual_seq_lens are on GPU or CPU.
is_cuda_graph_compatible=True,
)
def process_weights_after_loading(self, act_dtype: torch.dtype):

View File

@@ -872,10 +872,13 @@ def wait_for_engine_startup(
EngineHandshakeMetadata(
addresses=addresses,
parallel_config={
"data_parallel_master_ip": parallel_config.data_parallel_master_ip,
"data_parallel_master_port": parallel_config.data_parallel_master_port,
"_data_parallel_master_port_list": parallel_config._data_parallel_master_port_list,
"data_parallel_size": parallel_config.data_parallel_size,
k: getattr(parallel_config, k)
for k in (
"data_parallel_master_ip",
"data_parallel_master_port",
"_data_parallel_master_port_list",
"data_parallel_size",
)
},
)
)
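
The comprehension above replaces four repetitive `"key": parallel_config.<key>` entries. A self-contained sketch using three of the keys with a hypothetical config class; one trade-off worth noting is that the attribute names become runtime strings, so a typo surfaces as an `AttributeError` from `getattr` rather than being caught by static analysis:

from dataclasses import dataclass

@dataclass
class _ParallelConfig:
    data_parallel_master_ip: str = "127.0.0.1"
    data_parallel_master_port: int = 29500
    data_parallel_size: int = 2

parallel_config = _ParallelConfig()

payload = {
    k: getattr(parallel_config, k)
    for k in (
        "data_parallel_master_ip",
        "data_parallel_master_port",
        "data_parallel_size",
    )
}
print(payload)
# {'data_parallel_master_ip': '127.0.0.1',
#  'data_parallel_master_port': 29500, 'data_parallel_size': 2}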

View File

@@ -345,13 +345,15 @@ def report_usage_stats(
from vllm.model_executor.model_loader import get_architecture_class_name
parallel_config = vllm_config.parallel_config
usage_message.report_usage(
get_architecture_class_name(vllm_config.model_config),
usage_context,
extra_kvs={
# Common configuration
"dtype": str(vllm_config.model_config.dtype),
"tensor_parallel_size": vllm_config.parallel_config.tensor_parallel_size,
"tensor_parallel_size": parallel_config.tensor_parallel_size,
"block_size": vllm_config.cache_config.block_size,
"gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization,
"kv_cache_memory_bytes": vllm_config.cache_config.kv_cache_memory_bytes,
@@ -362,7 +364,7 @@ def report_usage_stats(
"enable_lora": bool(vllm_config.lora_config),
"enable_prefix_caching": vllm_config.cache_config.enable_prefix_caching,
"enforce_eager": vllm_config.model_config.enforce_eager,
"disable_custom_all_reduce": vllm_config.parallel_config.disable_custom_all_reduce,
"disable_custom_all_reduce": parallel_config.disable_custom_all_reduce,
},
)

View File

@@ -3391,7 +3391,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
attn_metadata[ubid][layer_name] = attn_metadata_i
else:
assert type(attn_metadata) is dict
attn_metadata_i = attn_group.get_metadata_builder().build_for_cudagraph_capture(
metadata_builder = attn_group.get_metadata_builder()
attn_metadata_i = metadata_builder.build_for_cudagraph_capture(
common_attn_metadata
)
for layer_name in attn_group.layer_names:
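
As with the torchao hunk earlier, the fix names an intermediate so the call fits on one line; a toy sketch with stand-in classes, not vLLM's real builder API:

class _Builder:
    def build_for_cudagraph_capture(self, metadata: dict) -> dict:
        return {"built_for_capture": True, **metadata}

class _AttnGroup:
    def get_metadata_builder(self) -> _Builder:
        return _Builder()

attn_group = _AttnGroup()
common_attn_metadata = {"num_tokens": 8}

# Two short steps instead of one long chained call.
metadata_builder = attn_group.get_metadata_builder()
attn_metadata_i = metadata_builder.build_for_cudagraph_capture(
    common_attn_metadata
)
print(attn_metadata_i)   # {'built_for_capture': True, 'num_tokens': 8}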