Fix per file ruff ignores related to line length (#26262)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor
Date: 2025-10-06 06:12:40 +01:00
Committed by: GitHub
Commit: 6c04638214 (parent: 91ac7f764d)
65 changed files with 301 additions and 291 deletions
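
The hunks below follow a handful of recurring patterns: the temporary per-file E501 ignores are deleted from pyproject.toml, and the offending lines are then shortened directly, typically by splitting long string literals via implicit concatenation, moving trailing comments above the code they annotate, or parenthesizing long imports. Inline "# noqa: E501" markers are kept only where a line genuinely cannot be broken (long prompt templates, data URLs, SQL strings). A minimal illustrative sketch of these patterns follows; it is a stand-alone example, not code from the diff, and argparse stands in for the vLLM modules actually touched.

# Pattern 1: wrap a long "from ... import ..." in parentheses
# instead of silencing E501.
from argparse import (
    ArgumentParser,
)

# Pattern 2: split a long string across adjacent literals
# (implicit concatenation) rather than appending "# noqa: E501".
parser = ArgumentParser()
parser.add_argument(
    "--tokenizer",
    type=str,
    help="Name or path of the tokenizer, "
    "if not using the default tokenizer.",
)

# Pattern 3: move a long trailing comment onto its own line
# above the value it annotates.
# 2 * num_layers + 1
expected_num_piecewise_graphs_seen = 5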

View File

@ -164,7 +164,7 @@ def invoke_main() -> None:
) )
parser.add_argument( parser.add_argument(
"--batched", action="store_true", help="consider time to prepare batch" "--batched", action="store_true", help="consider time to prepare batch"
) # noqa: E501 )
parser.add_argument( parser.add_argument(
"--num-iteration", "--num-iteration",
type=int, type=int,

View File

@ -909,13 +909,13 @@ def create_argument_parser():
parser.add_argument( parser.add_argument(
"--tokenizer", "--tokenizer",
type=str, type=str,
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 help="Name or path of the tokenizer, if not using the default tokenizer.",
) )
parser.add_argument( parser.add_argument(
"--tokenizer-mode", "--tokenizer-mode",
type=str, type=str,
default="auto", default="auto",
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 help="Name or path of the tokenizer, if not using the default tokenizer.",
) )
parser.add_argument( parser.add_argument(
"--num-prompts", "--num-prompts",

View File

@ -72,8 +72,8 @@ VLLMKernelScheduleTag: dict[
] = { ] = {
**KernelScheduleTag, # type: ignore **KernelScheduleTag, # type: ignore
**{ **{
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501
MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong", MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong", # noqa: E501
MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative", MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative", # noqa: E501
}, },
} }

View File

@ -113,7 +113,7 @@ def run_e5_v(query: Query) -> ModelRequestData:
def _get_vlm2vec_prompt_image(query: Query, image_token: str): def _get_vlm2vec_prompt_image(query: Query, image_token: str):
if query["modality"] == "text": if query["modality"] == "text":
text = query["text"] text = query["text"]
prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501 prompt = f"Find me an everyday image that matches the given caption: {text}"
image = None image = None
elif query["modality"] == "image": elif query["modality"] == "image":
prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501 prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501

View File

@ -203,9 +203,9 @@ class Proxy:
async with session.post( async with session.post(
url=url, json=data, headers=headers url=url, json=data, headers=headers
) as response: ) as response:
if 200 <= response.status < 300 or 400 <= response.status < 500: # noqa: E501 if 200 <= response.status < 300 or 400 <= response.status < 500:
if use_chunked: if use_chunked:
async for chunk_bytes in response.content.iter_chunked( # noqa: E501 async for chunk_bytes in response.content.iter_chunked(
1024 1024
): ):
yield chunk_bytes yield chunk_bytes

View File

@ -56,52 +56,6 @@ include = ["vllm*"]
"vllm/third_party/**" = ["ALL"] "vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"] "vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"] "vllm/_version.py" = ["ALL"]
# TEMPORARY! These ignores will be fixed forward
## Line length violations
"csrc/cutlass_extensions/vllm_cutlass_library_extension.py" = ["E501"]
"tests/compile/piecewise/test_simple.py" = ["E501"]
"tests/compile/piecewise/test_toy_llama.py" = ["E501", "B023"]
"tests/entrypoints/conftest.py" = ["E501"]
"tests/entrypoints/openai/test_audio.py" = ["E501"]
"tests/entrypoints/openai/test_chat.py" = ["E501"]
"tests/entrypoints/openai/test_chat_template.py" = ["E501"]
"tests/entrypoints/openai/test_chat_with_tool_reasoning.py" = ["E501"]
"tests/entrypoints/openai/test_completion_with_function_calling.py" = ["E501"]
"tests/entrypoints/openai/test_video.py" = ["E501"]
"tests/entrypoints/openai/test_vision.py" = ["E501"]
"tests/entrypoints/test_chat_utils.py" = ["E501"]
"tests/kernels/moe/modular_kernel_tools/common.py" = ["E501"]
"tests/models/language/generation/test_gemma.py" = ["E501"]
"tests/models/language/generation/test_mistral.py" = ["E501"]
"tests/models/multimodal/generation/test_ultravox.py" = ["E501"]
"tests/models/multimodal/generation/test_voxtral.py" = ["E501"]
"tests/models/multimodal/generation/vlm_utils/custom_inputs.py" = ["E501"]
"tests/tool_use/test_tool_choice_required.py" = ["E501"]
"tests/v1/attention/utils.py" = ["E501"]
"tests/v1/entrypoints/openai/responses/test_image.py" = ["E501"]
"tests/v1/kv_connector/nixl_integration/test_accuracy.py" = ["E501"]
"tests/v1/kv_connector/unit/test_offloading_connector.py" = ["E501"]
"tests/v1/logits_processors/test_custom_offline.py" = ["E501"]
"vllm/attention/ops/pallas_kv_cache_update.py" = ["E501"]
"vllm/compilation/collective_fusion.py" = ["E501"]
"vllm/compilation/wrapper.py" = ["E501"]
"vllm/config/vllm.py" = ["E501"]
"vllm/distributed/device_communicators/all2all.py" = ["E501"]
"vllm/entrypoints/openai/protocol.py" = ["E501"]
"vllm/lora/layers/vocal_parallel_embedding.py" = ["E501"]
"vllm/model_executor/model_loader/bitsandbytes_loader.py" = ["E501"]
"vllm/model_executor/models/bailing_moe.py" = ["E501"]
"vllm/model_executor/models/hyperclovax_vision.py" = ["E501"]
"vllm/model_executor/models/llama4_eagle.py" = ["E501"]
"vllm/model_executor/models/longcat_flash_mtp.py" = ["E501"]
"vllm/model_executor/models/phi4mm.py" = ["E501"]
"vllm/model_executor/models/qwen3_next.py" = ["E501"]
"vllm/model_executor/layers/quantization/ptpc_fp8.py" = ["E501"]
"vllm/v1/attention/backends/mla/common.py" = ["E501"]
"vllm/v1/engine/utils.py" = ["E501"]
"vllm/v1/utils.py" = ["E501"]
"vllm/v1/worker/gpu_model_runner.py" = ["E501"]
# End of temporary ignores
[tool.ruff.lint] [tool.ruff.lint]
select = [ select = [

View File

@ -132,10 +132,14 @@ def test_simple_piecewise_compile(use_inductor):
splitting_ops=["silly.attention"], splitting_ops=["silly.attention"],
use_inductor_graph_partition=False, use_inductor_graph_partition=False,
use_inductor=use_inductor, use_inductor=use_inductor,
expected_num_piecewise_graphs_seen=5, # 2 * num_layers + 1 # 2 * num_layers + 1
expected_num_piecewise_capturable_graphs_seen=3, # 1 + num_layers expected_num_piecewise_graphs_seen=5,
expected_num_backend_compilations=3, # num_piecewise_capturable_graphs_seen # 1 + num_layers
expected_num_cudagraph_captured=6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen expected_num_piecewise_capturable_graphs_seen=3,
# num_piecewise_capturable_graphs_seen
expected_num_backend_compilations=3,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
expected_num_cudagraph_captured=6,
) )
@ -147,14 +151,16 @@ def test_simple_inductor_graph_partition(splitting_ops):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+") pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
_run_simple_model( _run_simple_model(
# inductor graph partition automatically resets splitting_ops # Inductor graph partition automatically resets splitting_ops to an empty list
# to be an empty list
splitting_ops=splitting_ops, splitting_ops=splitting_ops,
use_inductor_graph_partition=True, use_inductor_graph_partition=True,
use_inductor=True, use_inductor=True,
expected_num_piecewise_graphs_seen=1, # since not splitting at fx graph level # Since not splitting at fx graph level
expected_num_piecewise_capturable_graphs_seen=1, # since not splitting at fx graph level expected_num_piecewise_graphs_seen=1,
expected_num_backend_compilations=1, # since not splitting at fx graph level # Since not splitting at fx graph level
expected_num_cudagraph_captured=6, # inductor graph partition still captures 6 expected_num_piecewise_capturable_graphs_seen=1,
# graph, same as fx graph partition. # Since not splitting at fx graph level
expected_num_backend_compilations=1,
# Inductor graph partition still captures 6 graph, same as fx graph partition
expected_num_cudagraph_captured=6,
) )

View File

@ -367,11 +367,14 @@ def test_toy_llama(use_inductor: bool):
kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0} kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
with compilation_counter.expect( with compilation_counter.expect(
num_graphs_seen=1, # one graph for the model # One graph for the model
num_graphs_seen=1,
num_piecewise_graphs_seen=1, num_piecewise_graphs_seen=1,
num_piecewise_capturable_graphs_seen=1, num_piecewise_capturable_graphs_seen=1,
num_backend_compilations=1, # num_piecewise_capturable_graphs_seen # num_piecewise_capturable_graphs_seen
num_cudagraph_captured=2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen num_backend_compilations=1,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
num_cudagraph_captured=2,
**kwargs, **kwargs,
): ):
outputs.append( outputs.append(
@ -478,9 +481,10 @@ def benchmark():
# it is fine here, because we only use the lambda function once. # it is fine here, because we only use the lambda function once.
runtime = do_bench( runtime = do_bench(
lambda: graphs[b][0]( # noqa lambda: graphs[b][0]( # noqa
input_ids[:b], positions[:b] input_ids[:b], # noqa
positions[:b], # noqa
)
) )
) # noqa
piecewise_cudagraph_time[b] = runtime piecewise_cudagraph_time[b] = runtime
else: else:
runtime = do_bench(lambda: graphs[b][0].replay()) # noqa runtime = do_bench(lambda: graphs[b][0].replay()) # noqa

View File

@ -243,7 +243,7 @@ def test_fix_functionalization(model_class: torch.nn.Module, do_fusion: bool):
# check if the functionalization pass is applied # check if the functionalization pass is applied
for op in model.ops_in_model(do_fusion): for op in model.ops_in_model(do_fusion):
find_auto_fn(backend_no_func.graph_post_pass.nodes, op) find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None # noqa: E501 assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None
# make sure the ops were all de-functionalized # make sure the ops were all de-functionalized
found = dict() found = dict()

View File

@ -565,7 +565,7 @@ def test_attention_quant_pattern(
elif quant_key.dtype == FP4_DTYPE: elif quant_key.dtype == FP4_DTYPE:
assert attn_nodes_post[0].kwargs.get("output_block_scale") is not None, ( assert attn_nodes_post[0].kwargs.get("output_block_scale") is not None, (
"Attention should have output_block_scale after FP4 fusion" "Attention should have output_block_scale after FP4 fusion"
) # noqa: E501 )
# Check that results are close # Check that results are close
torch.testing.assert_close(result_unfused, result_fused_1, atol=1e-2, rtol=1e-2) torch.testing.assert_close(result_unfused, result_fused_1, atol=1e-2, rtol=1e-2)

View File

@ -186,7 +186,7 @@ class TestQuantModel(torch.nn.Module):
): ):
# If fusion happens, the fused op is the one # If fusion happens, the fused op is the one
# we check for (de)functionalization # we check for (de)functionalization
return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default] # noqa: E501 return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default]
else: else:
# If no fusion, the original ops are checked # If no fusion, the original ops are checked
return [ return [
@ -322,7 +322,7 @@ def sequence_parallelism_pass_on_test_model(
# check if the functionalization pass is applied # check if the functionalization pass is applied
for op in model.ops_in_model(): for op in model.ops_in_model():
find_auto_fn(backend_no_func.graph_post_pass.nodes, op) find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None # noqa: E501 assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None
# make sure the ops were all de-functionalized # make sure the ops were all de-functionalized
found = dict() found = dict()

View File

@ -104,7 +104,7 @@ TEXT_GENERATION_MODELS = {
# [Decoder-only] # [Decoder-only]
# Uses Llama # Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(), # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
"Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"), # noqa: E501 "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"),
"baichuan-inc/Baichuan-7B": PPTestSettings.fast(), "baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
"baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(), "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
"bigscience/bloomz-1b1": PPTestSettings.fast(), "bigscience/bloomz-1b1": PPTestSettings.fast(),
@ -138,7 +138,7 @@ TEXT_GENERATION_MODELS = {
# Uses Llama # Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(), # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
"state-spaces/mamba-130m-hf": PPTestSettings.fast(), "state-spaces/mamba-130m-hf": PPTestSettings.fast(),
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"), # noqa: E501 "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"),
"mosaicml/mpt-7b": PPTestSettings.fast(), "mosaicml/mpt-7b": PPTestSettings.fast(),
"nvidia/Minitron-8B-Base": PPTestSettings.fast(), "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
"allenai/OLMo-1B-hf": PPTestSettings.fast(), "allenai/OLMo-1B-hf": PPTestSettings.fast(),
@ -151,13 +151,13 @@ TEXT_GENERATION_MODELS = {
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(), "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed( "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(
multi_node_only=True, load_format="dummy" multi_node_only=True, load_format="dummy"
), # noqa: E501 ),
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(), "Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
"Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(), "Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(), "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
"stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(), "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
"bigcode/starcoder2-3b": PPTestSettings.fast(), "bigcode/starcoder2-3b": PPTestSettings.fast(),
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"), # noqa: E501 "upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"),
# FIXME: Cannot load tokenizer in latest transformers version. # FIXME: Cannot load tokenizer in latest transformers version.
# Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf` # Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(), # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),

View File

@ -83,7 +83,8 @@ def sample_complex_json_schema():
"type": "array", "type": "array",
"items": { "items": {
"type": "string", "type": "string",
"pattern": "^[a-z]{1,10}$", # Combining length and pattern restrictions # Combining length and pattern restrictions
"pattern": "^[a-z]{1,10}$",
}, },
}, },
}, },

View File

@ -145,7 +145,7 @@ async def test_single_chat_session_audio_base64encoded(
{ {
"type": "audio_url", "type": "audio_url",
"audio_url": { "audio_url": {
"url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" "url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" # noqa: E501
}, },
}, },
{"type": "text", "text": "What's happening in this audio?"}, {"type": "text", "text": "What's happening in this audio?"},

View File

@ -835,17 +835,18 @@ async def test_extra_fields_allowed(client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_complex_message_content(client: openai.AsyncOpenAI): async def test_complex_message_content(client: openai.AsyncOpenAI):
content = [
{
"type": "text",
"text": "what is 1+1? please provide the result without any other text.",
}
]
resp = await client.chat.completions.create( resp = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=[ messages=[
{ {
"role": "user", "role": "user",
"content": [ "content": content,
{
"type": "text",
"text": "what is 1+1? please provide the result without any other text.",
}
],
} }
], ],
temperature=0, temperature=0,

View File

@ -76,8 +76,8 @@ def test_load_chat_template():
assert ( assert (
template_content template_content
== """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %} == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" # noqa: E501
) # noqa: E501 )
def test_no_load_chat_template_filelike(): def test_no_load_chat_template_filelike():

View File

@ -45,12 +45,13 @@ TOOLS = [
"properties": { "properties": {
"city": { "city": {
"type": "string", "type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'", "description": "The city to find the weather for, e.g. "
"'San Francisco'",
}, },
"state": { "state": {
"type": "string", "type": "string",
"description": "the two-letter abbreviation for the state that the city is" "description": "the two-letter abbreviation for the state that "
" in, e.g. 'CA' which would mean 'California'", "the city is in, e.g. 'CA' which would mean 'California'",
}, },
"unit": { "unit": {
"type": "string", "type": "string",
@ -69,7 +70,8 @@ MESSAGES = [
{"role": "assistant", "content": "I'm doing well! How can I help you?"}, {"role": "assistant", "content": "I'm doing well! How can I help you?"},
{ {
"role": "user", "role": "user",
"content": "Can you tell me what the temperate will be in Dallas, in fahrenheit?", "content": "Can you tell me what the temperate will be in Dallas, "
"in fahrenheit?",
}, },
] ]

View File

@ -25,12 +25,14 @@ tools = [
"properties": { "properties": {
"city": { "city": {
"type": "string", "type": "string",
"description": "The city to find the weather for, e.g. 'Vienna'", "description": "The city to find the weather for, e.g. "
"'Vienna'",
"default": "Vienna", "default": "Vienna",
}, },
"country": { "country": {
"type": "string", "type": "string",
"description": "The country that the city is in, e.g. 'Austria'", "description": "The country that the city is in, e.g. "
"'Austria'",
}, },
"unit": { "unit": {
"type": "string", "type": "string",
@ -85,12 +87,14 @@ tools = [
"properties": { "properties": {
"city": { "city": {
"type": "string", "type": "string",
"description": "The city to get the forecast for, e.g. 'Vienna'", "description": "The city to get the forecast for, e.g. "
"'Vienna'",
"default": "Vienna", "default": "Vienna",
}, },
"country": { "country": {
"type": "string", "type": "string",
"description": "The country that the city is in, e.g. 'Austria'", "description": "The country that the city is in, e.g. "
"'Austria'",
}, },
"days": { "days": {
"type": "integer", "type": "integer",

View File

@ -179,7 +179,7 @@ async def test_single_chat_session_video_base64encoded(
{ {
"type": "video_url", "type": "video_url",
"video_url": { "video_url": {
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" "url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501
}, },
}, },
{"type": "text", "text": "What's in this video?"}, {"type": "text", "text": "What's in this video?"},
@ -238,7 +238,7 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
{ {
"type": "video_url", "type": "video_url",
"video_url": { "video_url": {
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" "url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501
}, },
}, },
{"type": "text", "text": "What's in this video?"}, {"type": "text", "text": "What's in this video?"},

View File

@ -233,7 +233,7 @@ async def test_single_chat_session_image_base64encoded(
{ {
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {
"url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" "url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501
}, },
}, },
{"type": "text", "text": content_text}, {"type": "text", "text": content_text},
@ -300,7 +300,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
{ {
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {
"url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" "url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501
}, },
}, },
{"type": "text", "text": "What's in this image?"}, {"type": "text", "text": "What's in this image?"},

View File

@ -947,7 +947,8 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
{"type": "image_url", "image_url": {"url": image_url}}, {"type": "image_url", "image_url": {"url": image_url}},
{ {
"type": "text", "type": "text",
"text": "What's in <|image_1|> and how does it compare to the other one?", # noqa: E501 "text": "What's in <|image_1|> and how does it compare to "
"the other one?",
}, },
], ],
} }
@ -960,8 +961,8 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
assert conversation == [ assert conversation == [
{ {
"role": "user", "role": "user",
"content": "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the " "content": "<|image_2|>\nWhat's in <|image_1|> and how does it compare to "
"other one?", "the other one?",
} }
] ]
_assert_mm_data_is_image_input(mm_data, 2) _assert_mm_data_is_image_input(mm_data, 2)
@ -1364,7 +1365,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave( # noqa: E501 def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave(
phi3v_model_config_mm_interleaved, phi3v_model_config_mm_interleaved,
phi3v_tokenizer, phi3v_tokenizer,
image_url, image_url,
@ -1451,14 +1452,14 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
assert conversation == [ assert conversation == [
{ {
"role": "user", "role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
}, },
{"role": "assistant", "content": "Some stuff."}, {"role": "assistant", "content": "Some stuff."},
{ {
"role": "user", "role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
}, },
] ]
@ -1468,7 +1469,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None]) _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave( # noqa: E501 def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave(
qwen25omni_model_config_mm_interleaved, qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer, qwen25omni_tokenizer,
image_url, image_url,
@ -1521,14 +1522,14 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
assert conversation == [ assert conversation == [
{ {
"role": "user", "role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
}, },
{"role": "assistant", "content": "Some stuff."}, {"role": "assistant", "content": "Some stuff."},
{ {
"role": "user", "role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
}, },
] ]
@ -1593,14 +1594,14 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
assert conversation == [ assert conversation == [
{ {
"role": "user", "role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
}, },
{"role": "assistant", "content": "Some stuff."}, {"role": "assistant", "content": "Some stuff."},
{ {
"role": "user", "role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
}, },
] ]
@ -1661,14 +1662,14 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
assert conversation == [ assert conversation == [
{ {
"role": "user", "role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
}, },
{"role": "assistant", "content": "Some stuff."}, {"role": "assistant", "content": "Some stuff."},
{ {
"role": "user", "role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
}, },
] ]
@ -2193,7 +2194,8 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
assert conversation == [ assert conversation == [
{ {
"role": "user", "role": "user",
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?", "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the "
"audio say?",
} }
] ]
_assert_mm_data_inputs(mm_data, {"audio": 1}) _assert_mm_data_inputs(mm_data, {"audio": 1})
@ -2228,7 +2230,8 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
assert conversation == [ assert conversation == [
{ {
"role": "user", "role": "user",
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?", "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the "
"audio say?",
} }
] ]
_assert_mm_data_inputs(await mm_future, {"audio": 1}) _assert_mm_data_inputs(await mm_future, {"audio": 1})

View File

@ -165,7 +165,7 @@ def test_env(
# FlashMLA only supports block_size == 64 # FlashMLA only supports block_size == 64
pytest.skip("FlashMLA only supports block_size 64") pytest.skip("FlashMLA only supports block_size 64")
else: else:
from vllm.v1.attention.backends.mla.flashmla import ( # noqa: E501 from vllm.v1.attention.backends.mla.flashmla import (
is_flashmla_supported, is_flashmla_supported,
) )

View File

@ -331,7 +331,8 @@ class WeightTensors:
in_dtype=config.dtype, in_dtype=config.dtype,
quant_dtype=config.quant_dtype, quant_dtype=config.quant_dtype,
block_shape=config.quant_block_shape, block_shape=config.quant_block_shape,
per_out_ch_quant=config.is_per_act_token_quant, # or config.is_per_out_ch_quant # or config.is_per_out_ch_quant
per_out_ch_quant=config.is_per_act_token_quant,
) )
return WeightTensors( return WeightTensors(
w1=w1, w2=w2, w1_scale=w1_scale, w2_scale=w2_scale, w1_gs=w1_gs, w2_gs=w2_gs w1=w1, w2=w2, w1_scale=w1_scale, w2_scale=w2_scale, w1_gs=w1_gs, w2_gs=w2_gs

View File

@ -124,7 +124,7 @@ def make_feature_matrix(csv_file_path: str):
results_df: Optional[pd.DataFrame] = None results_df: Optional[pd.DataFrame] = None
for m, k, n, e, topks, dtype, pf_type, experts_type, quant_config in tqdm( for m, k, n, e, topks, dtype, pf_type, experts_type, quant_config in tqdm(
combinations combinations
): # noqa: E501 ):
config = Config( config = Config(
Ms=[m], Ms=[m],
K=k, K=k,

View File

@ -10,7 +10,7 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
BatchedDeepGemmExperts, BatchedDeepGemmExperts,
) )
from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (
BatchedTritonOrDeepGemmExperts, BatchedTritonOrDeepGemmExperts,
) )
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
@ -196,10 +196,10 @@ register_experts(
# Disable on blackwell for now # Disable on blackwell for now
if has_deep_ep() and not current_platform.has_device_capability(100): if has_deep_ep() and not current_platform.has_device_capability(100):
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
DeepEPHTPrepareAndFinalize, DeepEPHTPrepareAndFinalize,
) )
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
DeepEPLLPrepareAndFinalize, DeepEPLLPrepareAndFinalize,
) )
@ -233,7 +233,7 @@ if has_pplx():
) )
if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100): if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
FlashInferExperts, FlashInferExperts,
) )
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501

View File

@ -17,10 +17,10 @@ from typing_extensions import Concatenate, ParamSpec
from vllm.utils import get_open_port, has_deep_ep from vllm.utils import get_open_port, has_deep_ep
if has_deep_ep(): if has_deep_ep():
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
DeepEPHTPrepareAndFinalize, DeepEPHTPrepareAndFinalize,
) )
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
DeepEPLLPrepareAndFinalize, DeepEPLLPrepareAndFinalize,
) )

View File

@ -30,10 +30,10 @@ from .parallel_utils import ProcessGroupInfo, parallel_launch
from .utils import make_test_weights from .utils import make_test_weights
if has_deep_ep(): if has_deep_ep():
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
DeepEPHTPrepareAndFinalize, DeepEPHTPrepareAndFinalize,
) )
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
DeepEPLLPrepareAndFinalize, DeepEPLLPrepareAndFinalize,
) )

View File

@ -28,10 +28,10 @@ from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch from .parallel_utils import ProcessGroupInfo, parallel_launch
if has_deep_ep(): if has_deep_ep():
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
DeepEPHTPrepareAndFinalize, DeepEPHTPrepareAndFinalize,
) )
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
DeepEPLLPrepareAndFinalize, DeepEPLLPrepareAndFinalize,
) )

View File

@ -271,7 +271,7 @@ if __name__ == "__main__":
parser = make_config_arg_parser( parser = make_config_arg_parser(
description=( description=(
"Run single prepare-finalize & fused-experts combination test" "Run single prepare-finalize & fused-experts combination test"
"Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations " # noqa: E501 "Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations "
"--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts" "--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
) )
) )

View File

@ -483,8 +483,8 @@ def test_mixtral_moe(
} }
if use_rocm_aiter: if use_rocm_aiter:
# The values of rtol and atol are set based on the tests in ROCM AITER package. # noqa: E501 # The values of rtol and atol are set based on the tests in ROCM AITER package.
# https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174 # noqa: E501 # https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174
torch.testing.assert_close( torch.testing.assert_close(
hf_states.flatten(0, 1), vllm_states, rtol=0.01, atol=100 hf_states.flatten(0, 1), vllm_states, rtol=0.01, atol=100
) )

View File

@ -10,11 +10,11 @@ import pytest
import torch import torch
from packaging import version from packaging import version
from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501 from vllm.model_executor.layers.quantization.quark.quark import (
QuarkLinearMethod, QuarkLinearMethod,
QuarkW4A4MXFP4, QuarkW4A4MXFP4,
) )
from vllm.model_executor.layers.quantization.quark.quark_moe import ( # noqa: E501 from vllm.model_executor.layers.quantization.quark.quark_moe import (
QuarkW4A4MXFp4MoEMethod, QuarkW4A4MXFp4MoEMethod,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform

View File

@ -12,7 +12,7 @@ PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example
EXPECTED_LORA_OUTPUT = [ EXPECTED_LORA_OUTPUT = [
"SELECT count(*) FROM singer", "SELECT count(*) FROM singer",
"SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", # noqa: E501 "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'",
"SELECT name , country , age FROM singer ORDER BY age", "SELECT name , country , age FROM singer ORDER BY age",
] ]
@ -21,10 +21,16 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [ prompts = [
PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format(query="How many singers do we have?"),
PROMPT_TEMPLATE.format( PROMPT_TEMPLATE.format(
query="What is the average, minimum, and maximum age of all singers from France?" # noqa: E501 query=(
"What is the average, minimum, and maximum "
"age of all singers from France?"
)
), ),
PROMPT_TEMPLATE.format( PROMPT_TEMPLATE.format(
query="Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501 query=(
"Show name, country, age for all singers ordered "
"by age from the oldest to the youngest."
)
), ),
] ]
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)

View File

@ -15,10 +15,10 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf"
EXPECTED_LORA_OUTPUT = [ EXPECTED_LORA_OUTPUT = [
" SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501
" SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",
" SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501
" SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501
" SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",
" SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ", # noqa: E501 " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ", # noqa: E501
] ]

View File

@ -26,7 +26,7 @@ LORA_RANK = 8
LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"] LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"]
LORA_TEST_EXPECTED = [ LORA_TEST_EXPECTED = [
"GitHub is an open-source platform that provides a way to manage and develop software projects. It allows developers to store and manage code, collaborate on projects, and automate tasks.", # noqa: E501 "GitHub is an open-source platform that provides a way to manage and develop software projects. It allows developers to store and manage code, collaborate on projects, and automate tasks.", # noqa: E501
"I am Alice, an AI assistant developed by GitHub/Charent.", # noqa: E501 "I am Alice, an AI assistant developed by GitHub/Charent.",
] ]

View File

@ -16,7 +16,7 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
) as llm: ) as llm:
if model == "google/gemma-3-4b-it": if model == "google/gemma-3-4b-it":
normalizers = llm.llm.collective_rpc( normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item() lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item() # noqa: E501
) )
config = llm.llm.llm_engine.model_config.hf_config.text_config config = llm.llm.llm_engine.model_config.hf_config.text_config
else: else:

View File

@ -46,12 +46,13 @@ TOOLS = [
"properties": { "properties": {
"city": { "city": {
"type": "string", "type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'", "description": "The city to find the weather for, e.g. "
"'San Francisco'",
}, },
"state": { "state": {
"type": "string", "type": "string",
"description": "the two-letter abbreviation for the state that the city is" "description": "the two-letter abbreviation for the state that "
" in, e.g. 'CA' which would mean 'California'", "the city is in, e.g. 'CA' which would mean 'California'",
}, },
"unit": { "unit": {
"type": "string", "type": "string",
@ -85,7 +86,8 @@ MSGS = [
{"role": "system", "content": "You are an assistant."}, {"role": "system", "content": "You are an assistant."},
{ {
"role": "user", "role": "user",
"content": "Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors.", # noqa "content": "Could you please rewrite the below article? \n\n My English needs "
"improvving, maybe I make errors.",
}, },
{ {
"role": "assistant", "role": "assistant",
@ -96,14 +98,16 @@ MSGS = [
"type": "function", "type": "function",
"function": { "function": {
"name": "rewrite", "name": "rewrite",
"arguments": '{"text":"My English needs improvving, maybe I make errors."}', # noqa "arguments": '{"text":"My English needs improvving, maybe '
'I make errors."}',
}, },
} }
], ],
}, },
{ {
"role": "tool", "role": "tool",
"content": '{"action":"rewrite","outcome":"My English needs improving, maybe I make errors."}', # noqa "content": '{"action":"rewrite","outcome":"My English needs improving, maybe '
'I make errors."}',
"tool_call_id": "bbc5b7ede", "tool_call_id": "bbc5b7ede",
"name": "rewrite", "name": "rewrite",
}, },

View File

@ -130,14 +130,14 @@ VLM_TEST_SETTINGS = {
dtype="bfloat16", dtype="bfloat16",
marks=[ marks=[
pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask") pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
], # noqa: E501 ],
), ),
"qwen2_5_vl": VLMTestInfo( "qwen2_5_vl": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"], models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
@ -149,8 +149,8 @@ VLM_TEST_SETTINGS = {
models=["Qwen/Qwen2.5-Omni-3B"], models=["Qwen/Qwen2.5-Omni-3B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501 img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>",
video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501 video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
num_logprobs=6 if current_platform.is_cpu() else 5, num_logprobs=6 if current_platform.is_cpu() else 5,
@ -181,7 +181,7 @@ VLM_TEST_SETTINGS = {
max_model_len=16384, max_model_len=16384,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs( hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf" "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
), # noqa: E501 ),
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
image_size_factors=[(0.25, 0.5, 1.0)], image_size_factors=[(0.25, 0.5, 1.0)],
@ -213,7 +213,7 @@ VLM_TEST_SETTINGS = {
models=["Qwen/Qwen2.5-VL-3B-Instruct"], models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
@ -237,10 +237,10 @@ VLM_TEST_SETTINGS = {
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<vlm_image>Please describe the image shortly.", "stop_sign": "<vlm_image>Please describe the image shortly.",
"cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501 "cherry_blossom": "<vlm_image>Please infer the season with reason.",
} }
), ),
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501 multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",
stop_str=["<|im_end|>"], stop_str=["<|im_end|>"],
image_size_factors=[(0.10, 0.15)], image_size_factors=[(0.10, 0.15)],
max_tokens=64, max_tokens=64,
@ -252,11 +252,11 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501 "stop_sign": "<image>What's the content in the center of the image?",
"cherry_blossom": "<image>What is the season?", # noqa: E501 "cherry_blossom": "<image>What is the season?",
} }
), ),
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501 multi_image_prompt="<image><image>Describe the two images in detail.",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
@ -268,11 +268,11 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501 "stop_sign": "<image>What's the content in the center of the image?",
"cherry_blossom": "<image>What is the season?", # noqa: E501 "cherry_blossom": "<image>What is the season?",
} }
), ),
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501 multi_image_prompt="<image><image>Describe the two images in detail.",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
@ -311,14 +311,14 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2, max_num_seqs=2,
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501 "stop_sign": "<image>\nWhat's the content in the center of the image?",
"cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501 "cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501
} }
), ),
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501 multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner, patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output, hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
stop_str=["<end▁of▁sentence>", "<begin▁of▁sentence>"], # noqa: E501 stop_str=["<end▁of▁sentence>", "<begin▁of▁sentence>"],
image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
), ),
"fuyu": VLMTestInfo( "fuyu": VLMTestInfo(
@ -342,7 +342,7 @@ VLM_TEST_SETTINGS = {
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501 "stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<start_of_image>What is the season?", # noqa: E501 "cherry_blossom": "<start_of_image>What is the season?",
} }
), ),
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501 multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
@ -356,7 +356,7 @@ VLM_TEST_SETTINGS = {
"glm4v": VLMTestInfo( "glm4v": VLMTestInfo(
models=["zai-org/glm-4v-9b"], models=["zai-org/glm-4v-9b"],
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501 "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
@ -377,9 +377,9 @@ VLM_TEST_SETTINGS = {
"glm4_1v": VLMTestInfo( "glm4_1v": VLMTestInfo(
models=["zai-org/GLM-4.1V-9B-Thinking"], models=["zai-org/GLM-4.1V-9B-Thinking"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501 img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501 video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
max_model_len=2048, max_model_len=2048,
max_num_seqs=2, max_num_seqs=2,
get_stop_token_ids=lambda tok: [151329, 151336, 151338], get_stop_token_ids=lambda tok: [151329, 151336, 151338],
@ -410,10 +410,10 @@ VLM_TEST_SETTINGS = {
"h2oai/h2ovl-mississippi-2b", "h2oai/h2ovl-mississippi-2b",
], ],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>",
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501 "stop_sign": "<image>\nWhat's the content in the center of the image?",
"cherry_blossom": "<image>\nWhat is the season?", "cherry_blossom": "<image>\nWhat is the season?",
} }
), ),
@ -444,7 +444,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501 "stop_sign": "<image>\nWhat's the content in the center of the image?",
"cherry_blossom": "<image>\nWhat is the season?", "cherry_blossom": "<image>\nWhat is the season?",
} }
), ),
@ -529,7 +529,7 @@ VLM_TEST_SETTINGS = {
max_model_len=16384, max_model_len=16384,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs( hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf" "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
), # noqa: E501 ),
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[ custom_test_opts=[
@ -583,7 +583,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2, max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids( get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
["<|im_end|>", "<|endoftext|>"] ["<|im_end|>", "<|endoftext|>"]
), # noqa: E501 ),
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner, patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
# FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49 # FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
@ -598,7 +598,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2, max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids( get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
["<|im_end|>", "<|endoftext|>"] ["<|im_end|>", "<|endoftext|>"]
), # noqa: E501 ),
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner, patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
), ),
@ -627,7 +627,7 @@ VLM_TEST_SETTINGS = {
models=["AIDC-AI/Ovis1.6-Gemma2-9B"], models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501 img_idx_to_prompt=lambda idx: "<image>\n",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
dtype="half", dtype="half",
@ -640,7 +640,7 @@ VLM_TEST_SETTINGS = {
models=["AIDC-AI/Ovis2-1B"], models=["AIDC-AI/Ovis2-1B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501 img_idx_to_prompt=lambda idx: "<image>\n",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
dtype="half", dtype="half",
@ -652,7 +652,7 @@ VLM_TEST_SETTINGS = {
models=["AIDC-AI/Ovis2.5-2B"], models=["AIDC-AI/Ovis2.5-2B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501 img_idx_to_prompt=lambda idx: "<image>\n",
video_idx_to_prompt=lambda idx: "<video>\n", video_idx_to_prompt=lambda idx: "<video>\n",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
@ -701,8 +701,8 @@ VLM_TEST_SETTINGS = {
models=["Qwen/Qwen2-VL-2B-Instruct"], models=["Qwen/Qwen2-VL-2B-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501 multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
@ -717,11 +717,11 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"<begin▁of▁sentence><User>\n{img_prompt}<Assistant><think>\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<begin▁of▁sentence><User>\n{img_prompt}<Assistant><think>\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501 "stop_sign": "<image>\nWhat's the content in the center of the image?",
"cherry_blossom": "<image>\nWhat is the season?", "cherry_blossom": "<image>\nWhat is the season?",
} }
), ),
multi_image_prompt="<image>\n<image>\nDescribe the two images in short.", # noqa: E501 multi_image_prompt="<image>\n<image>\nDescribe the two images in short.",
max_model_len=4096, max_model_len=4096,
use_tokenizer_eos=True, use_tokenizer_eos=True,
patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner, patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
@ -754,8 +754,8 @@ VLM_TEST_SETTINGS = {
VLMTestType.VIDEO, VLMTestType.VIDEO,
), ),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
@ -816,7 +816,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs( hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf" "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
), # noqa: E501 ),
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[ custom_test_opts=[
CustomTestOptions( CustomTestOptions(

View File

@ -170,7 +170,7 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
], ],
{ {
"type": "text", "type": "text",
"text": f"What's happening in these {len(audio_assets)} audio clips?", "text": f"What's happening in these {len(audio_assets)} audio clips?", # noqa: E501
}, },
], ],
} }

View File

@ -101,16 +101,11 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
return audio_dict return audio_dict
audio_chunks = [asset_to_chunk(asset) for asset in audio_assets] audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
text = f"What's happening in these {len(audio_assets)} audio clips?"
messages = [ messages = [
{ {
"role": "user", "role": "user",
"content": [ "content": [*audio_chunks, {"type": "text", "text": text}],
*audio_chunks,
{
"type": "text",
"text": f"What's happening in these {len(audio_assets)} audio clips?",
},
],
} }
] ]

View File

@ -102,8 +102,8 @@ def multi_video_multi_aspect_ratio_inputs(
def different_patch_input_cases_internvl(): def different_patch_input_cases_internvl():
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS] images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
formatter = ( formatter = (
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
) # noqa: E501 )
single_img_prompts = [ single_img_prompts = [
"<image>\nWhat's the content in the center of the image?", "<image>\nWhat's the content in the center of the image?",
"<image>\nWhat is the season?", "<image>\nWhat is the season?",

View File

@ -47,7 +47,8 @@ EXAMPLE_TOOLS = [
"properties": { "properties": {
"city": { "city": {
"type": "string", "type": "string",
"description": "The city to get the forecast for, e.g. 'New York'", "description": "The city to get the forecast for, e.g. "
"'New York'",
}, },
"days": { "days": {
"type": "integer", "type": "integer",

View File

@ -134,15 +134,15 @@ def get_attention_backend(backend_name: _Backend):
else "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend" else "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
), ),
_Backend.FLASHINFER: "vllm.v1.attention.backends.flashinfer.FlashInferBackend", _Backend.FLASHINFER: "vllm.v1.attention.backends.flashinfer.FlashInferBackend",
_Backend.FLEX_ATTENTION: "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend", _Backend.FLEX_ATTENTION: "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend", # noqa: E501
_Backend.TRITON_ATTN: "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend", _Backend.TRITON_ATTN: "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend", # noqa: E501
_Backend.TREE_ATTN: "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend", _Backend.TREE_ATTN: "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend",
_Backend.XFORMERS: "vllm.v1.attention.backends.xformers.XFormersAttentionBackend", _Backend.XFORMERS: "vllm.v1.attention.backends.xformers.XFormersAttentionBackend", # noqa: E501
_Backend.CUTLASS_MLA: "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend", _Backend.CUTLASS_MLA: "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend", # noqa: E501
_Backend.FLASHMLA: "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend", _Backend.FLASHMLA: "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend",
_Backend.FLASH_ATTN_MLA: "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend", _Backend.FLASH_ATTN_MLA: "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend", # noqa: E501
_Backend.FLASHINFER_MLA: "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend", _Backend.FLASHINFER_MLA: "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend", # noqa: E501
_Backend.TRITON_MLA: "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend", _Backend.TRITON_MLA: "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend", # noqa: E501
} }
if backend_name not in backend_map: if backend_name not in backend_map:
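Note on the hunk above: with the per-file ignore gone, the suppressions move onto the individual dictionary entries that actually exceed the limit. A contrived sketch of the placement rule, with made-up module paths; ruff, like flake8, applies `# noqa: E501` only to the physical line it sits on, so a suppression on a neighbouring line has no effect.

# Only the second entry is over-long, so only it carries the suppression.
BACKEND_MAP = {
    "short": "pkg.module.ShortBackend",
    "long": "some.package.with.a.very.long.module.path.attention.backends.LongBackendClassNameThatOverflowsTheLimit",  # noqa: E501
}

assert BACKEND_MAP["long"].endswith("ClassNameThatOverflowsTheLimit")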

View File

@ -104,7 +104,7 @@ async def test_single_chat_session_image_base64encoded(
"content": [ "content": [
{ {
"type": "input_image", "type": "input_image",
"image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", "image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", # noqa: E501
"detail": "auto", "detail": "auto",
}, },
{"type": "input_text", "text": content_text}, {"type": "input_text", "text": content_text},

View File

@ -15,8 +15,9 @@ RTOL = 0.03
EXPECTED_VALUES = {"Qwen/Qwen3-0.6B": 0.41, "deepseek-ai/deepseek-vl2-small": 0.59} EXPECTED_VALUES = {"Qwen/Qwen3-0.6B": 0.41, "deepseek-ai/deepseek-vl2-small": 0.59}
SIMPLE_PROMPT = ( SIMPLE_PROMPT = (
"The best part about working on vLLM is that I got to meet so many people across various different organizations like UCB, Google, and Meta which means", "The best part about working on vLLM is that I got to meet so many people across "
) # noqa: E501 "various different organizations like UCB, Google, and Meta which means",
)
# Get model name from environment variable # Get model name from environment variable
MODEL_NAME = os.environ.get("TEST_MODEL", "Qwen/Qwen3-0.6B") MODEL_NAME = os.environ.get("TEST_MODEL", "Qwen/Qwen3-0.6B")
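The hunk above shows the most common fix in this commit: an over-long string literal whose `# noqa: E501` sat on the closing parenthesis (which does not cover the long line itself) is split across adjacent literals that the parser concatenates back into one string. A minimal sketch of the pattern with a hypothetical constant name; note that the real code keeps the trailing comma inside the parentheses, so SIMPLE_PROMPT remains a one-element tuple in both the old and new versions.

# Adjacent string literals are concatenated at parse time, so the wrapped
# form builds exactly the same string as the original one-liner, with no
# line-length suppression needed.
EXAMPLE_PROMPT = (
    "The easiest way to stay under the line-length limit is to break the "
    "literal into adjacent pieces that Python joins back together."
)

assert "line-length limit" in EXAMPLE_PROMPT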

View File

@ -127,7 +127,7 @@ class RequestRunner:
kv_role="kv_both", kv_role="kv_both",
kv_connector_extra_config={ kv_connector_extra_config={
"spec_name": "MockOffloadingSpec", "spec_name": "MockOffloadingSpec",
"spec_module_path": "tests.v1.kv_connector.unit.test_offloading_connector", "spec_module_path": "tests.v1.kv_connector.unit.test_offloading_connector", # noqa: E501
"block_size": offloaded_block_size, "block_size": offloaded_block_size,
}, },
) )

View File

@ -260,15 +260,8 @@ def test_pooling_rejects_custom_logitsprocs(
gpu_memory_utilization=0.1, gpu_memory_utilization=0.1,
) )
# Require that no logitsprocs have been loaded # Require that no logitsprocs have been loaded
assert ( worker = llm.llm_engine.model_executor.driver_worker.worker
sum( assert sum([1 for _ in worker.model_runner.input_batch.logitsprocs.all]) == 0
[
1
for _ in llm.llm_engine.model_executor.driver_worker.worker.model_runner.input_batch.logitsprocs.all
]
)
== 0
)
return return
kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {} kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {}
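The hunk above replaces a multi-line assertion over a deeply chained attribute access with a local binding so the check fits on one line. A self-contained sketch of the same idea; the attribute names mirror the hunk, but the objects are SimpleNamespace stand-ins, not real vLLM classes.

from types import SimpleNamespace

# Stand-ins for engine -> executor -> driver worker -> model runner.
engine = SimpleNamespace(
    model_executor=SimpleNamespace(
        driver_worker=SimpleNamespace(
            worker=SimpleNamespace(
                model_runner=SimpleNamespace(
                    logitsprocs=SimpleNamespace(all=[])
                )
            )
        )
    )
)

# Bind the worker once; the assertion then stays well under the limit.
worker = engine.model_executor.driver_worker.worker
assert sum(1 for _ in worker.model_runner.logitsprocs.all) == 0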

View File

@ -76,10 +76,14 @@ def _kv_cache_update_kernel(
static_argnames=["page_size", "num_slices_per_block"], static_argnames=["page_size", "num_slices_per_block"],
) )
def kv_cache_update( def kv_cache_update(
new_kv: jax.Array, # [total_num_token, num_combined_kv_heads, head_dim] # [total_num_token, num_combined_kv_heads, head_dim]
slices: jax.Array, # [3, slices], list of (kv_cache_start, new_kv_start, slice_len) new_kv: jax.Array,
kv_cache: jax.Array, # [total_num_pages * page_size, num_combined_kv_heads, head_dim] # [3, slices], list of (kv_cache_start, new_kv_start, slice_len)
num_kv_update_slices: jax.Array, # [1] slices: jax.Array,
# [total_num_pages * page_size, num_combined_kv_heads, head_dim]
kv_cache: jax.Array,
# [1]
num_kv_update_slices: jax.Array,
*, *,
page_size: int = 32, page_size: int = 32,
num_slices_per_block: int = 8, num_slices_per_block: int = 8,
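The signature above moves each trailing shape comment onto its own line above the parameter it annotates, keeping every line short without suppressions. A generic sketch of that layout under stated assumptions: the function body, parameter names, and shapes are illustrative only, not the real kernel.

import numpy as np


def toy_kv_cache_update(
    # [total_num_tokens, num_combined_kv_heads, head_dim]
    new_kv: np.ndarray,
    # [3, num_slices]: (kv_cache_start, new_kv_start, slice_len) per slice
    slices: np.ndarray,
    # [total_num_pages * page_size, num_combined_kv_heads, head_dim]
    kv_cache: np.ndarray,
    *,
    page_size: int = 32,
) -> np.ndarray:
    """Toy stand-in: returns the cache unchanged, only the layout matters."""
    del new_kv, slices, page_size  # unused in this sketch
    return kv_cache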

View File

@ -834,7 +834,10 @@ class AllReduceFusedRMSNormStaticQuantFP8Pattern(BasePattern):
scale_out=None, scale_out=None,
rms_gamma=weight, rms_gamma=weight,
rms_eps=self.epsilon, rms_eps=self.epsilon,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant, # we don't use norm_out afterwards # We don't use norm_out afterwards
pattern_code=(
flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant
),
scale_factor=scale, scale_factor=scale,
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(), **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
) )
@ -928,11 +931,14 @@ class AllReduceFusedAddRMSNormStaticQuantFP8Pattern(BasePattern):
scale_out=None, scale_out=None,
rms_gamma=weight, rms_gamma=weight,
rms_eps=self.epsilon, rms_eps=self.epsilon,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant, # we don't use norm_out afterwards # We don't use norm_out afterwards
pattern_code=(
flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant
),
scale_factor=scale, scale_factor=scale,
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(), **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
) )
# # quant_out, rms_norm_residual # quant_out, rms_norm_residual
return allreduce[4], allreduce[2] return allreduce[4], allreduce[2]
pm.register_replacement( pm.register_replacement(
@ -1028,7 +1034,10 @@ class AllReduceFusedRMSNormStaticQuantNVFP4Pattern(BasePattern):
scale_out=output_scale, scale_out=output_scale,
rms_gamma=weight, rms_gamma=weight,
rms_eps=self.epsilon, rms_eps=self.epsilon,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant, # we don't use norm_out afterwards # We don't use norm_out afterwards
pattern_code=(
flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant
),
scale_factor=input_global_scale, scale_factor=input_global_scale,
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(), **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
) )
@ -1130,7 +1139,10 @@ class AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(BasePattern):
scale_out=output_scale, scale_out=output_scale,
rms_gamma=weight, rms_gamma=weight,
rms_eps=self.epsilon, rms_eps=self.epsilon,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant, # we don't use norm_out afterwards # We don't use norm_out afterwards
pattern_code=(
flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant
),
scale_factor=input_global_scale, scale_factor=input_global_scale,
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(), **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
) )

View File

@ -119,9 +119,12 @@ class TorchCompileWrapperWithCustomDispatcher:
src = depyf.decompile(new_code) src = depyf.decompile(new_code)
msg = ( msg = (
"Assigning / modifying buffers of nn.Module during forward pass is not allowed when using cudagraph inside the compiler because it will cause silent errors. Please use eager mode or fix the code. The following code contains clues about which buffer is being modified (please search for the usage of the function `update`):\n" "Assigning / modifying buffers of nn.Module during forward pass is not "
+ src "allowed when using cudagraph inside the compiler because it will "
) # noqa "cause silent errors. Please use eager mode or fix the code. The "
"following code contains clues about which buffer is being modified "
f"(please search for the usage of the function `update`):\n{src}"
)
raise RuntimeError(msg) raise RuntimeError(msg)
@contextmanager @contextmanager
@ -132,8 +135,9 @@ class TorchCompileWrapperWithCustomDispatcher:
variables as the original code. Therefore we can directly switch variables as the original code. Therefore we can directly switch
the code object in the function and call it. the code object in the function and call it.
See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7 for more details. See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7
""" # noqa for more details.
"""
self.__class__.forward.__code__ = self.compiled_codes[index] self.__class__.forward.__code__ = self.compiled_codes[index]
yield yield
self.__class__.forward.__code__ = self.original_code_object self.__class__.forward.__code__ = self.original_code_object

View File

@ -472,7 +472,7 @@ class VllmConfig:
self.compilation_config.cudagraph_mode.has_full_cudagraphs() self.compilation_config.cudagraph_mode.has_full_cudagraphs()
and self.model_config is not None and self.model_config is not None
and not self.model_config.disable_cascade_attn and not self.model_config.disable_cascade_attn
and not self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs() and not self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs() # noqa: E501
): ):
logger.warning_once( logger.warning_once(
"No piecewise cudagraph for executing cascade attention." "No piecewise cudagraph for executing cascade attention."

View File

@ -147,8 +147,9 @@ class PPLXAll2AllManager(All2AllManagerBase):
def __init__(self, cpu_group): def __init__(self, cpu_group):
assert has_pplx(), ( assert has_pplx(), (
"pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels." "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
) # noqa " to install pplx_kernels."
)
super().__init__(cpu_group) super().__init__(cpu_group)
if self.internode: if self.internode:
@ -220,7 +221,8 @@ class DeepEPAll2AllManagerBase(All2AllManagerBase):
def __init__(self, cpu_group): def __init__(self, cpu_group):
assert has_deep_ep(), ( assert has_deep_ep(), (
"DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install DeepEP kernels." "DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
" to install DeepEP kernels."
) # noqa ) # noqa
super().__init__(cpu_group) super().__init__(cpu_group)
self.handle_cache = Cache() self.handle_cache = Cache()

View File

@ -471,7 +471,8 @@ class ChatCompletionRequest(OpenAIBaseModel):
top_logprobs: Optional[int] = 0 top_logprobs: Optional[int] = 0
max_tokens: Optional[int] = Field( max_tokens: Optional[int] = Field(
default=None, default=None,
deprecated="max_tokens is deprecated in favor of the max_completion_tokens field", deprecated="max_tokens is deprecated in favor of "
"the max_completion_tokens field",
) )
max_completion_tokens: Optional[int] = None max_completion_tokens: Optional[int] = None
n: Optional[int] = 1 n: Optional[int] = 1

View File

@ -31,7 +31,7 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
if self.base_layer.num_added_embeddings_per_partition > 0: if self.base_layer.num_added_embeddings_per_partition > 0:
# We can start adding lora weights # We can start adding lora weights
self.embeddings_weights = self.base_layer.weight.data[ self.embeddings_weights = self.base_layer.weight.data[
self.base_layer.num_org_embeddings_per_partition : self.base_layer.num_org_embeddings_per_partition self.base_layer.num_org_embeddings_per_partition : self.base_layer.num_org_embeddings_per_partition # noqa: E501
+ self.base_layer.num_added_embeddings_per_partition + self.base_layer.num_added_embeddings_per_partition
] ]
self.embeddings_slice = ( self.embeddings_slice = (

View File

@ -107,8 +107,8 @@ class PTPCFp8LinearMethod(Fp8LinearMethod):
layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False) layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
assert layer.weight.data.dtype == torch.bfloat16, ( assert layer.weight.data.dtype == torch.bfloat16, (
f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified." f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified." # noqa: E501
) # noqa: E501 )
# Quantize the weights. # Quantize the weights.
qweight, weight_scale = ops.scaled_fp8_quant( qweight, weight_scale = ops.scaled_fp8_quant(
layer.weight, scale=None, use_per_token_if_dynamic=True layer.weight, scale=None, use_per_token_if_dynamic=True

View File

@ -391,7 +391,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
total_shard_sizes = next( total_shard_sizes = next(
( (
sizes sizes
for module, sizes in self.maybe_fused_weights_modules.items() for module, sizes in self.maybe_fused_weights_modules.items() # noqa: E501
if check_match(mapped_weight_name, module) if check_match(mapped_weight_name, module)
) )
) )

View File

@ -270,8 +270,8 @@ class BailingMoE(nn.Module):
) or ( ) or (
self.score_function == "sigmoid" and self.correction_bias is not None self.score_function == "sigmoid" and self.correction_bias is not None
), ( ), (
"score_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)" "score_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)" # noqa: E501
) # noqa: E501 )
else: else:
# default value for scoring_func # default value for scoring_func
self.score_function = "softmax" self.score_function = "softmax"

View File

@ -825,10 +825,10 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
# Run MM-Projector # Run MM-Projector
# len(num_grids) == len(num_queries_vis_abstractors) + 1 # len(num_grids) == len(num_queries_vis_abstractors) + 1
grid_idx = 0 grid_idx = 0
num_grids = [ # e.g. [0, 9, 18, 19, 27, 28, 36, 37, 45, 46, 54, 55, 56]
grid_idx num_grids = [grid_idx]
] # e.g. [0, 9, 18, 19, 27, 28, 36, 37, 45, 46, 54, 55, 56] # e.g. [81, 81, 81, 9, 81, 9, 81, 9, 81, 9, 81, 9]
num_queries_vis_abstractors = [] # e.g. [81, 81, 81, 9, 81, 9, 81, 9, 81, 9, 81, 9] num_queries_vis_abstractors = []
len_total_frames = video_forward_outs.shape[0] len_total_frames = video_forward_outs.shape[0]
if self.config.first_last_frames_slow: if self.config.first_last_frames_slow:

View File

@ -154,9 +154,10 @@ class LlamaModel(nn.Module):
str(layer_index), str(layer_index + start_layer_id) str(layer_index), str(layer_index + start_layer_id)
) )
quant_config.torchao_config.module_fqn_to_config = { torchao_config = quant_config.torchao_config
torchao_config.module_fqn_to_config = {
pad_layer_name(layer): quantization pad_layer_name(layer): quantization
for layer, quantization in quant_config.torchao_config.module_fqn_to_config.items() for layer, quantization in torchao_config.module_fqn_to_config.items()
} }

View File

@ -186,26 +186,26 @@ class LongCatFlashMTP(nn.Module, SupportsPP):
"model.mtp.layers.0.eh_proj.weight_scale_inv": "eh_proj.weight_scale_inv", "model.mtp.layers.0.eh_proj.weight_scale_inv": "eh_proj.weight_scale_inv",
"model.mtp.layers.0.enorm.m.weight": "enorm.weight", "model.mtp.layers.0.enorm.m.weight": "enorm.weight",
"model.mtp.layers.0.hnorm.m.weight": "hnorm.weight", "model.mtp.layers.0.hnorm.m.weight": "hnorm.weight",
"model.mtp.layers.0.input_layernorm.weight": "model.layers.0.input_layernorm.weight", "model.mtp.layers.0.input_layernorm.weight": "model.layers.0.input_layernorm.weight", # noqa: E501
"model.mtp.layers.0.post_attention_layernorm.weight": "model.layers.0.post_attention_layernorm.weight", "model.mtp.layers.0.post_attention_layernorm.weight": "model.layers.0.post_attention_layernorm.weight", # noqa: E501
"model.mtp.layers.0.self_attn.kv_a_layernorm.weight": "model.layers.0.self_attn.kv_a_layernorm.weight", "model.mtp.layers.0.self_attn.kv_a_layernorm.weight": "model.layers.0.self_attn.kv_a_layernorm.weight", # noqa: E501
"model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight", "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight", # noqa: E501
"model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv", "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.self_attn.kv_b_proj.weight": "model.layers.0.self_attn.kv_b_proj.weight", "model.mtp.layers.0.self_attn.kv_b_proj.weight": "model.layers.0.self_attn.kv_b_proj.weight", # noqa: E501
"model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_inv": "model.layers.0.self_attn.kv_b_proj.weight_scale_inv", "model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_inv": "model.layers.0.self_attn.kv_b_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.self_attn.o_proj.weight": "model.layers.0.self_attn.o_proj.weight", "model.mtp.layers.0.self_attn.o_proj.weight": "model.layers.0.self_attn.o_proj.weight", # noqa: E501
"model.mtp.layers.0.self_attn.o_proj.weight_scale_inv": "model.layers.0.self_attn.o_proj.weight_scale_inv", "model.mtp.layers.0.self_attn.o_proj.weight_scale_inv": "model.layers.0.self_attn.o_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.self_attn.q_a_layernorm.weight": "model.layers.0.self_attn.q_a_layernorm.weight", "model.mtp.layers.0.self_attn.q_a_layernorm.weight": "model.layers.0.self_attn.q_a_layernorm.weight", # noqa: E501
"model.mtp.layers.0.self_attn.q_a_proj.weight": "model.layers.0.self_attn.q_a_proj.weight", "model.mtp.layers.0.self_attn.q_a_proj.weight": "model.layers.0.self_attn.q_a_proj.weight", # noqa: E501
"model.mtp.layers.0.self_attn.q_a_proj.weight_scale_inv": "model.layers.0.self_attn.q_a_proj.weight_scale_inv", "model.mtp.layers.0.self_attn.q_a_proj.weight_scale_inv": "model.layers.0.self_attn.q_a_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.self_attn.q_b_proj.weight": "model.layers.0.self_attn.q_b_proj.weight", "model.mtp.layers.0.self_attn.q_b_proj.weight": "model.layers.0.self_attn.q_b_proj.weight", # noqa: E501
"model.mtp.layers.0.self_attn.q_b_proj.weight_scale_inv": "model.layers.0.self_attn.q_b_proj.weight_scale_inv", "model.mtp.layers.0.self_attn.q_b_proj.weight_scale_inv": "model.layers.0.self_attn.q_b_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.down_proj.weight": "model.layers.0.mlp.down_proj.weight", "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight": "model.layers.0.mlp.down_proj.weight", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_inv": "model.layers.0.mlp.down_proj.weight_scale_inv", "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_inv": "model.layers.0.mlp.down_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight": "model.layers.0.mlp.gate_proj.weight", "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight": "model.layers.0.mlp.gate_proj.weight", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv": "model.layers.0.mlp.gate_proj.weight_scale_inv", "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv": "model.layers.0.mlp.gate_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.up_proj.weight": "model.layers.0.mlp.up_proj.weight", "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight": "model.layers.0.mlp.up_proj.weight", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_inv": "model.layers.0.mlp.up_proj.weight_scale_inv", "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_inv": "model.layers.0.mlp.up_proj.weight_scale_inv", # noqa: E501
"model.mtp.norm.weight": "final_layernorm.weight", "model.mtp.norm.weight": "final_layernorm.weight",
} }

View File

@ -1000,8 +1000,8 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
"base_layer.": "", "base_layer.": "",
}, },
orig_to_new_prefix={ orig_to_new_prefix={
"model.embed_tokens_extend.audio_embed.audio_projection.vision.": "embed_tokens_extend.audio_projection_for_vision.", "model.embed_tokens_extend.audio_embed.audio_projection.vision.": "embed_tokens_extend.audio_projection_for_vision.", # noqa: E501
"model.embed_tokens_extend.audio_embed.audio_projection.speech.": "embed_tokens_extend.audio_projection.", "model.embed_tokens_extend.audio_embed.audio_projection.speech.": "embed_tokens_extend.audio_projection.", # noqa: E501
"model.embed_tokens_extend.audio_embed.": "embed_tokens_extend.", "model.embed_tokens_extend.audio_embed.": "embed_tokens_extend.",
"model.embed_tokens_extend.image_embed.": "vision_encoder.", "model.embed_tokens_extend.image_embed.": "vision_encoder.",
}, },

View File

@ -916,8 +916,9 @@ class Qwen3NextDecoderLayer(nn.Module):
) )
else: else:
assert len(hidden_states.shape) == len(self.ffn_layer_scale.shape), ( assert len(hidden_states.shape) == len(self.ffn_layer_scale.shape), (
f"shape must be the same {len(hidden_states.shape)}, {len(self.ffn_layer_scale.shape)}" f"shape must be the same {len(hidden_states.shape)}, "
) # noqa: E501 f"{len(self.ffn_layer_scale.shape)}"
)
hidden_states = hidden_states * ( hidden_states = hidden_states * (
self.ffn_layer_scale.to(hidden_states.dtype) + 1 self.ffn_layer_scale.to(hidden_states.dtype) + 1
) )

View File

@ -255,8 +255,8 @@ def is_rocm_aiter_fp8bmm_enabled() -> bool:
if is_rocm_aiter_fp8bmm_enabled(): if is_rocm_aiter_fp8bmm_enabled():
from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import ( # noqa: E501 # isort: skip from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import ( # noqa: E501
batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_triton_fp8_bmm, batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_triton_fp8_bmm, # noqa: E501
) )
def dynamic_per_batched_tensor_quant( def dynamic_per_batched_tensor_quant(
@ -1284,8 +1284,10 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
actual_seq_lens_q=prefill.query_seq_lens.view(-1, 1, 1, 1), actual_seq_lens_q=prefill.query_seq_lens.view(-1, 1, 1, 1),
actual_seq_lens_kv=prefill.query_seq_lens.view(-1, 1, 1, 1), actual_seq_lens_kv=prefill.query_seq_lens.view(-1, 1, 1, 1),
causal=True, causal=True,
return_lse=True, # do not support False for now # Do not support False for now
is_cuda_graph_compatible=True, # Indicates actual_seq_lens are on GPU or CPU. return_lse=True,
# Indicates actual_seq_lens are on GPU or CPU.
is_cuda_graph_compatible=True,
) )
if return_softmax_lse: if return_softmax_lse:
return output, lse return output, lse
@ -1342,7 +1344,8 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
), ),
causal=False, causal=False,
return_lse=True, return_lse=True,
is_cuda_graph_compatible=True, # Indicates actual_seq_lens are on GPU or CPU. # Indicates actual_seq_lens are on GPU or CPU.
is_cuda_graph_compatible=True,
) )
def process_weights_after_loading(self, act_dtype: torch.dtype): def process_weights_after_loading(self, act_dtype: torch.dtype):

View File

@ -872,10 +872,13 @@ def wait_for_engine_startup(
EngineHandshakeMetadata( EngineHandshakeMetadata(
addresses=addresses, addresses=addresses,
parallel_config={ parallel_config={
"data_parallel_master_ip": parallel_config.data_parallel_master_ip, k: getattr(parallel_config, k)
"data_parallel_master_port": parallel_config.data_parallel_master_port, for k in (
"_data_parallel_master_port_list": parallel_config._data_parallel_master_port_list, "data_parallel_master_ip",
"data_parallel_size": parallel_config.data_parallel_size, "data_parallel_master_port",
"_data_parallel_master_port_list",
"data_parallel_size",
)
}, },
) )
) )
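The hunk above collapses four over-long dictionary entries into a comprehension that pulls the same attributes off the config object by name. A minimal sketch with a stand-in dataclass; the class and field set below are illustrative, not the vLLM ParallelConfig API.

from dataclasses import dataclass


@dataclass
class FakeParallelConfig:
    data_parallel_master_ip: str = "127.0.0.1"
    data_parallel_master_port: int = 29500
    data_parallel_size: int = 2


cfg = FakeParallelConfig()

# One short comprehension replaces several "key": long.attribute.chain lines.
parallel_fields = {
    k: getattr(cfg, k)
    for k in (
        "data_parallel_master_ip",
        "data_parallel_master_port",
        "data_parallel_size",
    )
}
assert parallel_fields["data_parallel_size"] == 2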

View File

@ -345,13 +345,15 @@ def report_usage_stats(
from vllm.model_executor.model_loader import get_architecture_class_name from vllm.model_executor.model_loader import get_architecture_class_name
parallel_config = vllm_config.parallel_config
usage_message.report_usage( usage_message.report_usage(
get_architecture_class_name(vllm_config.model_config), get_architecture_class_name(vllm_config.model_config),
usage_context, usage_context,
extra_kvs={ extra_kvs={
# Common configuration # Common configuration
"dtype": str(vllm_config.model_config.dtype), "dtype": str(vllm_config.model_config.dtype),
"tensor_parallel_size": vllm_config.parallel_config.tensor_parallel_size, "tensor_parallel_size": parallel_config.tensor_parallel_size,
"block_size": vllm_config.cache_config.block_size, "block_size": vllm_config.cache_config.block_size,
"gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization, "gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization,
"kv_cache_memory_bytes": vllm_config.cache_config.kv_cache_memory_bytes, "kv_cache_memory_bytes": vllm_config.cache_config.kv_cache_memory_bytes,
@ -362,7 +364,7 @@ def report_usage_stats(
"enable_lora": bool(vllm_config.lora_config), "enable_lora": bool(vllm_config.lora_config),
"enable_prefix_caching": vllm_config.cache_config.enable_prefix_caching, "enable_prefix_caching": vllm_config.cache_config.enable_prefix_caching,
"enforce_eager": vllm_config.model_config.enforce_eager, "enforce_eager": vllm_config.model_config.enforce_eager,
"disable_custom_all_reduce": vllm_config.parallel_config.disable_custom_all_reduce, "disable_custom_all_reduce": parallel_config.disable_custom_all_reduce,
}, },
) )

View File

@ -3391,7 +3391,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
attn_metadata[ubid][layer_name] = attn_metadata_i attn_metadata[ubid][layer_name] = attn_metadata_i
else: else:
assert type(attn_metadata) is dict assert type(attn_metadata) is dict
attn_metadata_i = attn_group.get_metadata_builder().build_for_cudagraph_capture( metadata_builder = attn_group.get_metadata_builder()
attn_metadata_i = metadata_builder.build_for_cudagraph_capture(
common_attn_metadata common_attn_metadata
) )
for layer_name in attn_group.layer_names: for layer_name in attn_group.layer_names: