Fix per file ruff ignores related to line length (#26262)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
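
The diff below removes the temporary per-file E501 ignores from `pyproject.toml`; the affected files are updated either by reflowing long lines or by adding a targeted inline `# noqa: E501` where a line cannot reasonably be split. As a rough illustration of the reflow patterns applied throughout the patch, here is a minimal, hypothetical sketch (the argument names, help text, and default are illustrative only and are not taken from this commit):

```python
import argparse

parser = argparse.ArgumentParser()

# Before (hypothetical): one long line that needed a suppression comment.
#   help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
# After: adjacent string literals are concatenated implicitly, the line fits
# within the length limit, and the trailing "# noqa: E501" can be dropped.
parser.add_argument(
    "--tokenizer",
    type=str,
    help="Name or path of the tokenizer, "
    "if not using the default tokenizer.",
)

# The other recurring pattern: a trailing explanatory comment that pushed a
# line over the limit is moved onto its own line above the value it describes.
parser.add_argument(
    "--num-iterations",
    type=int,
    # Number of timing iterations (hypothetical default).
    default=100,
)

print(parser.parse_args([]))
```

Lines that genuinely cannot be split (for example long model identifiers or dotted backend class paths) instead keep a targeted inline suppression.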
@@ -164,7 +164,7 @@ def invoke_main() -> None:
)
parser.add_argument(
"--batched", action="store_true", help="consider time to prepare batch"
) # noqa: E501
)
parser.add_argument(
"--num-iteration",
type=int,
@@ -909,13 +909,13 @@ def create_argument_parser():
parser.add_argument(
"--tokenizer",
type=str,
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
help="Name or path of the tokenizer, if not using the default tokenizer.",
)
parser.add_argument(
"--tokenizer-mode",
type=str,
default="auto",
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
help="Name or path of the tokenizer, if not using the default tokenizer.",
)
parser.add_argument(
"--num-prompts",
@@ -72,8 +72,8 @@ VLLMKernelScheduleTag: dict[
] = {
**KernelScheduleTag, # type: ignore
**{
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",
MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong",
MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative",
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501
MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong", # noqa: E501
MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative", # noqa: E501
},
}
@@ -113,7 +113,7 @@ def run_e5_v(query: Query) -> ModelRequestData:
def _get_vlm2vec_prompt_image(query: Query, image_token: str):
if query["modality"] == "text":
text = query["text"]
prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501
prompt = f"Find me an everyday image that matches the given caption: {text}"
image = None
elif query["modality"] == "image":
prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501
@@ -203,9 +203,9 @@ class Proxy:
async with session.post(
url=url, json=data, headers=headers
) as response:
if 200 <= response.status < 300 or 400 <= response.status < 500: # noqa: E501
if 200 <= response.status < 300 or 400 <= response.status < 500:
if use_chunked:
async for chunk_bytes in response.content.iter_chunked( # noqa: E501
async for chunk_bytes in response.content.iter_chunked(
1024
):
yield chunk_bytes
@@ -56,52 +56,6 @@ include = ["vllm*"]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
# TEMPORARY! These ignores will be fixed forward
## Line length violations
"csrc/cutlass_extensions/vllm_cutlass_library_extension.py" = ["E501"]
"tests/compile/piecewise/test_simple.py" = ["E501"]
"tests/compile/piecewise/test_toy_llama.py" = ["E501", "B023"]
"tests/entrypoints/conftest.py" = ["E501"]
"tests/entrypoints/openai/test_audio.py" = ["E501"]
"tests/entrypoints/openai/test_chat.py" = ["E501"]
"tests/entrypoints/openai/test_chat_template.py" = ["E501"]
"tests/entrypoints/openai/test_chat_with_tool_reasoning.py" = ["E501"]
"tests/entrypoints/openai/test_completion_with_function_calling.py" = ["E501"]
"tests/entrypoints/openai/test_video.py" = ["E501"]
"tests/entrypoints/openai/test_vision.py" = ["E501"]
"tests/entrypoints/test_chat_utils.py" = ["E501"]
"tests/kernels/moe/modular_kernel_tools/common.py" = ["E501"]
"tests/models/language/generation/test_gemma.py" = ["E501"]
"tests/models/language/generation/test_mistral.py" = ["E501"]
"tests/models/multimodal/generation/test_ultravox.py" = ["E501"]
"tests/models/multimodal/generation/test_voxtral.py" = ["E501"]
"tests/models/multimodal/generation/vlm_utils/custom_inputs.py" = ["E501"]
"tests/tool_use/test_tool_choice_required.py" = ["E501"]
"tests/v1/attention/utils.py" = ["E501"]
"tests/v1/entrypoints/openai/responses/test_image.py" = ["E501"]
"tests/v1/kv_connector/nixl_integration/test_accuracy.py" = ["E501"]
"tests/v1/kv_connector/unit/test_offloading_connector.py" = ["E501"]
"tests/v1/logits_processors/test_custom_offline.py" = ["E501"]
"vllm/attention/ops/pallas_kv_cache_update.py" = ["E501"]
"vllm/compilation/collective_fusion.py" = ["E501"]
"vllm/compilation/wrapper.py" = ["E501"]
"vllm/config/vllm.py" = ["E501"]
"vllm/distributed/device_communicators/all2all.py" = ["E501"]
"vllm/entrypoints/openai/protocol.py" = ["E501"]
"vllm/lora/layers/vocal_parallel_embedding.py" = ["E501"]
"vllm/model_executor/model_loader/bitsandbytes_loader.py" = ["E501"]
"vllm/model_executor/models/bailing_moe.py" = ["E501"]
"vllm/model_executor/models/hyperclovax_vision.py" = ["E501"]
"vllm/model_executor/models/llama4_eagle.py" = ["E501"]
"vllm/model_executor/models/longcat_flash_mtp.py" = ["E501"]
"vllm/model_executor/models/phi4mm.py" = ["E501"]
"vllm/model_executor/models/qwen3_next.py" = ["E501"]
"vllm/model_executor/layers/quantization/ptpc_fp8.py" = ["E501"]
"vllm/v1/attention/backends/mla/common.py" = ["E501"]
"vllm/v1/engine/utils.py" = ["E501"]
"vllm/v1/utils.py" = ["E501"]
"vllm/v1/worker/gpu_model_runner.py" = ["E501"]
# End of temporary ignores

[tool.ruff.lint]
select = [
@@ -132,10 +132,14 @@ def test_simple_piecewise_compile(use_inductor):
splitting_ops=["silly.attention"],
use_inductor_graph_partition=False,
use_inductor=use_inductor,
expected_num_piecewise_graphs_seen=5, # 2 * num_layers + 1
expected_num_piecewise_capturable_graphs_seen=3, # 1 + num_layers
expected_num_backend_compilations=3, # num_piecewise_capturable_graphs_seen
expected_num_cudagraph_captured=6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
# 2 * num_layers + 1
expected_num_piecewise_graphs_seen=5,
# 1 + num_layers
expected_num_piecewise_capturable_graphs_seen=3,
# num_piecewise_capturable_graphs_seen
expected_num_backend_compilations=3,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
expected_num_cudagraph_captured=6,
)


@@ -147,14 +151,16 @@ def test_simple_inductor_graph_partition(splitting_ops):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

_run_simple_model(
# inductor graph partition automatically resets splitting_ops
# to be an empty list
# Inductor graph partition automatically resets splitting_ops to an empty list
splitting_ops=splitting_ops,
use_inductor_graph_partition=True,
use_inductor=True,
expected_num_piecewise_graphs_seen=1, # since not splitting at fx graph level
expected_num_piecewise_capturable_graphs_seen=1, # since not splitting at fx graph level
expected_num_backend_compilations=1, # since not splitting at fx graph level
expected_num_cudagraph_captured=6, # inductor graph partition still captures 6
# graph, same as fx graph partition.
# Since not splitting at fx graph level
expected_num_piecewise_graphs_seen=1,
# Since not splitting at fx graph level
expected_num_piecewise_capturable_graphs_seen=1,
# Since not splitting at fx graph level
expected_num_backend_compilations=1,
# Inductor graph partition still captures 6 graph, same as fx graph partition
expected_num_cudagraph_captured=6,
)
@@ -367,11 +367,14 @@ def test_toy_llama(use_inductor: bool):
kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}

with compilation_counter.expect(
num_graphs_seen=1, # one graph for the model
# One graph for the model
num_graphs_seen=1,
num_piecewise_graphs_seen=1,
num_piecewise_capturable_graphs_seen=1,
num_backend_compilations=1, # num_piecewise_capturable_graphs_seen
num_cudagraph_captured=2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
# num_piecewise_capturable_graphs_seen
num_backend_compilations=1,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
num_cudagraph_captured=2,
**kwargs,
):
outputs.append(
@@ -478,9 +481,10 @@ def benchmark():
# it is fine here, because we only use the lambda function once.
runtime = do_bench(
lambda: graphs[b][0]( # noqa
input_ids[:b], positions[:b]
input_ids[:b], # noqa
positions[:b], # noqa
)
)
) # noqa
piecewise_cudagraph_time[b] = runtime
else:
runtime = do_bench(lambda: graphs[b][0].replay()) # noqa
@@ -243,7 +243,7 @@ def test_fix_functionalization(model_class: torch.nn.Module, do_fusion: bool):
# check if the functionalization pass is applied
for op in model.ops_in_model(do_fusion):
find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None # noqa: E501
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None

# make sure the ops were all de-functionalized
found = dict()
@@ -565,7 +565,7 @@ def test_attention_quant_pattern(
elif quant_key.dtype == FP4_DTYPE:
assert attn_nodes_post[0].kwargs.get("output_block_scale") is not None, (
"Attention should have output_block_scale after FP4 fusion"
) # noqa: E501
)

# Check that results are close
torch.testing.assert_close(result_unfused, result_fused_1, atol=1e-2, rtol=1e-2)
@@ -186,7 +186,7 @@ class TestQuantModel(torch.nn.Module):
):
# If fusion happens, the fused op is the one
# we check for (de)functionalization
return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default] # noqa: E501
return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default]
else:
# If no fusion, the original ops are checked
return [
@@ -322,7 +322,7 @@ def sequence_parallelism_pass_on_test_model(
# check if the functionalization pass is applied
for op in model.ops_in_model():
find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None # noqa: E501
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None

# make sure the ops were all de-functionalized
found = dict()
@@ -104,7 +104,7 @@ TEXT_GENERATION_MODELS = {
# [Decoder-only]
# Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
"Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"), # noqa: E501
"Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"),
"baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
"baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
"bigscience/bloomz-1b1": PPTestSettings.fast(),
@@ -138,7 +138,7 @@ TEXT_GENERATION_MODELS = {
# Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
"state-spaces/mamba-130m-hf": PPTestSettings.fast(),
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"), # noqa: E501
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"),
"mosaicml/mpt-7b": PPTestSettings.fast(),
"nvidia/Minitron-8B-Base": PPTestSettings.fast(),
"allenai/OLMo-1B-hf": PPTestSettings.fast(),
@@ -151,13 +151,13 @@ TEXT_GENERATION_MODELS = {
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(
multi_node_only=True, load_format="dummy"
), # noqa: E501
),
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
"Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
"stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
"bigcode/starcoder2-3b": PPTestSettings.fast(),
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"), # noqa: E501
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"),
# FIXME: Cannot load tokenizer in latest transformers version.
# Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
@@ -83,7 +83,8 @@ def sample_complex_json_schema():
"type": "array",
"items": {
"type": "string",
"pattern": "^[a-z]{1,10}$", # Combining length and pattern restrictions
# Combining length and pattern restrictions
"pattern": "^[a-z]{1,10}$",
},
},
},
@@ -145,7 +145,7 @@ async def test_single_chat_session_audio_base64encoded(
{
"type": "audio_url",
"audio_url": {
"url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
"url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" # noqa: E501
},
},
{"type": "text", "text": "What's happening in this audio?"},
@@ -835,17 +835,18 @@ async def test_extra_fields_allowed(client: openai.AsyncOpenAI):

@pytest.mark.asyncio
async def test_complex_message_content(client: openai.AsyncOpenAI):
content = [
{
"type": "text",
"text": "what is 1+1? please provide the result without any other text.",
}
]
resp = await client.chat.completions.create(
model=MODEL_NAME,
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "what is 1+1? please provide the result without any other text.",
}
],
"content": content,
}
],
temperature=0,
@@ -76,8 +76,8 @@ def test_load_chat_template():
assert (
template_content
== """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""
) # noqa: E501
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" # noqa: E501
)


def test_no_load_chat_template_filelike():
@@ -45,12 +45,13 @@ TOOLS = [
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'",
"description": "The city to find the weather for, e.g. "
"'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'",
"description": "the two-letter abbreviation for the state that "
"the city is in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
@@ -69,7 +70,8 @@ MESSAGES = [
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
{
"role": "user",
"content": "Can you tell me what the temperate will be in Dallas, in fahrenheit?",
"content": "Can you tell me what the temperate will be in Dallas, "
"in fahrenheit?",
},
]

@@ -25,12 +25,14 @@ tools = [
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'Vienna'",
"description": "The city to find the weather for, e.g. "
"'Vienna'",
"default": "Vienna",
},
"country": {
"type": "string",
"description": "The country that the city is in, e.g. 'Austria'",
"description": "The country that the city is in, e.g. "
"'Austria'",
},
"unit": {
"type": "string",
@@ -85,12 +87,14 @@ tools = [
"properties": {
"city": {
"type": "string",
"description": "The city to get the forecast for, e.g. 'Vienna'",
"description": "The city to get the forecast for, e.g. "
"'Vienna'",
"default": "Vienna",
},
"country": {
"type": "string",
"description": "The country that the city is in, e.g. 'Austria'",
"description": "The country that the city is in, e.g. "
"'Austria'",
},
"days": {
"type": "integer",
@@ -179,7 +179,7 @@ async def test_single_chat_session_video_base64encoded(
{
"type": "video_url",
"video_url": {
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501
},
},
{"type": "text", "text": "What's in this video?"},
@@ -238,7 +238,7 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
{
"type": "video_url",
"video_url": {
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501
},
},
{"type": "text", "text": "What's in this video?"},
@@ -233,7 +233,7 @@ async def test_single_chat_session_image_base64encoded(
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
"url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501
},
},
{"type": "text", "text": content_text},
@@ -300,7 +300,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
"url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501
},
},
{"type": "text", "text": "What's in this image?"},
@ -947,7 +947,8 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in <|image_1|> and how does it compare to the other one?", # noqa: E501
|
||||
"text": "What's in <|image_1|> and how does it compare to "
|
||||
"the other one?",
|
||||
},
|
||||
],
|
||||
}
|
||||
@ -960,8 +961,8 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
|
||||
assert conversation == [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the "
|
||||
"other one?",
|
||||
"content": "<|image_2|>\nWhat's in <|image_1|> and how does it compare to "
|
||||
"the other one?",
|
||||
}
|
||||
]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
@ -1364,7 +1365,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave( # noqa: E501
|
||||
def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave(
|
||||
phi3v_model_config_mm_interleaved,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
@ -1451,14 +1452,14 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
|
||||
assert conversation == [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
|
||||
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501
|
||||
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
|
||||
"\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
|
||||
},
|
||||
{"role": "assistant", "content": "Some stuff."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
|
||||
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
|
||||
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
|
||||
"\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
|
||||
},
|
||||
]
|
||||
|
||||
@ -1468,7 +1469,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
|
||||
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave( # noqa: E501
|
||||
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave(
|
||||
qwen25omni_model_config_mm_interleaved,
|
||||
qwen25omni_tokenizer,
|
||||
image_url,
|
||||
@ -1521,14 +1522,14 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
|
||||
assert conversation == [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
|
||||
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501
|
||||
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
|
||||
"\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
|
||||
},
|
||||
{"role": "assistant", "content": "Some stuff."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
|
||||
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
|
||||
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
|
||||
"\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
|
||||
},
|
||||
]
|
||||
|
||||
@ -1593,14 +1594,14 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
|
||||
assert conversation == [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
|
||||
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501
|
||||
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
|
||||
"\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
|
||||
},
|
||||
{"role": "assistant", "content": "Some stuff."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
|
||||
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
|
||||
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
|
||||
"\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
|
||||
},
|
||||
]
|
||||
|
||||
@ -1661,14 +1662,14 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
|
||||
assert conversation == [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
|
||||
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501
|
||||
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
|
||||
"\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
|
||||
},
|
||||
{"role": "assistant", "content": "Some stuff."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
|
||||
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
|
||||
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
|
||||
"\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
|
||||
},
|
||||
]
|
||||
|
||||
@ -2193,7 +2194,8 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
|
||||
assert conversation == [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?",
|
||||
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the "
|
||||
"audio say?",
|
||||
}
|
||||
]
|
||||
_assert_mm_data_inputs(mm_data, {"audio": 1})
|
||||
@ -2228,7 +2230,8 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
|
||||
assert conversation == [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?",
|
||||
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the "
|
||||
"audio say?",
|
||||
}
|
||||
]
|
||||
_assert_mm_data_inputs(await mm_future, {"audio": 1})
|
||||
|
@ -165,7 +165,7 @@ def test_env(
|
||||
# FlashMLA only supports block_size == 64
|
||||
pytest.skip("FlashMLA only supports block_size 64")
|
||||
else:
|
||||
from vllm.v1.attention.backends.mla.flashmla import ( # noqa: E501
|
||||
from vllm.v1.attention.backends.mla.flashmla import (
|
||||
is_flashmla_supported,
|
||||
)
|
||||
|
||||
|
@ -331,7 +331,8 @@ class WeightTensors:
|
||||
in_dtype=config.dtype,
|
||||
quant_dtype=config.quant_dtype,
|
||||
block_shape=config.quant_block_shape,
|
||||
per_out_ch_quant=config.is_per_act_token_quant, # or config.is_per_out_ch_quant
|
||||
# or config.is_per_out_ch_quant
|
||||
per_out_ch_quant=config.is_per_act_token_quant,
|
||||
)
|
||||
return WeightTensors(
|
||||
w1=w1, w2=w2, w1_scale=w1_scale, w2_scale=w2_scale, w1_gs=w1_gs, w2_gs=w2_gs
|
||||
|
@ -124,7 +124,7 @@ def make_feature_matrix(csv_file_path: str):
|
||||
results_df: Optional[pd.DataFrame] = None
|
||||
for m, k, n, e, topks, dtype, pf_type, experts_type, quant_config in tqdm(
|
||||
combinations
|
||||
): # noqa: E501
|
||||
):
|
||||
config = Config(
|
||||
Ms=[m],
|
||||
K=k,
|
||||
|
@ -10,7 +10,7 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
|
||||
BatchedDeepGemmExperts,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501
|
||||
from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (
|
||||
BatchedTritonOrDeepGemmExperts,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
@ -196,10 +196,10 @@ register_experts(
|
||||
|
||||
# Disable on blackwell for now
|
||||
if has_deep_ep() and not current_platform.has_device_capability(100):
|
||||
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501
|
||||
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
|
||||
DeepEPHTPrepareAndFinalize,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501
|
||||
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
|
||||
DeepEPLLPrepareAndFinalize,
|
||||
)
|
||||
|
||||
@ -233,7 +233,7 @@ if has_pplx():
|
||||
)
|
||||
|
||||
if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
|
||||
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501
|
||||
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
|
||||
FlashInferExperts,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501
|
||||
|
@ -17,10 +17,10 @@ from typing_extensions import Concatenate, ParamSpec
|
||||
from vllm.utils import get_open_port, has_deep_ep
|
||||
|
||||
if has_deep_ep():
|
||||
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501
|
||||
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
|
||||
DeepEPHTPrepareAndFinalize,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501
|
||||
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
|
||||
DeepEPLLPrepareAndFinalize,
|
||||
)
|
||||
|
||||
|
@ -30,10 +30,10 @@ from .parallel_utils import ProcessGroupInfo, parallel_launch
|
||||
from .utils import make_test_weights
|
||||
|
||||
if has_deep_ep():
|
||||
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501
|
||||
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
|
||||
DeepEPHTPrepareAndFinalize,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501
|
||||
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
|
||||
DeepEPLLPrepareAndFinalize,
|
||||
)
|
||||
|
||||
|
@ -28,10 +28,10 @@ from ...utils import multi_gpu_test
|
||||
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
||||
|
||||
if has_deep_ep():
|
||||
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501
|
||||
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
|
||||
DeepEPHTPrepareAndFinalize,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501
|
||||
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
|
||||
DeepEPLLPrepareAndFinalize,
|
||||
)
|
||||
|
||||
|
@ -271,7 +271,7 @@ if __name__ == "__main__":
|
||||
parser = make_config_arg_parser(
|
||||
description=(
|
||||
"Run single prepare-finalize & fused-experts combination test"
|
||||
"Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations " # noqa: E501
|
||||
"Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations "
|
||||
"--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
|
||||
)
|
||||
)
|
||||
|
@ -483,8 +483,8 @@ def test_mixtral_moe(
|
||||
}
|
||||
|
||||
if use_rocm_aiter:
|
||||
# The values of rtol and atol are set based on the tests in ROCM AITER package. # noqa: E501
|
||||
# https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174 # noqa: E501
|
||||
# The values of rtol and atol are set based on the tests in ROCM AITER package.
|
||||
# https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174
|
||||
torch.testing.assert_close(
|
||||
hf_states.flatten(0, 1), vllm_states, rtol=0.01, atol=100
|
||||
)
|
||||
|
@ -10,11 +10,11 @@ import pytest
|
||||
import torch
|
||||
from packaging import version
|
||||
|
||||
from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501
|
||||
from vllm.model_executor.layers.quantization.quark.quark import (
|
||||
QuarkLinearMethod,
|
||||
QuarkW4A4MXFP4,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.quark.quark_moe import ( # noqa: E501
|
||||
from vllm.model_executor.layers.quantization.quark.quark_moe import (
|
||||
QuarkW4A4MXFp4MoEMethod,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
|
@ -12,7 +12,7 @@ PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example
|
||||
|
||||
EXPECTED_LORA_OUTPUT = [
|
||||
"SELECT count(*) FROM singer",
|
||||
"SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", # noqa: E501
|
||||
"SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'",
|
||||
"SELECT name , country , age FROM singer ORDER BY age",
|
||||
]
|
||||
|
||||
@ -21,10 +21,16 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
prompts = [
|
||||
PROMPT_TEMPLATE.format(query="How many singers do we have?"),
|
||||
PROMPT_TEMPLATE.format(
|
||||
query="What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
|
||||
query=(
|
||||
"What is the average, minimum, and maximum "
|
||||
"age of all singers from France?"
|
||||
)
|
||||
),
|
||||
PROMPT_TEMPLATE.format(
|
||||
query="Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501
|
||||
query=(
|
||||
"Show name, country, age for all singers ordered "
|
||||
"by age from the oldest to the youngest."
|
||||
)
|
||||
),
|
||||
]
|
||||
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
|
||||
|
@ -15,10 +15,10 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
||||
|
||||
EXPECTED_LORA_OUTPUT = [
|
||||
" SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501
|
||||
" SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501
|
||||
" SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",
|
||||
" SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501
|
||||
" SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501
|
||||
" SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501
|
||||
" SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",
|
||||
" SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ", # noqa: E501
|
||||
]
|
||||
|
||||
|
@ -26,7 +26,7 @@ LORA_RANK = 8
|
||||
LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"]
|
||||
LORA_TEST_EXPECTED = [
|
||||
"GitHub is an open-source platform that provides a way to manage and develop software projects. It allows developers to store and manage code, collaborate on projects, and automate tasks.", # noqa: E501
|
||||
"I am Alice, an AI assistant developed by GitHub/Charent.", # noqa: E501
|
||||
"I am Alice, an AI assistant developed by GitHub/Charent.",
|
||||
]
|
||||
|
||||
|
||||
|
@ -16,7 +16,7 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
|
||||
) as llm:
|
||||
if model == "google/gemma-3-4b-it":
|
||||
normalizers = llm.llm.collective_rpc(
|
||||
lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item()
|
||||
lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item() # noqa: E501
|
||||
)
|
||||
config = llm.llm.llm_engine.model_config.hf_config.text_config
|
||||
else:
|
||||
|
@ -46,12 +46,13 @@ TOOLS = [
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description": "The city to find the weather for, e.g. 'San Francisco'",
|
||||
"description": "The city to find the weather for, e.g. "
|
||||
"'San Francisco'",
|
||||
},
|
||||
"state": {
|
||||
"type": "string",
|
||||
"description": "the two-letter abbreviation for the state that the city is"
|
||||
" in, e.g. 'CA' which would mean 'California'",
|
||||
"description": "the two-letter abbreviation for the state that "
|
||||
"the city is in, e.g. 'CA' which would mean 'California'",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
@ -85,7 +86,8 @@ MSGS = [
|
||||
{"role": "system", "content": "You are an assistant."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors.", # noqa
|
||||
"content": "Could you please rewrite the below article? \n\n My English needs "
|
||||
"improvving, maybe I make errors.",
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
@ -96,14 +98,16 @@ MSGS = [
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "rewrite",
|
||||
"arguments": '{"text":"My English needs improvving, maybe I make errors."}', # noqa
|
||||
"arguments": '{"text":"My English needs improvving, maybe '
|
||||
'I make errors."}',
|
||||
},
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"role": "tool",
|
||||
"content": '{"action":"rewrite","outcome":"My English needs improving, maybe I make errors."}', # noqa
|
||||
"content": '{"action":"rewrite","outcome":"My English needs improving, maybe '
|
||||
'I make errors."}',
|
||||
"tool_call_id": "bbc5b7ede",
|
||||
"name": "rewrite",
|
||||
},
|
||||
|
@ -130,14 +130,14 @@ VLM_TEST_SETTINGS = {
|
||||
dtype="bfloat16",
|
||||
marks=[
|
||||
pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
|
||||
], # noqa: E501
|
||||
],
|
||||
),
|
||||
"qwen2_5_vl": VLMTestInfo(
|
||||
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
|
||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
@ -149,8 +149,8 @@ VLM_TEST_SETTINGS = {
|
||||
models=["Qwen/Qwen2.5-Omni-3B"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>",
|
||||
video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
num_logprobs=6 if current_platform.is_cpu() else 5,
|
||||
@ -181,7 +181,7 @@ VLM_TEST_SETTINGS = {
|
||||
max_model_len=16384,
|
||||
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
|
||||
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
|
||||
), # noqa: E501
|
||||
),
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||
image_size_factors=[(0.25, 0.5, 1.0)],
|
||||
@ -213,7 +213,7 @@ VLM_TEST_SETTINGS = {
|
||||
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
@ -237,10 +237,10 @@ VLM_TEST_SETTINGS = {
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<vlm_image>Please describe the image shortly.",
|
||||
"cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
|
||||
"cherry_blossom": "<vlm_image>Please infer the season with reason.",
|
||||
}
|
||||
),
|
||||
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
|
||||
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",
|
||||
stop_str=["<|im_end|>"],
|
||||
image_size_factors=[(0.10, 0.15)],
|
||||
max_tokens=64,
|
||||
@ -252,11 +252,11 @@ VLM_TEST_SETTINGS = {
|
||||
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>What is the season?", # noqa: E501
|
||||
"stop_sign": "<image>What's the content in the center of the image?",
|
||||
"cherry_blossom": "<image>What is the season?",
|
||||
}
|
||||
),
|
||||
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
|
||||
multi_image_prompt="<image><image>Describe the two images in detail.",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
@ -268,11 +268,11 @@ VLM_TEST_SETTINGS = {
|
||||
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>What is the season?", # noqa: E501
|
||||
"stop_sign": "<image>What's the content in the center of the image?",
|
||||
"cherry_blossom": "<image>What is the season?",
|
||||
}
|
||||
),
|
||||
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
|
||||
multi_image_prompt="<image><image>Describe the two images in detail.",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
@ -311,14 +311,14 @@ VLM_TEST_SETTINGS = {
|
||||
max_num_seqs=2,
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?",
|
||||
"cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501
|
||||
}
|
||||
),
|
||||
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
|
||||
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
|
||||
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
|
||||
stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501
|
||||
stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"],
|
||||
image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
|
||||
),
|
||||
"fuyu": VLMTestInfo(
|
||||
@ -342,7 +342,7 @@ VLM_TEST_SETTINGS = {
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<start_of_image>What is the season?", # noqa: E501
|
||||
"cherry_blossom": "<start_of_image>What is the season?",
|
||||
}
|
||||
),
|
||||
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
|
||||
@ -356,7 +356,7 @@ VLM_TEST_SETTINGS = {
|
||||
"glm4v": VLMTestInfo(
|
||||
models=["zai-org/glm-4v-9b"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
|
||||
@ -377,9 +377,9 @@ VLM_TEST_SETTINGS = {
|
||||
"glm4_1v": VLMTestInfo(
|
||||
models=["zai-org/GLM-4.1V-9B-Thinking"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
|
||||
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
|
||||
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
||||
@ -410,10 +410,10 @@ VLM_TEST_SETTINGS = {
|
||||
"h2oai/h2ovl-mississippi-2b",
|
||||
],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>",
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?",
|
||||
"cherry_blossom": "<image>\nWhat is the season?",
|
||||
}
|
||||
),
|
||||
@ -444,7 +444,7 @@ VLM_TEST_SETTINGS = {
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?",
|
||||
"cherry_blossom": "<image>\nWhat is the season?",
|
||||
}
|
||||
),
|
||||
@ -529,7 +529,7 @@ VLM_TEST_SETTINGS = {
|
||||
max_model_len=16384,
|
||||
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
|
||||
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
|
||||
), # noqa: E501
|
||||
),
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||
custom_test_opts=[
|
||||
@ -583,7 +583,7 @@ VLM_TEST_SETTINGS = {
|
||||
max_num_seqs=2,
|
||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
|
||||
["<|im_end|>", "<|endoftext|>"]
|
||||
), # noqa: E501
|
||||
),
|
||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
|
||||
# FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
|
||||
@ -598,7 +598,7 @@ VLM_TEST_SETTINGS = {
|
||||
max_num_seqs=2,
|
||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
|
||||
["<|im_end|>", "<|endoftext|>"]
|
||||
), # noqa: E501
|
||||
),
|
||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
|
||||
),
|
||||
@ -627,7 +627,7 @@ VLM_TEST_SETTINGS = {
|
||||
models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>\n",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
dtype="half",
|
||||
@ -640,7 +640,7 @@ VLM_TEST_SETTINGS = {
|
||||
models=["AIDC-AI/Ovis2-1B"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>\n",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
dtype="half",
|
||||
@ -652,7 +652,7 @@ VLM_TEST_SETTINGS = {
|
||||
models=["AIDC-AI/Ovis2.5-2B"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>\n",
|
||||
video_idx_to_prompt=lambda idx: "<video>\n",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
@ -701,8 +701,8 @@ VLM_TEST_SETTINGS = {
|
||||
models=["Qwen/Qwen2-VL-2B-Instruct"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
|
||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
|
||||
multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
@ -717,11 +717,11 @@ VLM_TEST_SETTINGS = {
|
||||
prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||
"stop_sign": "<image>\nWhat's the content in the center of the image?",
|
||||
"cherry_blossom": "<image>\nWhat is the season?",
|
||||
}
|
||||
),
|
||||
multi_image_prompt="<image>\n<image>\nDescribe the two images in short.", # noqa: E501
|
||||
multi_image_prompt="<image>\n<image>\nDescribe the two images in short.",
|
||||
max_model_len=4096,
|
||||
use_tokenizer_eos=True,
|
||||
patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
|
||||
@ -754,8 +754,8 @@ VLM_TEST_SETTINGS = {
|
||||
VLMTestType.VIDEO,
|
||||
),
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
|
||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
@ -816,7 +816,7 @@ VLM_TEST_SETTINGS = {
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
|
||||
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
|
||||
), # noqa: E501
|
||||
),
|
||||
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||
custom_test_opts=[
|
||||
CustomTestOptions(
|
||||
|
@ -170,7 +170,7 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
|
||||
],
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"What's happening in these {len(audio_assets)} audio clips?",
|
||||
"text": f"What's happening in these {len(audio_assets)} audio clips?", # noqa: E501
|
||||
},
|
||||
],
|
||||
}
|
||||
|
@ -101,16 +101,11 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
|
||||
return audio_dict
|
||||
|
||||
audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
|
||||
text = f"What's happening in these {len(audio_assets)} audio clips?"
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*audio_chunks,
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"What's happening in these {len(audio_assets)} audio clips?",
|
||||
},
|
||||
],
|
||||
"content": [*audio_chunks, {"type": "text", "text": text}],
|
||||
}
|
||||
]
|
||||
|
||||
|
@ -102,8 +102,8 @@ def multi_video_multi_aspect_ratio_inputs(
|
||||
def different_patch_input_cases_internvl():
|
||||
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
|
||||
formatter = (
|
||||
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"
|
||||
) # noqa: E501
|
||||
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
|
||||
)
|
||||
single_img_prompts = [
|
||||
"<image>\nWhat's the content in the center of the image?",
|
||||
"<image>\nWhat is the season?",
|
||||
|
@ -47,7 +47,8 @@ EXAMPLE_TOOLS = [
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description": "The city to get the forecast for, e.g. 'New York'",
|
||||
"description": "The city to get the forecast for, e.g. "
|
||||
"'New York'",
|
||||
},
|
||||
"days": {
|
||||
"type": "integer",
|
||||
|
@@ -134,15 +134,15 @@ def get_attention_backend(backend_name: _Backend):
else "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
),
_Backend.FLASHINFER: "vllm.v1.attention.backends.flashinfer.FlashInferBackend",
_Backend.FLEX_ATTENTION: "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend",
_Backend.TRITON_ATTN: "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend",
_Backend.FLEX_ATTENTION: "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend", # noqa: E501
_Backend.TRITON_ATTN: "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend", # noqa: E501
_Backend.TREE_ATTN: "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend",
_Backend.XFORMERS: "vllm.v1.attention.backends.xformers.XFormersAttentionBackend",
_Backend.CUTLASS_MLA: "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend",
_Backend.XFORMERS: "vllm.v1.attention.backends.xformers.XFormersAttentionBackend", # noqa: E501
_Backend.CUTLASS_MLA: "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend", # noqa: E501
_Backend.FLASHMLA: "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend",
_Backend.FLASH_ATTN_MLA: "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend",
_Backend.FLASHINFER_MLA: "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend",
_Backend.TRITON_MLA: "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend",
_Backend.FLASH_ATTN_MLA: "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend", # noqa: E501
_Backend.FLASHINFER_MLA: "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend", # noqa: E501
_Backend.TRITON_MLA: "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend", # noqa: E501
}

if backend_name not in backend_map:
@ -104,7 +104,7 @@ async def test_single_chat_session_image_base64encoded(
|
||||
"content": [
|
||||
{
|
||||
"type": "input_image",
|
||||
"image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
|
||||
"image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", # noqa: E501
|
||||
"detail": "auto",
|
||||
},
|
||||
{"type": "input_text", "text": content_text},
|
||||
|
@ -15,8 +15,9 @@ RTOL = 0.03
|
||||
EXPECTED_VALUES = {"Qwen/Qwen3-0.6B": 0.41, "deepseek-ai/deepseek-vl2-small": 0.59}
|
||||
|
||||
SIMPLE_PROMPT = (
|
||||
"The best part about working on vLLM is that I got to meet so many people across various different organizations like UCB, Google, and Meta which means",
|
||||
) # noqa: E501
|
||||
"The best part about working on vLLM is that I got to meet so many people across "
|
||||
"various different organizations like UCB, Google, and Meta which means",
|
||||
)
|
||||
|
||||
# Get model name from environment variable
|
||||
MODEL_NAME = os.environ.get("TEST_MODEL", "Qwen/Qwen3-0.6B")
|
||||
|
@ -127,7 +127,7 @@ class RequestRunner:
|
||||
kv_role="kv_both",
|
||||
kv_connector_extra_config={
|
||||
"spec_name": "MockOffloadingSpec",
|
||||
"spec_module_path": "tests.v1.kv_connector.unit.test_offloading_connector",
|
||||
"spec_module_path": "tests.v1.kv_connector.unit.test_offloading_connector", # noqa: E501
|
||||
"block_size": offloaded_block_size,
|
||||
},
|
||||
)
|
||||
|
@ -260,15 +260,8 @@ def test_pooling_rejects_custom_logitsprocs(
gpu_memory_utilization=0.1,
)
# Require that no logitsprocs have been loaded
assert (
sum(
[
1
for _ in llm.llm_engine.model_executor.driver_worker.worker.model_runner.input_batch.logitsprocs.all
]
)
== 0
)
worker = llm.llm_engine.model_executor.driver_worker.worker
assert sum([1 for _ in worker.model_runner.input_batch.logitsprocs.all]) == 0
return

kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {}
@ -76,10 +76,14 @@ def _kv_cache_update_kernel(
static_argnames=["page_size", "num_slices_per_block"],
)
def kv_cache_update(
new_kv: jax.Array, # [total_num_token, num_combined_kv_heads, head_dim]
slices: jax.Array, # [3, slices], list of (kv_cache_start, new_kv_start, slice_len)
kv_cache: jax.Array, # [total_num_pages * page_size, num_combined_kv_heads, head_dim]
num_kv_update_slices: jax.Array, # [1]
# [total_num_token, num_combined_kv_heads, head_dim]
new_kv: jax.Array,
# [3, slices], list of (kv_cache_start, new_kv_start, slice_len)
slices: jax.Array,
# [total_num_pages * page_size, num_combined_kv_heads, head_dim]
kv_cache: jax.Array,
# [1]
num_kv_update_slices: jax.Array,
*,
page_size: int = 32,
num_slices_per_block: int = 8,
@ -834,7 +834,10 @@ class AllReduceFusedRMSNormStaticQuantFP8Pattern(BasePattern):
scale_out=None,
rms_gamma=weight,
rms_eps=self.epsilon,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant, # we don't use norm_out afterwards
# We don't use norm_out afterwards
pattern_code=(
flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant
),
scale_factor=scale,
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
)
@ -928,11 +931,14 @@ class AllReduceFusedAddRMSNormStaticQuantFP8Pattern(BasePattern):
scale_out=None,
rms_gamma=weight,
rms_eps=self.epsilon,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant, # we don't use norm_out afterwards
# We don't use norm_out afterwards
pattern_code=(
flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant
),
scale_factor=scale,
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
)
# # quant_out, rms_norm_residual
# quant_out, rms_norm_residual
return allreduce[4], allreduce[2]

pm.register_replacement(
@ -1028,7 +1034,10 @@ class AllReduceFusedRMSNormStaticQuantNVFP4Pattern(BasePattern):
scale_out=output_scale,
rms_gamma=weight,
rms_eps=self.epsilon,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant, # we don't use norm_out afterwards
# We don't use norm_out afterwards
pattern_code=(
flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant
),
scale_factor=input_global_scale,
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
)
@ -1130,7 +1139,10 @@ class AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(BasePattern):
scale_out=output_scale,
rms_gamma=weight,
rms_eps=self.epsilon,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant, # we don't use norm_out afterwards
# We don't use norm_out afterwards
pattern_code=(
flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant
),
scale_factor=input_global_scale,
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
)
@ -119,9 +119,12 @@ class TorchCompileWrapperWithCustomDispatcher:

src = depyf.decompile(new_code)
msg = (
"Assigning / modifying buffers of nn.Module during forward pass is not allowed when using cudagraph inside the compiler because it will cause silent errors. Please use eager mode or fix the code. The following code contains clues about which buffer is being modified (please search for the usage of the function `update`):\n"
+ src
) # noqa
"Assigning / modifying buffers of nn.Module during forward pass is not "
"allowed when using cudagraph inside the compiler because it will "
"cause silent errors. Please use eager mode or fix the code. The "
"following code contains clues about which buffer is being modified "
f"(please search for the usage of the function `update`):\n{src}"
)
raise RuntimeError(msg)

@contextmanager
@ -132,8 +135,9 @@ class TorchCompileWrapperWithCustomDispatcher:
variables as the original code. Therefore we can directly switch
the code object in the function and call it.

See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7 for more details.
""" # noqa
See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7
for more details.
"""
self.__class__.forward.__code__ = self.compiled_codes[index]
yield
self.__class__.forward.__code__ = self.original_code_object
@ -472,7 +472,7 @@ class VllmConfig:
self.compilation_config.cudagraph_mode.has_full_cudagraphs()
and self.model_config is not None
and not self.model_config.disable_cascade_attn
and not self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs()
and not self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs() # noqa: E501
):
logger.warning_once(
"No piecewise cudagraph for executing cascade attention."
@ -147,8 +147,9 @@ class PPLXAll2AllManager(All2AllManagerBase):

def __init__(self, cpu_group):
assert has_pplx(), (
"pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels."
) # noqa
"pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
" to install pplx_kernels."
)
super().__init__(cpu_group)

if self.internode:
@ -220,7 +221,8 @@ class DeepEPAll2AllManagerBase(All2AllManagerBase):

def __init__(self, cpu_group):
assert has_deep_ep(), (
"DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install DeepEP kernels."
"DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
" to install DeepEP kernels."
) # noqa
super().__init__(cpu_group)
self.handle_cache = Cache()
@ -471,7 +471,8 @@ class ChatCompletionRequest(OpenAIBaseModel):
top_logprobs: Optional[int] = 0
max_tokens: Optional[int] = Field(
default=None,
deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
deprecated="max_tokens is deprecated in favor of "
"the max_completion_tokens field",
)
max_completion_tokens: Optional[int] = None
n: Optional[int] = 1
@ -31,7 +31,7 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
if self.base_layer.num_added_embeddings_per_partition > 0:
# We can start adding lora weights
self.embeddings_weights = self.base_layer.weight.data[
self.base_layer.num_org_embeddings_per_partition : self.base_layer.num_org_embeddings_per_partition
self.base_layer.num_org_embeddings_per_partition : self.base_layer.num_org_embeddings_per_partition # noqa: E501
+ self.base_layer.num_added_embeddings_per_partition
]
self.embeddings_slice = (
@ -107,8 +107,8 @@ class PTPCFp8LinearMethod(Fp8LinearMethod):
layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)

assert layer.weight.data.dtype == torch.bfloat16, (
f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified."
) # noqa: E501
f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified." # noqa: E501
)
# Quantize the weights.
qweight, weight_scale = ops.scaled_fp8_quant(
layer.weight, scale=None, use_per_token_if_dynamic=True
@ -391,7 +391,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
total_shard_sizes = next(
(
sizes
for module, sizes in self.maybe_fused_weights_modules.items()
for module, sizes in self.maybe_fused_weights_modules.items() # noqa: E501
if check_match(mapped_weight_name, module)
)
)
@ -270,8 +270,8 @@ class BailingMoE(nn.Module):
) or (
self.score_function == "sigmoid" and self.correction_bias is not None
), (
"score_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)"
) # noqa: E501
"score_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)" # noqa: E501
)
else:
# default value for scoring_func
self.score_function = "softmax"
@ -825,10 +825,10 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
# Run MM-Projector
# len(num_grids) == len(num_queries_vis_abstractors) + 1
grid_idx = 0
num_grids = [
grid_idx
] # e.g. [0, 9, 18, 19, 27, 28, 36, 37, 45, 46, 54, 55, 56]
num_queries_vis_abstractors = [] # e.g. [81, 81, 81, 9, 81, 9, 81, 9, 81, 9, 81, 9]
# e.g. [0, 9, 18, 19, 27, 28, 36, 37, 45, 46, 54, 55, 56]
num_grids = [grid_idx]
# e.g. [81, 81, 81, 9, 81, 9, 81, 9, 81, 9, 81, 9]
num_queries_vis_abstractors = []
len_total_frames = video_forward_outs.shape[0]

if self.config.first_last_frames_slow:
@ -154,9 +154,10 @@ class LlamaModel(nn.Module):
str(layer_index), str(layer_index + start_layer_id)
)

quant_config.torchao_config.module_fqn_to_config = {
torchao_config = quant_config.torchao_config
torchao_config.module_fqn_to_config = {
pad_layer_name(layer): quantization
for layer, quantization in quant_config.torchao_config.module_fqn_to_config.items()
for layer, quantization in torchao_config.module_fqn_to_config.items()
}
@ -186,26 +186,26 @@ class LongCatFlashMTP(nn.Module, SupportsPP):
"model.mtp.layers.0.eh_proj.weight_scale_inv": "eh_proj.weight_scale_inv",
"model.mtp.layers.0.enorm.m.weight": "enorm.weight",
"model.mtp.layers.0.hnorm.m.weight": "hnorm.weight",
"model.mtp.layers.0.input_layernorm.weight": "model.layers.0.input_layernorm.weight",
"model.mtp.layers.0.post_attention_layernorm.weight": "model.layers.0.post_attention_layernorm.weight",
"model.mtp.layers.0.self_attn.kv_a_layernorm.weight": "model.layers.0.self_attn.kv_a_layernorm.weight",
"model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight",
"model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv",
"model.mtp.layers.0.self_attn.kv_b_proj.weight": "model.layers.0.self_attn.kv_b_proj.weight",
"model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_inv": "model.layers.0.self_attn.kv_b_proj.weight_scale_inv",
"model.mtp.layers.0.self_attn.o_proj.weight": "model.layers.0.self_attn.o_proj.weight",
"model.mtp.layers.0.self_attn.o_proj.weight_scale_inv": "model.layers.0.self_attn.o_proj.weight_scale_inv",
"model.mtp.layers.0.self_attn.q_a_layernorm.weight": "model.layers.0.self_attn.q_a_layernorm.weight",
"model.mtp.layers.0.self_attn.q_a_proj.weight": "model.layers.0.self_attn.q_a_proj.weight",
"model.mtp.layers.0.self_attn.q_a_proj.weight_scale_inv": "model.layers.0.self_attn.q_a_proj.weight_scale_inv",
"model.mtp.layers.0.self_attn.q_b_proj.weight": "model.layers.0.self_attn.q_b_proj.weight",
"model.mtp.layers.0.self_attn.q_b_proj.weight_scale_inv": "model.layers.0.self_attn.q_b_proj.weight_scale_inv",
"model.mtp.layers.0.transformer_layer.mlp.down_proj.weight": "model.layers.0.mlp.down_proj.weight",
"model.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_inv": "model.layers.0.mlp.down_proj.weight_scale_inv",
"model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight": "model.layers.0.mlp.gate_proj.weight",
"model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv": "model.layers.0.mlp.gate_proj.weight_scale_inv",
"model.mtp.layers.0.transformer_layer.mlp.up_proj.weight": "model.layers.0.mlp.up_proj.weight",
"model.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_inv": "model.layers.0.mlp.up_proj.weight_scale_inv",
"model.mtp.layers.0.input_layernorm.weight": "model.layers.0.input_layernorm.weight", # noqa: E501
"model.mtp.layers.0.post_attention_layernorm.weight": "model.layers.0.post_attention_layernorm.weight", # noqa: E501
"model.mtp.layers.0.self_attn.kv_a_layernorm.weight": "model.layers.0.self_attn.kv_a_layernorm.weight", # noqa: E501
"model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight", # noqa: E501
"model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.self_attn.kv_b_proj.weight": "model.layers.0.self_attn.kv_b_proj.weight", # noqa: E501
"model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_inv": "model.layers.0.self_attn.kv_b_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.self_attn.o_proj.weight": "model.layers.0.self_attn.o_proj.weight", # noqa: E501
"model.mtp.layers.0.self_attn.o_proj.weight_scale_inv": "model.layers.0.self_attn.o_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.self_attn.q_a_layernorm.weight": "model.layers.0.self_attn.q_a_layernorm.weight", # noqa: E501
"model.mtp.layers.0.self_attn.q_a_proj.weight": "model.layers.0.self_attn.q_a_proj.weight", # noqa: E501
"model.mtp.layers.0.self_attn.q_a_proj.weight_scale_inv": "model.layers.0.self_attn.q_a_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.self_attn.q_b_proj.weight": "model.layers.0.self_attn.q_b_proj.weight", # noqa: E501
"model.mtp.layers.0.self_attn.q_b_proj.weight_scale_inv": "model.layers.0.self_attn.q_b_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.down_proj.weight": "model.layers.0.mlp.down_proj.weight", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_inv": "model.layers.0.mlp.down_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight": "model.layers.0.mlp.gate_proj.weight", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv": "model.layers.0.mlp.gate_proj.weight_scale_inv", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.up_proj.weight": "model.layers.0.mlp.up_proj.weight", # noqa: E501
"model.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_inv": "model.layers.0.mlp.up_proj.weight_scale_inv", # noqa: E501
"model.mtp.norm.weight": "final_layernorm.weight",
}
@ -1000,8 +1000,8 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
"base_layer.": "",
},
orig_to_new_prefix={
"model.embed_tokens_extend.audio_embed.audio_projection.vision.": "embed_tokens_extend.audio_projection_for_vision.",
"model.embed_tokens_extend.audio_embed.audio_projection.speech.": "embed_tokens_extend.audio_projection.",
"model.embed_tokens_extend.audio_embed.audio_projection.vision.": "embed_tokens_extend.audio_projection_for_vision.", # noqa: E501
"model.embed_tokens_extend.audio_embed.audio_projection.speech.": "embed_tokens_extend.audio_projection.", # noqa: E501
"model.embed_tokens_extend.audio_embed.": "embed_tokens_extend.",
"model.embed_tokens_extend.image_embed.": "vision_encoder.",
},
@ -916,8 +916,9 @@ class Qwen3NextDecoderLayer(nn.Module):
)
else:
assert len(hidden_states.shape) == len(self.ffn_layer_scale.shape), (
f"shape must be the same {len(hidden_states.shape)}, {len(self.ffn_layer_scale.shape)}"
) # noqa: E501
f"shape must be the same {len(hidden_states.shape)}, "
f"{len(self.ffn_layer_scale.shape)}"
)
hidden_states = hidden_states * (
self.ffn_layer_scale.to(hidden_states.dtype) + 1
)
@ -255,8 +255,8 @@ def is_rocm_aiter_fp8bmm_enabled() -> bool:


if is_rocm_aiter_fp8bmm_enabled():
from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import ( # noqa: E501 # isort: skip
batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_triton_fp8_bmm,
from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import ( # noqa: E501
batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_triton_fp8_bmm, # noqa: E501
)

def dynamic_per_batched_tensor_quant(
@ -1284,8 +1284,10 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
actual_seq_lens_q=prefill.query_seq_lens.view(-1, 1, 1, 1),
actual_seq_lens_kv=prefill.query_seq_lens.view(-1, 1, 1, 1),
causal=True,
return_lse=True, # do not support False for now
is_cuda_graph_compatible=True, # Indicates actual_seq_lens are on GPU or CPU.
# Do not support False for now
return_lse=True,
# Indicates actual_seq_lens are on GPU or CPU.
is_cuda_graph_compatible=True,
)
if return_softmax_lse:
return output, lse
@ -1342,7 +1344,8 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
),
causal=False,
return_lse=True,
is_cuda_graph_compatible=True, # Indicates actual_seq_lens are on GPU or CPU.
# Indicates actual_seq_lens are on GPU or CPU.
is_cuda_graph_compatible=True,
)

def process_weights_after_loading(self, act_dtype: torch.dtype):
@ -872,10 +872,13 @@ def wait_for_engine_startup(
EngineHandshakeMetadata(
addresses=addresses,
parallel_config={
"data_parallel_master_ip": parallel_config.data_parallel_master_ip,
"data_parallel_master_port": parallel_config.data_parallel_master_port,
"_data_parallel_master_port_list": parallel_config._data_parallel_master_port_list,
"data_parallel_size": parallel_config.data_parallel_size,
k: getattr(parallel_config, k)
for k in (
"data_parallel_master_ip",
"data_parallel_master_port",
"_data_parallel_master_port_list",
"data_parallel_size",
)
},
)
)
@ -345,13 +345,15 @@ def report_usage_stats(

from vllm.model_executor.model_loader import get_architecture_class_name

parallel_config = vllm_config.parallel_config

usage_message.report_usage(
get_architecture_class_name(vllm_config.model_config),
usage_context,
extra_kvs={
# Common configuration
"dtype": str(vllm_config.model_config.dtype),
"tensor_parallel_size": vllm_config.parallel_config.tensor_parallel_size,
"tensor_parallel_size": parallel_config.tensor_parallel_size,
"block_size": vllm_config.cache_config.block_size,
"gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization,
"kv_cache_memory_bytes": vllm_config.cache_config.kv_cache_memory_bytes,
@ -362,7 +364,7 @@ def report_usage_stats(
"enable_lora": bool(vllm_config.lora_config),
"enable_prefix_caching": vllm_config.cache_config.enable_prefix_caching,
"enforce_eager": vllm_config.model_config.enforce_eager,
"disable_custom_all_reduce": vllm_config.parallel_config.disable_custom_all_reduce,
"disable_custom_all_reduce": parallel_config.disable_custom_all_reduce,
},
)
@ -3391,7 +3391,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
attn_metadata[ubid][layer_name] = attn_metadata_i
else:
assert type(attn_metadata) is dict
attn_metadata_i = attn_group.get_metadata_builder().build_for_cudagraph_capture(
metadata_builder = attn_group.get_metadata_builder()
attn_metadata_i = metadata_builder.build_for_cudagraph_capture(
common_attn_metadata
)
for layer_name in attn_group.layer_names: