Fix per file ruff ignores related to line length (#26262)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
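The recurring pattern in this diff: instead of silencing E501 (line too long) for whole files via the temporary per-file ignores in pyproject.toml, each offending line is either wrapped so it fits or given a line-level suppression. A minimal sketch of the three recurring fixes, with illustrative variable names (assumptions for this sketch only; the real changes live in the files that were listed under the temporary ignores):

    # 1. Wrap a long string with implicit concatenation.
    WEATHER_CITY_DESCRIPTION = (
        "The city to find the weather for, e.g. "
        "'San Francisco'"
    )

    # 2. Move a trailing explanatory comment onto its own line.
    # 2 * num_layers + 1
    EXPECTED_NUM_PIECEWISE_GRAPHS_SEEN = 5

    # 3. Keep a line that cannot reasonably be wrapped (a long URL, prompt template,
    #    or kernel tag) and suppress the rule for that line only, e.g.
    # https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174  # noqa: E501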
@@ -164,7 +164,7 @@ def invoke_main() -> None:
     )
     parser.add_argument(
         "--batched", action="store_true", help="consider time to prepare batch"
-    )  # noqa: E501
+    )
     parser.add_argument(
         "--num-iteration",
         type=int,
@@ -909,13 +909,13 @@ def create_argument_parser():
     parser.add_argument(
         "--tokenizer",
         type=str,
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",
     )
     parser.add_argument(
         "--tokenizer-mode",
         type=str,
         default="auto",
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",
     )
     parser.add_argument(
         "--num-prompts",
@@ -72,8 +72,8 @@ VLLMKernelScheduleTag: dict[
 ] = {
     **KernelScheduleTag,  # type: ignore
     **{
-        MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",
-        MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong",
-        MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative",
+        MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",  # noqa: E501
+        MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong",  # noqa: E501
+        MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative",  # noqa: E501
     },
 }
@@ -113,7 +113,7 @@ def run_e5_v(query: Query) -> ModelRequestData:
 def _get_vlm2vec_prompt_image(query: Query, image_token: str):
     if query["modality"] == "text":
         text = query["text"]
-        prompt = f"Find me an everyday image that matches the given caption: {text}"  # noqa: E501
+        prompt = f"Find me an everyday image that matches the given caption: {text}"
         image = None
     elif query["modality"] == "image":
         prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image."  # noqa: E501
@@ -203,9 +203,9 @@ class Proxy:
             async with session.post(
                 url=url, json=data, headers=headers
             ) as response:
-                if 200 <= response.status < 300 or 400 <= response.status < 500:  # noqa: E501
+                if 200 <= response.status < 300 or 400 <= response.status < 500:
                     if use_chunked:
-                        async for chunk_bytes in response.content.iter_chunked(  # noqa: E501
+                        async for chunk_bytes in response.content.iter_chunked(
                             1024
                         ):
                             yield chunk_bytes
@@ -56,52 +56,6 @@ include = ["vllm*"]
 "vllm/third_party/**" = ["ALL"]
 "vllm/version.py" = ["F401"]
 "vllm/_version.py" = ["ALL"]
-# TEMPORARY! These ignores will be fixed forward
-## Line length violations
-"csrc/cutlass_extensions/vllm_cutlass_library_extension.py" = ["E501"]
-"tests/compile/piecewise/test_simple.py" = ["E501"]
-"tests/compile/piecewise/test_toy_llama.py" = ["E501", "B023"]
-"tests/entrypoints/conftest.py" = ["E501"]
-"tests/entrypoints/openai/test_audio.py" = ["E501"]
-"tests/entrypoints/openai/test_chat.py" = ["E501"]
-"tests/entrypoints/openai/test_chat_template.py" = ["E501"]
-"tests/entrypoints/openai/test_chat_with_tool_reasoning.py" = ["E501"]
-"tests/entrypoints/openai/test_completion_with_function_calling.py" = ["E501"]
-"tests/entrypoints/openai/test_video.py" = ["E501"]
-"tests/entrypoints/openai/test_vision.py" = ["E501"]
-"tests/entrypoints/test_chat_utils.py" = ["E501"]
-"tests/kernels/moe/modular_kernel_tools/common.py" = ["E501"]
-"tests/models/language/generation/test_gemma.py" = ["E501"]
-"tests/models/language/generation/test_mistral.py" = ["E501"]
-"tests/models/multimodal/generation/test_ultravox.py" = ["E501"]
-"tests/models/multimodal/generation/test_voxtral.py" = ["E501"]
-"tests/models/multimodal/generation/vlm_utils/custom_inputs.py" = ["E501"]
-"tests/tool_use/test_tool_choice_required.py" = ["E501"]
-"tests/v1/attention/utils.py" = ["E501"]
-"tests/v1/entrypoints/openai/responses/test_image.py" = ["E501"]
-"tests/v1/kv_connector/nixl_integration/test_accuracy.py" = ["E501"]
-"tests/v1/kv_connector/unit/test_offloading_connector.py" = ["E501"]
-"tests/v1/logits_processors/test_custom_offline.py" = ["E501"]
-"vllm/attention/ops/pallas_kv_cache_update.py" = ["E501"]
-"vllm/compilation/collective_fusion.py" = ["E501"]
-"vllm/compilation/wrapper.py" = ["E501"]
-"vllm/config/vllm.py" = ["E501"]
-"vllm/distributed/device_communicators/all2all.py" = ["E501"]
-"vllm/entrypoints/openai/protocol.py" = ["E501"]
-"vllm/lora/layers/vocal_parallel_embedding.py" = ["E501"]
-"vllm/model_executor/model_loader/bitsandbytes_loader.py" = ["E501"]
-"vllm/model_executor/models/bailing_moe.py" = ["E501"]
-"vllm/model_executor/models/hyperclovax_vision.py" = ["E501"]
-"vllm/model_executor/models/llama4_eagle.py" = ["E501"]
-"vllm/model_executor/models/longcat_flash_mtp.py" = ["E501"]
-"vllm/model_executor/models/phi4mm.py" = ["E501"]
-"vllm/model_executor/models/qwen3_next.py" = ["E501"]
-"vllm/model_executor/layers/quantization/ptpc_fp8.py" = ["E501"]
-"vllm/v1/attention/backends/mla/common.py" = ["E501"]
-"vllm/v1/engine/utils.py" = ["E501"]
-"vllm/v1/utils.py" = ["E501"]
-"vllm/v1/worker/gpu_model_runner.py" = ["E501"]
-# End of temporary ignores

 [tool.ruff.lint]
 select = [
@@ -132,10 +132,14 @@ def test_simple_piecewise_compile(use_inductor):
         splitting_ops=["silly.attention"],
         use_inductor_graph_partition=False,
         use_inductor=use_inductor,
-        expected_num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
-        expected_num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
-        expected_num_backend_compilations=3,  # num_piecewise_capturable_graphs_seen
-        expected_num_cudagraph_captured=6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        # 2 * num_layers + 1
+        expected_num_piecewise_graphs_seen=5,
+        # 1 + num_layers
+        expected_num_piecewise_capturable_graphs_seen=3,
+        # num_piecewise_capturable_graphs_seen
+        expected_num_backend_compilations=3,
+        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        expected_num_cudagraph_captured=6,
     )


@@ -147,14 +151,16 @@ def test_simple_inductor_graph_partition(splitting_ops):
         pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

     _run_simple_model(
-        # inductor graph partition automatically resets splitting_ops
-        # to be an empty list
+        # Inductor graph partition automatically resets splitting_ops to an empty list
         splitting_ops=splitting_ops,
         use_inductor_graph_partition=True,
         use_inductor=True,
-        expected_num_piecewise_graphs_seen=1,  # since not splitting at fx graph level
-        expected_num_piecewise_capturable_graphs_seen=1,  # since not splitting at fx graph level
-        expected_num_backend_compilations=1,  # since not splitting at fx graph level
-        expected_num_cudagraph_captured=6,  # inductor graph partition still captures 6
-        # graph, same as fx graph partition.
+        # Since not splitting at fx graph level
+        expected_num_piecewise_graphs_seen=1,
+        # Since not splitting at fx graph level
+        expected_num_piecewise_capturable_graphs_seen=1,
+        # Since not splitting at fx graph level
+        expected_num_backend_compilations=1,
+        # Inductor graph partition still captures 6 graph, same as fx graph partition
+        expected_num_cudagraph_captured=6,
     )
@@ -367,11 +367,14 @@ def test_toy_llama(use_inductor: bool):
     kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}

     with compilation_counter.expect(
-        num_graphs_seen=1,  # one graph for the model
+        # One graph for the model
+        num_graphs_seen=1,
         num_piecewise_graphs_seen=1,
         num_piecewise_capturable_graphs_seen=1,
-        num_backend_compilations=1,  # num_piecewise_capturable_graphs_seen
-        num_cudagraph_captured=2,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        # num_piecewise_capturable_graphs_seen
+        num_backend_compilations=1,
+        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_cudagraph_captured=2,
         **kwargs,
     ):
         outputs.append(
@@ -478,9 +481,10 @@ def benchmark():
             # it is fine here, because we only use the lambda function once.
             runtime = do_bench(
                 lambda: graphs[b][0](  # noqa
-                    input_ids[:b], positions[:b]
-                )
-            )  # noqa
+                    input_ids[:b],  # noqa
+                    positions[:b],  # noqa
+                )
+            )
             piecewise_cudagraph_time[b] = runtime
         else:
             runtime = do_bench(lambda: graphs[b][0].replay())  # noqa
@@ -243,7 +243,7 @@ def test_fix_functionalization(model_class: torch.nn.Module, do_fusion: bool):
     # check if the functionalization pass is applied
     for op in model.ops_in_model(do_fusion):
         find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
-        assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None  # noqa: E501
+        assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None

     # make sure the ops were all de-functionalized
     found = dict()
@@ -565,7 +565,7 @@ def test_attention_quant_pattern(
     elif quant_key.dtype == FP4_DTYPE:
         assert attn_nodes_post[0].kwargs.get("output_block_scale") is not None, (
             "Attention should have output_block_scale after FP4 fusion"
-        )  # noqa: E501
+        )

     # Check that results are close
     torch.testing.assert_close(result_unfused, result_fused_1, atol=1e-2, rtol=1e-2)
@@ -186,7 +186,7 @@ class TestQuantModel(torch.nn.Module):
         ):
             # If fusion happens, the fused op is the one
             # we check for (de)functionalization
-            return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default]  # noqa: E501
+            return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default]
         else:
             # If no fusion, the original ops are checked
             return [
@@ -322,7 +322,7 @@ def sequence_parallelism_pass_on_test_model(
     # check if the functionalization pass is applied
     for op in model.ops_in_model():
         find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
-        assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None  # noqa: E501
+        assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None

     # make sure the ops were all de-functionalized
     found = dict()
@@ -104,7 +104,7 @@ TEXT_GENERATION_MODELS = {
     # [Decoder-only]
     # Uses Llama
     # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
-    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
+    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"),
     "baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
     "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
     "bigscience/bloomz-1b1": PPTestSettings.fast(),
@@ -138,7 +138,7 @@ TEXT_GENERATION_MODELS = {
     # Uses Llama
     # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
     "state-spaces/mamba-130m-hf": PPTestSettings.fast(),
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"),
     "mosaicml/mpt-7b": PPTestSettings.fast(),
     "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
     "allenai/OLMo-1B-hf": PPTestSettings.fast(),
@@ -151,13 +151,13 @@ TEXT_GENERATION_MODELS = {
     "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
     "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(
         multi_node_only=True, load_format="dummy"
-    ),  # noqa: E501
+    ),
     "Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
     "Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
     "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
     "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
     "bigcode/starcoder2-3b": PPTestSettings.fast(),
-    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
+    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"),
     # FIXME: Cannot load tokenizer in latest transformers version.
     # Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
     # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
@@ -83,7 +83,8 @@ def sample_complex_json_schema():
             "type": "array",
             "items": {
                 "type": "string",
-                "pattern": "^[a-z]{1,10}$",  # Combining length and pattern restrictions
+                # Combining length and pattern restrictions
+                "pattern": "^[a-z]{1,10}$",
             },
         },
     },
@@ -145,7 +145,7 @@ async def test_single_chat_session_audio_base64encoded(
                 {
                     "type": "audio_url",
                     "audio_url": {
-                        "url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
+                        "url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"  # noqa: E501
                     },
                 },
                 {"type": "text", "text": "What's happening in this audio?"},
@@ -835,17 +835,18 @@ async def test_extra_fields_allowed(client: openai.AsyncOpenAI):

 @pytest.mark.asyncio
 async def test_complex_message_content(client: openai.AsyncOpenAI):
+    content = [
+        {
+            "type": "text",
+            "text": "what is 1+1? please provide the result without any other text.",
+        }
+    ]
     resp = await client.chat.completions.create(
         model=MODEL_NAME,
         messages=[
             {
                 "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": "what is 1+1? please provide the result without any other text.",
-                    }
-                ],
+                "content": content,
             }
         ],
         temperature=0,
@@ -76,8 +76,8 @@ def test_load_chat_template():
     assert (
         template_content
         == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
-{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""
-    )  # noqa: E501
+{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""  # noqa: E501
+    )


 def test_no_load_chat_template_filelike():
@@ -45,12 +45,13 @@ TOOLS = [
             "properties": {
                 "city": {
                     "type": "string",
-                    "description": "The city to find the weather for, e.g. 'San Francisco'",
+                    "description": "The city to find the weather for, e.g. "
+                    "'San Francisco'",
                 },
                 "state": {
                     "type": "string",
-                    "description": "the two-letter abbreviation for the state that the city is"
-                    " in, e.g. 'CA' which would mean 'California'",
+                    "description": "the two-letter abbreviation for the state that "
+                    "the city is in, e.g. 'CA' which would mean 'California'",
                 },
                 "unit": {
                     "type": "string",
@@ -69,7 +70,8 @@ MESSAGES = [
     {"role": "assistant", "content": "I'm doing well! How can I help you?"},
     {
         "role": "user",
-        "content": "Can you tell me what the temperate will be in Dallas, in fahrenheit?",
+        "content": "Can you tell me what the temperate will be in Dallas, "
+        "in fahrenheit?",
     },
 ]

@@ -25,12 +25,14 @@ tools = [
             "properties": {
                 "city": {
                     "type": "string",
-                    "description": "The city to find the weather for, e.g. 'Vienna'",
+                    "description": "The city to find the weather for, e.g. "
+                    "'Vienna'",
                     "default": "Vienna",
                 },
                 "country": {
                     "type": "string",
-                    "description": "The country that the city is in, e.g. 'Austria'",
+                    "description": "The country that the city is in, e.g. "
+                    "'Austria'",
                 },
                 "unit": {
                     "type": "string",
@@ -85,12 +87,14 @@ tools = [
             "properties": {
                 "city": {
                     "type": "string",
-                    "description": "The city to get the forecast for, e.g. 'Vienna'",
+                    "description": "The city to get the forecast for, e.g. "
+                    "'Vienna'",
                     "default": "Vienna",
                 },
                 "country": {
                     "type": "string",
-                    "description": "The country that the city is in, e.g. 'Austria'",
+                    "description": "The country that the city is in, e.g. "
+                    "'Austria'",
                 },
                 "days": {
                     "type": "integer",
@@ -179,7 +179,7 @@ async def test_single_chat_session_video_base64encoded(
                 {
                     "type": "video_url",
                     "video_url": {
-                        "url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
+                        "url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"  # noqa: E501
                     },
                 },
                 {"type": "text", "text": "What's in this video?"},
@@ -238,7 +238,7 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
                 {
                     "type": "video_url",
                     "video_url": {
-                        "url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
+                        "url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"  # noqa: E501
                     },
                 },
                 {"type": "text", "text": "What's in this video?"},
@@ -233,7 +233,7 @@ async def test_single_chat_session_image_base64encoded(
                 {
                     "type": "image_url",
                     "image_url": {
-                        "url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
+                        "url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"  # noqa: E501
                     },
                 },
                 {"type": "text", "text": content_text},
@@ -300,7 +300,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
                 {
                     "type": "image_url",
                     "image_url": {
-                        "url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
+                        "url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"  # noqa: E501
                     },
                 },
                 {"type": "text", "text": "What's in this image?"},
@@ -947,7 +947,8 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
                 {"type": "image_url", "image_url": {"url": image_url}},
                 {
                     "type": "text",
-                    "text": "What's in <|image_1|> and how does it compare to the other one?",  # noqa: E501
+                    "text": "What's in <|image_1|> and how does it compare to "
+                    "the other one?",
                 },
             ],
         }
@@ -960,8 +961,8 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
     assert conversation == [
         {
            "role": "user",
-            "content": "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the "
-            "other one?",
+            "content": "<|image_2|>\nWhat's in <|image_1|> and how does it compare to "
+            "the other one?",
         }
     ]
     _assert_mm_data_is_image_input(mm_data, 2)
@@ -1364,7 +1365,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
     _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


-def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave(  # noqa: E501
+def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave(
     phi3v_model_config_mm_interleaved,
     phi3v_tokenizer,
     image_url,
@@ -1451,14 +1452,14 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
     assert conversation == [
         {
             "role": "user",
-            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
+            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
+            "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
         },
         {"role": "assistant", "content": "Some stuff."},
         {
             "role": "user",
-            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
+            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
+            "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
         },
     ]

@@ -1468,7 +1469,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
     _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])


-def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave(  # noqa: E501
+def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave(
     qwen25omni_model_config_mm_interleaved,
     qwen25omni_tokenizer,
     image_url,
@@ -1521,14 +1522,14 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
     assert conversation == [
         {
             "role": "user",
-            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
+            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
+            "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
         },
         {"role": "assistant", "content": "Some stuff."},
         {
             "role": "user",
-            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
+            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
+            "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
         },
     ]

@@ -1593,14 +1594,14 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
     assert conversation == [
         {
             "role": "user",
-            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
+            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
+            "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
         },
         {"role": "assistant", "content": "Some stuff."},
         {
             "role": "user",
-            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
+            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
+            "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
         },
     ]

@@ -1661,14 +1662,14 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
     assert conversation == [
         {
             "role": "user",
-            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
+            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
+            "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
         },
         {"role": "assistant", "content": "Some stuff."},
         {
             "role": "user",
-            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
+            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
+            "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
         },
     ]

@@ -2193,7 +2194,8 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
     assert conversation == [
         {
             "role": "user",
-            "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?",
+            "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the "
+            "audio say?",
         }
     ]
     _assert_mm_data_inputs(mm_data, {"audio": 1})
@@ -2228,7 +2230,8 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
     assert conversation == [
         {
             "role": "user",
-            "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?",
+            "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the "
+            "audio say?",
         }
     ]
     _assert_mm_data_inputs(await mm_future, {"audio": 1})
@@ -165,7 +165,7 @@ def test_env(
             # FlashMLA only supports block_size == 64
             pytest.skip("FlashMLA only supports block_size 64")
         else:
-            from vllm.v1.attention.backends.mla.flashmla import (  # noqa: E501
+            from vllm.v1.attention.backends.mla.flashmla import (
                 is_flashmla_supported,
             )

@@ -331,7 +331,8 @@ class WeightTensors:
             in_dtype=config.dtype,
             quant_dtype=config.quant_dtype,
             block_shape=config.quant_block_shape,
-            per_out_ch_quant=config.is_per_act_token_quant,  # or config.is_per_out_ch_quant
+            # or config.is_per_out_ch_quant
+            per_out_ch_quant=config.is_per_act_token_quant,
         )
         return WeightTensors(
             w1=w1, w2=w2, w1_scale=w1_scale, w2_scale=w2_scale, w1_gs=w1_gs, w2_gs=w2_gs
@@ -124,7 +124,7 @@ def make_feature_matrix(csv_file_path: str):
     results_df: Optional[pd.DataFrame] = None
     for m, k, n, e, topks, dtype, pf_type, experts_type, quant_config in tqdm(
         combinations
-    ):  # noqa: E501
+    ):
         config = Config(
             Ms=[m],
             K=k,
@@ -10,7 +10,7 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
     BatchedDeepGemmExperts,
 )
-from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (  # noqa: E501
+from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (
     BatchedTritonOrDeepGemmExperts,
 )
 from vllm.model_executor.layers.fused_moe.config import (
@@ -196,10 +196,10 @@ register_experts(

 # Disable on blackwell for now
 if has_deep_ep() and not current_platform.has_device_capability(100):
-    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
+    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
         DeepEPHTPrepareAndFinalize,
     )
-    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501
+    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
         DeepEPLLPrepareAndFinalize,
     )

@@ -233,7 +233,7 @@ if has_pplx():
     )

 if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
-    from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (  # noqa: E501
+    from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
         FlashInferExperts,
     )
     from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import (  # noqa: E501
@@ -17,10 +17,10 @@ from typing_extensions import Concatenate, ParamSpec
 from vllm.utils import get_open_port, has_deep_ep

 if has_deep_ep():
-    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
+    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
         DeepEPHTPrepareAndFinalize,
     )
-    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501
+    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
         DeepEPLLPrepareAndFinalize,
     )

@@ -30,10 +30,10 @@ from .parallel_utils import ProcessGroupInfo, parallel_launch
 from .utils import make_test_weights

 if has_deep_ep():
-    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
+    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
         DeepEPHTPrepareAndFinalize,
     )
-    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501
+    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
         DeepEPLLPrepareAndFinalize,
     )

@@ -28,10 +28,10 @@ from ...utils import multi_gpu_test
 from .parallel_utils import ProcessGroupInfo, parallel_launch

 if has_deep_ep():
-    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
+    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
         DeepEPHTPrepareAndFinalize,
     )
-    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501
+    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
         DeepEPLLPrepareAndFinalize,
     )

@@ -271,7 +271,7 @@ if __name__ == "__main__":
     parser = make_config_arg_parser(
         description=(
             "Run single prepare-finalize & fused-experts combination test"
-            "Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations "  # noqa: E501
+            "Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations "
             "--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
         )
     )
@@ -483,8 +483,8 @@ def test_mixtral_moe(
     }

     if use_rocm_aiter:
-        # The values of rtol and atol are set based on the tests in ROCM AITER package.  # noqa: E501
-        # https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174  # noqa: E501
+        # The values of rtol and atol are set based on the tests in ROCM AITER package.
+        # https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174
         torch.testing.assert_close(
             hf_states.flatten(0, 1), vllm_states, rtol=0.01, atol=100
         )
@@ -10,11 +10,11 @@ import pytest
 import torch
 from packaging import version

-from vllm.model_executor.layers.quantization.quark.quark import (  # noqa: E501
+from vllm.model_executor.layers.quantization.quark.quark import (
     QuarkLinearMethod,
     QuarkW4A4MXFP4,
 )
-from vllm.model_executor.layers.quantization.quark.quark_moe import (  # noqa: E501
+from vllm.model_executor.layers.quantization.quark.quark_moe import (
     QuarkW4A4MXFp4MoEMethod,
 )
 from vllm.platforms import current_platform
@@ -12,7 +12,7 @@ PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example

 EXPECTED_LORA_OUTPUT = [
     "SELECT count(*) FROM singer",
-    "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'",  # noqa: E501
+    "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'",
     "SELECT name , country , age FROM singer ORDER BY age",
 ]

@@ -21,10 +21,16 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
         PROMPT_TEMPLATE.format(
-            query="What is the average, minimum, and maximum age of all singers from France?"  # noqa: E501
+            query=(
+                "What is the average, minimum, and maximum "
+                "age of all singers from France?"
+            )
         ),
         PROMPT_TEMPLATE.format(
-            query="Show name, country, age for all singers ordered by age from the oldest to the youngest."  # noqa: E501
+            query=(
+                "Show name, country, age for all singers ordered "
+                "by age from the oldest to the youngest."
+            )
         ),
     ]
     sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
@@ -15,10 +15,10 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf"

 EXPECTED_LORA_OUTPUT = [
     " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ",  # noqa: E501
-    " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",  # noqa: E501
+    " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",
     " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ",  # noqa: E501
     " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ",  # noqa: E501
-    " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",  # noqa: E501
+    " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",
     " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ",  # noqa: E501
 ]

@@ -26,7 +26,7 @@ LORA_RANK = 8
 LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"]
 LORA_TEST_EXPECTED = [
     "GitHub is an open-source platform that provides a way to manage and develop software projects. It allows developers to store and manage code, collaborate on projects, and automate tasks.",  # noqa: E501
-    "I am Alice, an AI assistant developed by GitHub/Charent.",  # noqa: E501
+    "I am Alice, an AI assistant developed by GitHub/Charent.",
 ]


@@ -16,7 +16,7 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
     ) as llm:
         if model == "google/gemma-3-4b-it":
             normalizers = llm.llm.collective_rpc(
-                lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item()
+                lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item()  # noqa: E501
             )
             config = llm.llm.llm_engine.model_config.hf_config.text_config
         else:
@@ -46,12 +46,13 @@ TOOLS = [
             "properties": {
                 "city": {
                     "type": "string",
-                    "description": "The city to find the weather for, e.g. 'San Francisco'",
+                    "description": "The city to find the weather for, e.g. "
+                    "'San Francisco'",
                 },
                 "state": {
                     "type": "string",
-                    "description": "the two-letter abbreviation for the state that the city is"
-                    " in, e.g. 'CA' which would mean 'California'",
+                    "description": "the two-letter abbreviation for the state that "
+                    "the city is in, e.g. 'CA' which would mean 'California'",
                 },
                 "unit": {
                     "type": "string",
@@ -85,7 +86,8 @@ MSGS = [
     {"role": "system", "content": "You are an assistant."},
     {
         "role": "user",
-        "content": "Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors.",  # noqa
+        "content": "Could you please rewrite the below article? \n\n My English needs "
+        "improvving, maybe I make errors.",
     },
     {
         "role": "assistant",
|
|||||||
"type": "function",
|
"type": "function",
|
||||||
"function": {
|
"function": {
|
||||||
"name": "rewrite",
|
"name": "rewrite",
|
||||||
"arguments": '{"text":"My English needs improvving, maybe I make errors."}', # noqa
|
"arguments": '{"text":"My English needs improvving, maybe '
|
||||||
|
'I make errors."}',
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"role": "tool",
|
"role": "tool",
|
||||||
"content": '{"action":"rewrite","outcome":"My English needs improving, maybe I make errors."}', # noqa
|
"content": '{"action":"rewrite","outcome":"My English needs improving, maybe '
|
||||||
|
'I make errors."}',
|
||||||
"tool_call_id": "bbc5b7ede",
|
"tool_call_id": "bbc5b7ede",
|
||||||
"name": "rewrite",
|
"name": "rewrite",
|
||||||
},
|
},
|
||||||
|
@@ -130,14 +130,14 @@ VLM_TEST_SETTINGS = {
         dtype="bfloat16",
         marks=[
             pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
-        ],  # noqa: E501
+        ],
     ),
     "qwen2_5_vl": VLMTestInfo(
         models=["Qwen/Qwen2.5-VL-3B-Instruct"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
         prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
-        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",  # noqa: E501
-        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
+        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
@@ -149,8 +149,8 @@ VLM_TEST_SETTINGS = {
         models=["Qwen/Qwen2.5-Omni-3B"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
         prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
-        img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>",  # noqa: E501
-        video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>",
+        video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>",
         max_model_len=4096,
         max_num_seqs=2,
         num_logprobs=6 if current_platform.is_cpu() else 5,
@@ -181,7 +181,7 @@ VLM_TEST_SETTINGS = {
         max_model_len=16384,
         hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
             "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
-        ),  # noqa: E501
+        ),
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
         image_size_factors=[(0.25, 0.5, 1.0)],
@@ -213,7 +213,7 @@ VLM_TEST_SETTINGS = {
         models=["Qwen/Qwen2.5-VL-3B-Instruct"],
         test_type=VLMTestType.IMAGE,
         prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
-        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
@@ -237,10 +237,10 @@ VLM_TEST_SETTINGS = {
         single_image_prompts=IMAGE_ASSETS.prompts(
             {
                 "stop_sign": "<vlm_image>Please describe the image shortly.",
-                "cherry_blossom": "<vlm_image>Please infer the season with reason.",  # noqa: E501
+                "cherry_blossom": "<vlm_image>Please infer the season with reason.",
             }
         ),
-        multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",  # noqa: E501
+        multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",
         stop_str=["<|im_end|>"],
         image_size_factors=[(0.10, 0.15)],
         max_tokens=64,
@@ -252,11 +252,11 @@ VLM_TEST_SETTINGS = {
         prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
         single_image_prompts=IMAGE_ASSETS.prompts(
             {
-                "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
-                "cherry_blossom": "<image>What is the season?",  # noqa: E501
+                "stop_sign": "<image>What's the content in the center of the image?",
+                "cherry_blossom": "<image>What is the season?",
             }
         ),
-        multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
+        multi_image_prompt="<image><image>Describe the two images in detail.",
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
@@ -268,11 +268,11 @@ VLM_TEST_SETTINGS = {
         prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
         single_image_prompts=IMAGE_ASSETS.prompts(
             {
-                "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
-                "cherry_blossom": "<image>What is the season?",  # noqa: E501
+                "stop_sign": "<image>What's the content in the center of the image?",
+                "cherry_blossom": "<image>What is the season?",
             }
         ),
-        multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
+        multi_image_prompt="<image><image>Describe the two images in detail.",
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
@@ -311,14 +311,14 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         single_image_prompts=IMAGE_ASSETS.prompts(
             {
-                "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
+                "stop_sign": "<image>\nWhat's the content in the center of the image?",
                 "cherry_blossom": "<image>\nPlease infer the season with reason in details.",  # noqa: E501
             }
         ),
         multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",  # noqa: E501
         patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
         hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
-        stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"],  # noqa: E501
+        stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"],
         image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
     ),
     "fuyu": VLMTestInfo(
@ -342,7 +342,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||||
{
|
{
|
||||||
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
|
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
|
||||||
"cherry_blossom": "<start_of_image>What is the season?", # noqa: E501
|
"cherry_blossom": "<start_of_image>What is the season?",
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
|
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
|
||||||
@ -356,7 +356,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
"glm4v": VLMTestInfo(
|
"glm4v": VLMTestInfo(
|
||||||
models=["zai-org/glm-4v-9b"],
|
models=["zai-org/glm-4v-9b"],
|
||||||
test_type=VLMTestType.IMAGE,
|
test_type=VLMTestType.IMAGE,
|
||||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
|
||||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||||
{
|
{
|
||||||
"stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
|
"stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
|
||||||
@ -377,9 +377,9 @@ VLM_TEST_SETTINGS = {
|
|||||||
"glm4_1v": VLMTestInfo(
|
"glm4_1v": VLMTestInfo(
|
||||||
models=["zai-org/GLM-4.1V-9B-Thinking"],
|
models=["zai-org/GLM-4.1V-9B-Thinking"],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
|
||||||
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
|
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
|
||||||
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
|
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
|
||||||
max_model_len=2048,
|
max_model_len=2048,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
||||||
@ -410,10 +410,10 @@ VLM_TEST_SETTINGS = {
|
|||||||
"h2oai/h2ovl-mississippi-2b",
|
"h2oai/h2ovl-mississippi-2b",
|
||||||
],
|
],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>",
|
||||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||||
{
|
{
|
||||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
"stop_sign": "<image>\nWhat's the content in the center of the image?",
|
||||||
"cherry_blossom": "<image>\nWhat is the season?",
|
"cherry_blossom": "<image>\nWhat is the season?",
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
@ -444,7 +444,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||||
{
|
{
|
||||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
"stop_sign": "<image>\nWhat's the content in the center of the image?",
|
||||||
"cherry_blossom": "<image>\nWhat is the season?",
|
"cherry_blossom": "<image>\nWhat is the season?",
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
@ -529,7 +529,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
max_model_len=16384,
|
max_model_len=16384,
|
||||||
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
|
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
|
||||||
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
|
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
|
||||||
), # noqa: E501
|
),
|
||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||||
custom_test_opts=[
|
custom_test_opts=[
|
||||||
@ -583,7 +583,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
|
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
|
||||||
["<|im_end|>", "<|endoftext|>"]
|
["<|im_end|>", "<|endoftext|>"]
|
||||||
), # noqa: E501
|
),
|
||||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||||
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
|
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
|
||||||
# FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
|
# FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
|
||||||
@ -598,7 +598,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
|
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
|
||||||
["<|im_end|>", "<|endoftext|>"]
|
["<|im_end|>", "<|endoftext|>"]
|
||||||
), # noqa: E501
|
),
|
||||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||||
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
|
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
|
||||||
),
|
),
|
||||||
@ -627,7 +627,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
|
models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
||||||
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
img_idx_to_prompt=lambda idx: "<image>\n",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
dtype="half",
|
dtype="half",
|
||||||
@ -640,7 +640,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
models=["AIDC-AI/Ovis2-1B"],
|
models=["AIDC-AI/Ovis2-1B"],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
img_idx_to_prompt=lambda idx: "<image>\n",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
dtype="half",
|
dtype="half",
|
||||||
@ -652,7 +652,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
models=["AIDC-AI/Ovis2.5-2B"],
|
models=["AIDC-AI/Ovis2.5-2B"],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
img_idx_to_prompt=lambda idx: "<image>\n",
|
||||||
video_idx_to_prompt=lambda idx: "<video>\n",
|
video_idx_to_prompt=lambda idx: "<video>\n",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@ -701,8 +701,8 @@ VLM_TEST_SETTINGS = {
|
|||||||
models=["Qwen/Qwen2-VL-2B-Instruct"],
|
models=["Qwen/Qwen2-VL-2B-Instruct"],
|
||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
|
||||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
|
||||||
multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
|
multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@ -717,11 +717,11 @@ VLM_TEST_SETTINGS = {
|
|||||||
prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501
|
||||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||||
{
|
{
|
||||||
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
"stop_sign": "<image>\nWhat's the content in the center of the image?",
|
||||||
"cherry_blossom": "<image>\nWhat is the season?",
|
"cherry_blossom": "<image>\nWhat is the season?",
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
multi_image_prompt="<image>\n<image>\nDescribe the two images in short.", # noqa: E501
|
multi_image_prompt="<image>\n<image>\nDescribe the two images in short.",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
use_tokenizer_eos=True,
|
use_tokenizer_eos=True,
|
||||||
patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
|
patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
|
||||||
@ -754,8 +754,8 @@ VLM_TEST_SETTINGS = {
|
|||||||
VLMTestType.VIDEO,
|
VLMTestType.VIDEO,
|
||||||
),
|
),
|
||||||
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
|
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
|
||||||
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
|
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
@ -816,7 +816,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
auto_cls=AutoModelForImageTextToText,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
|
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
|
||||||
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
|
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
|
||||||
), # noqa: E501
|
),
|
||||||
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||||
custom_test_opts=[
|
custom_test_opts=[
|
||||||
CustomTestOptions(
|
CustomTestOptions(
|
||||||
|
@ -170,7 +170,7 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
|
|||||||
],
|
],
|
||||||
{
|
{
|
||||||
"type": "text",
|
"type": "text",
|
||||||
"text": f"What's happening in these {len(audio_assets)} audio clips?",
|
"text": f"What's happening in these {len(audio_assets)} audio clips?", # noqa: E501
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
@ -101,16 +101,11 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
|
|||||||
return audio_dict
|
return audio_dict
|
||||||
|
|
||||||
audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
|
audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
|
||||||
|
text = f"What's happening in these {len(audio_assets)} audio clips?"
|
||||||
messages = [
|
messages = [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": [
|
"content": [*audio_chunks, {"type": "text", "text": text}],
|
||||||
*audio_chunks,
|
|
||||||
{
|
|
||||||
"type": "text",
|
|
||||||
"text": f"What's happening in these {len(audio_assets)} audio clips?",
|
|
||||||
},
|
|
||||||
],
|
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -102,8 +102,8 @@ def multi_video_multi_aspect_ratio_inputs(
|
|||||||
def different_patch_input_cases_internvl():
|
def different_patch_input_cases_internvl():
|
||||||
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
|
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
|
||||||
formatter = (
|
formatter = (
|
||||||
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"
|
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
|
||||||
) # noqa: E501
|
)
|
||||||
single_img_prompts = [
|
single_img_prompts = [
|
||||||
"<image>\nWhat's the content in the center of the image?",
|
"<image>\nWhat's the content in the center of the image?",
|
||||||
"<image>\nWhat is the season?",
|
"<image>\nWhat is the season?",
|
||||||
|
@ -47,7 +47,8 @@ EXAMPLE_TOOLS = [
|
|||||||
"properties": {
|
"properties": {
|
||||||
"city": {
|
"city": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "The city to get the forecast for, e.g. 'New York'",
|
"description": "The city to get the forecast for, e.g. "
|
||||||
|
"'New York'",
|
||||||
},
|
},
|
||||||
"days": {
|
"days": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
|
@ -134,15 +134,15 @@ def get_attention_backend(backend_name: _Backend):
|
|||||||
else "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
|
else "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
|
||||||
),
|
),
|
||||||
_Backend.FLASHINFER: "vllm.v1.attention.backends.flashinfer.FlashInferBackend",
|
_Backend.FLASHINFER: "vllm.v1.attention.backends.flashinfer.FlashInferBackend",
|
||||||
_Backend.FLEX_ATTENTION: "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend",
|
_Backend.FLEX_ATTENTION: "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend", # noqa: E501
|
||||||
_Backend.TRITON_ATTN: "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend",
|
_Backend.TRITON_ATTN: "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend", # noqa: E501
|
||||||
_Backend.TREE_ATTN: "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend",
|
_Backend.TREE_ATTN: "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend",
|
||||||
_Backend.XFORMERS: "vllm.v1.attention.backends.xformers.XFormersAttentionBackend",
|
_Backend.XFORMERS: "vllm.v1.attention.backends.xformers.XFormersAttentionBackend", # noqa: E501
|
||||||
_Backend.CUTLASS_MLA: "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend",
|
_Backend.CUTLASS_MLA: "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend", # noqa: E501
|
||||||
_Backend.FLASHMLA: "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend",
|
_Backend.FLASHMLA: "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend",
|
||||||
_Backend.FLASH_ATTN_MLA: "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend",
|
_Backend.FLASH_ATTN_MLA: "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend", # noqa: E501
|
||||||
_Backend.FLASHINFER_MLA: "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend",
|
_Backend.FLASHINFER_MLA: "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend", # noqa: E501
|
||||||
_Backend.TRITON_MLA: "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend",
|
_Backend.TRITON_MLA: "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend", # noqa: E501
|
||||||
}
|
}
|
||||||
|
|
||||||
if backend_name not in backend_map:
|
if backend_name not in backend_map:
|
||||||
|
@ -104,7 +104,7 @@ async def test_single_chat_session_image_base64encoded(
|
|||||||
"content": [
|
"content": [
|
||||||
{
|
{
|
||||||
"type": "input_image",
|
"type": "input_image",
|
||||||
"image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
|
"image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", # noqa: E501
|
||||||
"detail": "auto",
|
"detail": "auto",
|
||||||
},
|
},
|
||||||
{"type": "input_text", "text": content_text},
|
{"type": "input_text", "text": content_text},
|
||||||
|
@ -15,8 +15,9 @@ RTOL = 0.03
|
|||||||
EXPECTED_VALUES = {"Qwen/Qwen3-0.6B": 0.41, "deepseek-ai/deepseek-vl2-small": 0.59}
|
EXPECTED_VALUES = {"Qwen/Qwen3-0.6B": 0.41, "deepseek-ai/deepseek-vl2-small": 0.59}
|
||||||
|
|
||||||
SIMPLE_PROMPT = (
|
SIMPLE_PROMPT = (
|
||||||
"The best part about working on vLLM is that I got to meet so many people across various different organizations like UCB, Google, and Meta which means",
|
"The best part about working on vLLM is that I got to meet so many people across "
|
||||||
) # noqa: E501
|
"various different organizations like UCB, Google, and Meta which means",
|
||||||
|
)
|
||||||
|
|
||||||
# Get model name from environment variable
|
# Get model name from environment variable
|
||||||
MODEL_NAME = os.environ.get("TEST_MODEL", "Qwen/Qwen3-0.6B")
|
MODEL_NAME = os.environ.get("TEST_MODEL", "Qwen/Qwen3-0.6B")
|
||||||
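Many of the hunks above and below apply the same fix: a long string literal is split into adjacent literals inside parentheses so every physical line fits the length limit and the # noqa: E501 marker can be dropped. A minimal, hypothetical sketch of that pattern (the names are illustrative only, not taken from the vLLM source):

# Hypothetical example; implicit string-literal concatenation keeps each
# physical line short while producing the identical string.
LONG_MESSAGE_SUPPRESSED = "This single literal runs past the line-length limit, so it previously carried a noqa comment."  # noqa: E501

LONG_MESSAGE_WRAPPED = (
    "This single literal runs past the line-length limit, "
    "so it previously carried a noqa comment."
)

# Both forms evaluate to the same string, so behavior is unchanged.
assert LONG_MESSAGE_SUPPRESSED == LONG_MESSAGE_WRAPPED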
|
@ -127,7 +127,7 @@ class RequestRunner:
|
|||||||
kv_role="kv_both",
|
kv_role="kv_both",
|
||||||
kv_connector_extra_config={
|
kv_connector_extra_config={
|
||||||
"spec_name": "MockOffloadingSpec",
|
"spec_name": "MockOffloadingSpec",
|
||||||
"spec_module_path": "tests.v1.kv_connector.unit.test_offloading_connector",
|
"spec_module_path": "tests.v1.kv_connector.unit.test_offloading_connector", # noqa: E501
|
||||||
"block_size": offloaded_block_size,
|
"block_size": offloaded_block_size,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
@ -260,15 +260,8 @@ def test_pooling_rejects_custom_logitsprocs(
|
|||||||
gpu_memory_utilization=0.1,
|
gpu_memory_utilization=0.1,
|
||||||
)
|
)
|
||||||
# Require that no logitsprocs have been loaded
|
# Require that no logitsprocs have been loaded
|
||||||
assert (
|
worker = llm.llm_engine.model_executor.driver_worker.worker
|
||||||
sum(
|
assert sum([1 for _ in worker.model_runner.input_batch.logitsprocs.all]) == 0
|
||||||
[
|
|
||||||
1
|
|
||||||
for _ in llm.llm_engine.model_executor.driver_worker.worker.model_runner.input_batch.logitsprocs.all
|
|
||||||
]
|
|
||||||
)
|
|
||||||
== 0
|
|
||||||
)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {}
|
kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {}
|
||||||
|
@ -76,10 +76,14 @@ def _kv_cache_update_kernel(
|
|||||||
static_argnames=["page_size", "num_slices_per_block"],
|
static_argnames=["page_size", "num_slices_per_block"],
|
||||||
)
|
)
|
||||||
def kv_cache_update(
|
def kv_cache_update(
|
||||||
new_kv: jax.Array, # [total_num_token, num_combined_kv_heads, head_dim]
|
# [total_num_token, num_combined_kv_heads, head_dim]
|
||||||
slices: jax.Array, # [3, slices], list of (kv_cache_start, new_kv_start, slice_len)
|
new_kv: jax.Array,
|
||||||
kv_cache: jax.Array, # [total_num_pages * page_size, num_combined_kv_heads, head_dim]
|
# [3, slices], list of (kv_cache_start, new_kv_start, slice_len)
|
||||||
num_kv_update_slices: jax.Array, # [1]
|
slices: jax.Array,
|
||||||
|
# [total_num_pages * page_size, num_combined_kv_heads, head_dim]
|
||||||
|
kv_cache: jax.Array,
|
||||||
|
# [1]
|
||||||
|
num_kv_update_slices: jax.Array,
|
||||||
*,
|
*,
|
||||||
page_size: int = 32,
|
page_size: int = 32,
|
||||||
num_slices_per_block: int = 8,
|
num_slices_per_block: int = 8,
|
||||||
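The kv_cache_update hunk above uses a different tactic: the long trailing shape comments are moved onto their own lines above each parameter, so the signature lines stay short without any suppression. A tiny sketch under assumed names (this is not the real kernel signature):

# Hypothetical signature; the shape comments sit above the parameters they
# describe instead of trailing them on the same line.
def kv_cache_update_sketch(
    # [total_num_tokens, num_combined_kv_heads, head_dim]
    new_kv,
    # [3, num_slices]: (kv_cache_start, new_kv_start, slice_len) per slice
    slices,
):
    ...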
|
@ -834,7 +834,10 @@ class AllReduceFusedRMSNormStaticQuantFP8Pattern(BasePattern):
|
|||||||
scale_out=None,
|
scale_out=None,
|
||||||
rms_gamma=weight,
|
rms_gamma=weight,
|
||||||
rms_eps=self.epsilon,
|
rms_eps=self.epsilon,
|
||||||
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant, # we don't use norm_out afterwards
|
# We don't use norm_out afterwards
|
||||||
|
pattern_code=(
|
||||||
|
flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant
|
||||||
|
),
|
||||||
scale_factor=scale,
|
scale_factor=scale,
|
||||||
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
|
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
|
||||||
)
|
)
|
||||||
@ -928,11 +931,14 @@ class AllReduceFusedAddRMSNormStaticQuantFP8Pattern(BasePattern):
|
|||||||
scale_out=None,
|
scale_out=None,
|
||||||
rms_gamma=weight,
|
rms_gamma=weight,
|
||||||
rms_eps=self.epsilon,
|
rms_eps=self.epsilon,
|
||||||
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant, # we don't use norm_out afterwards
|
# We don't use norm_out afterwards
|
||||||
|
pattern_code=(
|
||||||
|
flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant
|
||||||
|
),
|
||||||
scale_factor=scale,
|
scale_factor=scale,
|
||||||
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
|
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
|
||||||
)
|
)
|
||||||
# # quant_out, rms_norm_residual
|
# quant_out, rms_norm_residual
|
||||||
return allreduce[4], allreduce[2]
|
return allreduce[4], allreduce[2]
|
||||||
|
|
||||||
pm.register_replacement(
|
pm.register_replacement(
|
||||||
@ -1028,7 +1034,10 @@ class AllReduceFusedRMSNormStaticQuantNVFP4Pattern(BasePattern):
|
|||||||
scale_out=output_scale,
|
scale_out=output_scale,
|
||||||
rms_gamma=weight,
|
rms_gamma=weight,
|
||||||
rms_eps=self.epsilon,
|
rms_eps=self.epsilon,
|
||||||
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant, # we don't use norm_out afterwards
|
# We don't use norm_out afterwards
|
||||||
|
pattern_code=(
|
||||||
|
flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant
|
||||||
|
),
|
||||||
scale_factor=input_global_scale,
|
scale_factor=input_global_scale,
|
||||||
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
|
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
|
||||||
)
|
)
|
||||||
@ -1130,7 +1139,10 @@ class AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(BasePattern):
|
|||||||
scale_out=output_scale,
|
scale_out=output_scale,
|
||||||
rms_gamma=weight,
|
rms_gamma=weight,
|
||||||
rms_eps=self.epsilon,
|
rms_eps=self.epsilon,
|
||||||
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant, # we don't use norm_out afterwards
|
# We don't use norm_out afterwards
|
||||||
|
pattern_code=(
|
||||||
|
flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant
|
||||||
|
),
|
||||||
scale_factor=input_global_scale,
|
scale_factor=input_global_scale,
|
||||||
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
|
**self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
|
||||||
)
|
)
|
||||||
|
@ -119,9 +119,12 @@ class TorchCompileWrapperWithCustomDispatcher:
|
|||||||
|
|
||||||
src = depyf.decompile(new_code)
|
src = depyf.decompile(new_code)
|
||||||
msg = (
|
msg = (
|
||||||
"Assigning / modifying buffers of nn.Module during forward pass is not allowed when using cudagraph inside the compiler because it will cause silent errors. Please use eager mode or fix the code. The following code contains clues about which buffer is being modified (please search for the usage of the function `update`):\n"
|
"Assigning / modifying buffers of nn.Module during forward pass is not "
|
||||||
+ src
|
"allowed when using cudagraph inside the compiler because it will "
|
||||||
) # noqa
|
"cause silent errors. Please use eager mode or fix the code. The "
|
||||||
|
"following code contains clues about which buffer is being modified "
|
||||||
|
f"(please search for the usage of the function `update`):\n{src}"
|
||||||
|
)
|
||||||
raise RuntimeError(msg)
|
raise RuntimeError(msg)
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
@ -132,8 +135,9 @@ class TorchCompileWrapperWithCustomDispatcher:
|
|||||||
variables as the original code. Therefore we can directly switch
|
variables as the original code. Therefore we can directly switch
|
||||||
the code object in the function and call it.
|
the code object in the function and call it.
|
||||||
|
|
||||||
See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7 for more details.
|
See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7
|
||||||
""" # noqa
|
for more details.
|
||||||
|
"""
|
||||||
self.__class__.forward.__code__ = self.compiled_codes[index]
|
self.__class__.forward.__code__ = self.compiled_codes[index]
|
||||||
yield
|
yield
|
||||||
self.__class__.forward.__code__ = self.original_code_object
|
self.__class__.forward.__code__ = self.original_code_object
|
||||||
|
@ -472,7 +472,7 @@ class VllmConfig:
|
|||||||
self.compilation_config.cudagraph_mode.has_full_cudagraphs()
|
self.compilation_config.cudagraph_mode.has_full_cudagraphs()
|
||||||
and self.model_config is not None
|
and self.model_config is not None
|
||||||
and not self.model_config.disable_cascade_attn
|
and not self.model_config.disable_cascade_attn
|
||||||
and not self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs()
|
and not self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs() # noqa: E501
|
||||||
):
|
):
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"No piecewise cudagraph for executing cascade attention."
|
"No piecewise cudagraph for executing cascade attention."
|
||||||
|
@ -147,8 +147,9 @@ class PPLXAll2AllManager(All2AllManagerBase):
|
|||||||
|
|
||||||
def __init__(self, cpu_group):
|
def __init__(self, cpu_group):
|
||||||
assert has_pplx(), (
|
assert has_pplx(), (
|
||||||
"pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels."
|
"pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
|
||||||
) # noqa
|
" to install pplx_kernels."
|
||||||
|
)
|
||||||
super().__init__(cpu_group)
|
super().__init__(cpu_group)
|
||||||
|
|
||||||
if self.internode:
|
if self.internode:
|
||||||
@ -220,7 +221,8 @@ class DeepEPAll2AllManagerBase(All2AllManagerBase):
|
|||||||
|
|
||||||
def __init__(self, cpu_group):
|
def __init__(self, cpu_group):
|
||||||
assert has_deep_ep(), (
|
assert has_deep_ep(), (
|
||||||
"DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install DeepEP kernels."
|
"DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
|
||||||
|
" to install DeepEP kernels."
|
||||||
) # noqa
|
) # noqa
|
||||||
super().__init__(cpu_group)
|
super().__init__(cpu_group)
|
||||||
self.handle_cache = Cache()
|
self.handle_cache = Cache()
|
||||||
|
@ -471,7 +471,8 @@ class ChatCompletionRequest(OpenAIBaseModel):
|
|||||||
top_logprobs: Optional[int] = 0
|
top_logprobs: Optional[int] = 0
|
||||||
max_tokens: Optional[int] = Field(
|
max_tokens: Optional[int] = Field(
|
||||||
default=None,
|
default=None,
|
||||||
deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
|
deprecated="max_tokens is deprecated in favor of "
|
||||||
|
"the max_completion_tokens field",
|
||||||
)
|
)
|
||||||
max_completion_tokens: Optional[int] = None
|
max_completion_tokens: Optional[int] = None
|
||||||
n: Optional[int] = 1
|
n: Optional[int] = 1
|
||||||
|
@ -31,7 +31,7 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
|
|||||||
if self.base_layer.num_added_embeddings_per_partition > 0:
|
if self.base_layer.num_added_embeddings_per_partition > 0:
|
||||||
# We can start adding lora weights
|
# We can start adding lora weights
|
||||||
self.embeddings_weights = self.base_layer.weight.data[
|
self.embeddings_weights = self.base_layer.weight.data[
|
||||||
self.base_layer.num_org_embeddings_per_partition : self.base_layer.num_org_embeddings_per_partition
|
self.base_layer.num_org_embeddings_per_partition : self.base_layer.num_org_embeddings_per_partition # noqa: E501
|
||||||
+ self.base_layer.num_added_embeddings_per_partition
|
+ self.base_layer.num_added_embeddings_per_partition
|
||||||
]
|
]
|
||||||
self.embeddings_slice = (
|
self.embeddings_slice = (
|
||||||
|
@ -107,8 +107,8 @@ class PTPCFp8LinearMethod(Fp8LinearMethod):
|
|||||||
layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
|
layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
|
||||||
|
|
||||||
assert layer.weight.data.dtype == torch.bfloat16, (
|
assert layer.weight.data.dtype == torch.bfloat16, (
|
||||||
f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified."
|
f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified." # noqa: E501
|
||||||
) # noqa: E501
|
)
|
||||||
# Quantize the weights.
|
# Quantize the weights.
|
||||||
qweight, weight_scale = ops.scaled_fp8_quant(
|
qweight, weight_scale = ops.scaled_fp8_quant(
|
||||||
layer.weight, scale=None, use_per_token_if_dynamic=True
|
layer.weight, scale=None, use_per_token_if_dynamic=True
|
||||||
|
@ -391,7 +391,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
|
|||||||
total_shard_sizes = next(
|
total_shard_sizes = next(
|
||||||
(
|
(
|
||||||
sizes
|
sizes
|
||||||
for module, sizes in self.maybe_fused_weights_modules.items()
|
for module, sizes in self.maybe_fused_weights_modules.items() # noqa: E501
|
||||||
if check_match(mapped_weight_name, module)
|
if check_match(mapped_weight_name, module)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
@ -270,8 +270,8 @@ class BailingMoE(nn.Module):
|
|||||||
) or (
|
) or (
|
||||||
self.score_function == "sigmoid" and self.correction_bias is not None
|
self.score_function == "sigmoid" and self.correction_bias is not None
|
||||||
), (
|
), (
|
||||||
"score_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)"
|
"score_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)" # noqa: E501
|
||||||
) # noqa: E501
|
)
|
||||||
else:
|
else:
|
||||||
# default value for scoring_func
|
# default value for scoring_func
|
||||||
self.score_function = "softmax"
|
self.score_function = "softmax"
|
||||||
|
@ -825,10 +825,10 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
|
|||||||
# Run MM-Projector
|
# Run MM-Projector
|
||||||
# len(num_grids) == len(num_queries_vis_abstractors) + 1
|
# len(num_grids) == len(num_queries_vis_abstractors) + 1
|
||||||
grid_idx = 0
|
grid_idx = 0
|
||||||
num_grids = [
|
# e.g. [0, 9, 18, 19, 27, 28, 36, 37, 45, 46, 54, 55, 56]
|
||||||
grid_idx
|
num_grids = [grid_idx]
|
||||||
] # e.g. [0, 9, 18, 19, 27, 28, 36, 37, 45, 46, 54, 55, 56]
|
# e.g. [81, 81, 81, 9, 81, 9, 81, 9, 81, 9, 81, 9]
|
||||||
num_queries_vis_abstractors = [] # e.g. [81, 81, 81, 9, 81, 9, 81, 9, 81, 9, 81, 9]
|
num_queries_vis_abstractors = []
|
||||||
len_total_frames = video_forward_outs.shape[0]
|
len_total_frames = video_forward_outs.shape[0]
|
||||||
|
|
||||||
if self.config.first_last_frames_slow:
|
if self.config.first_last_frames_slow:
|
||||||
|
@ -154,9 +154,10 @@ class LlamaModel(nn.Module):
|
|||||||
str(layer_index), str(layer_index + start_layer_id)
|
str(layer_index), str(layer_index + start_layer_id)
|
||||||
)
|
)
|
||||||
|
|
||||||
quant_config.torchao_config.module_fqn_to_config = {
|
torchao_config = quant_config.torchao_config
|
||||||
|
torchao_config.module_fqn_to_config = {
|
||||||
pad_layer_name(layer): quantization
|
pad_layer_name(layer): quantization
|
||||||
for layer, quantization in quant_config.torchao_config.module_fqn_to_config.items()
|
for layer, quantization in torchao_config.module_fqn_to_config.items()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -186,26 +186,26 @@ class LongCatFlashMTP(nn.Module, SupportsPP):
|
|||||||
"model.mtp.layers.0.eh_proj.weight_scale_inv": "eh_proj.weight_scale_inv",
|
"model.mtp.layers.0.eh_proj.weight_scale_inv": "eh_proj.weight_scale_inv",
|
||||||
"model.mtp.layers.0.enorm.m.weight": "enorm.weight",
|
"model.mtp.layers.0.enorm.m.weight": "enorm.weight",
|
||||||
"model.mtp.layers.0.hnorm.m.weight": "hnorm.weight",
|
"model.mtp.layers.0.hnorm.m.weight": "hnorm.weight",
|
||||||
"model.mtp.layers.0.input_layernorm.weight": "model.layers.0.input_layernorm.weight",
|
"model.mtp.layers.0.input_layernorm.weight": "model.layers.0.input_layernorm.weight", # noqa: E501
|
||||||
"model.mtp.layers.0.post_attention_layernorm.weight": "model.layers.0.post_attention_layernorm.weight",
|
"model.mtp.layers.0.post_attention_layernorm.weight": "model.layers.0.post_attention_layernorm.weight", # noqa: E501
|
||||||
"model.mtp.layers.0.self_attn.kv_a_layernorm.weight": "model.layers.0.self_attn.kv_a_layernorm.weight",
|
"model.mtp.layers.0.self_attn.kv_a_layernorm.weight": "model.layers.0.self_attn.kv_a_layernorm.weight", # noqa: E501
|
||||||
"model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight",
|
"model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight", # noqa: E501
|
||||||
"model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv",
|
"model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv", # noqa: E501
|
||||||
"model.mtp.layers.0.self_attn.kv_b_proj.weight": "model.layers.0.self_attn.kv_b_proj.weight",
|
"model.mtp.layers.0.self_attn.kv_b_proj.weight": "model.layers.0.self_attn.kv_b_proj.weight", # noqa: E501
|
||||||
"model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_inv": "model.layers.0.self_attn.kv_b_proj.weight_scale_inv",
|
"model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_inv": "model.layers.0.self_attn.kv_b_proj.weight_scale_inv", # noqa: E501
|
||||||
"model.mtp.layers.0.self_attn.o_proj.weight": "model.layers.0.self_attn.o_proj.weight",
|
"model.mtp.layers.0.self_attn.o_proj.weight": "model.layers.0.self_attn.o_proj.weight", # noqa: E501
|
||||||
"model.mtp.layers.0.self_attn.o_proj.weight_scale_inv": "model.layers.0.self_attn.o_proj.weight_scale_inv",
|
"model.mtp.layers.0.self_attn.o_proj.weight_scale_inv": "model.layers.0.self_attn.o_proj.weight_scale_inv", # noqa: E501
|
||||||
"model.mtp.layers.0.self_attn.q_a_layernorm.weight": "model.layers.0.self_attn.q_a_layernorm.weight",
|
"model.mtp.layers.0.self_attn.q_a_layernorm.weight": "model.layers.0.self_attn.q_a_layernorm.weight", # noqa: E501
|
||||||
"model.mtp.layers.0.self_attn.q_a_proj.weight": "model.layers.0.self_attn.q_a_proj.weight",
|
"model.mtp.layers.0.self_attn.q_a_proj.weight": "model.layers.0.self_attn.q_a_proj.weight", # noqa: E501
|
||||||
"model.mtp.layers.0.self_attn.q_a_proj.weight_scale_inv": "model.layers.0.self_attn.q_a_proj.weight_scale_inv",
|
"model.mtp.layers.0.self_attn.q_a_proj.weight_scale_inv": "model.layers.0.self_attn.q_a_proj.weight_scale_inv", # noqa: E501
|
||||||
"model.mtp.layers.0.self_attn.q_b_proj.weight": "model.layers.0.self_attn.q_b_proj.weight",
|
"model.mtp.layers.0.self_attn.q_b_proj.weight": "model.layers.0.self_attn.q_b_proj.weight", # noqa: E501
|
||||||
"model.mtp.layers.0.self_attn.q_b_proj.weight_scale_inv": "model.layers.0.self_attn.q_b_proj.weight_scale_inv",
|
"model.mtp.layers.0.self_attn.q_b_proj.weight_scale_inv": "model.layers.0.self_attn.q_b_proj.weight_scale_inv", # noqa: E501
|
||||||
"model.mtp.layers.0.transformer_layer.mlp.down_proj.weight": "model.layers.0.mlp.down_proj.weight",
|
"model.mtp.layers.0.transformer_layer.mlp.down_proj.weight": "model.layers.0.mlp.down_proj.weight", # noqa: E501
|
||||||
"model.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_inv": "model.layers.0.mlp.down_proj.weight_scale_inv",
|
"model.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_inv": "model.layers.0.mlp.down_proj.weight_scale_inv", # noqa: E501
|
||||||
"model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight": "model.layers.0.mlp.gate_proj.weight",
|
"model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight": "model.layers.0.mlp.gate_proj.weight", # noqa: E501
|
||||||
"model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv": "model.layers.0.mlp.gate_proj.weight_scale_inv",
|
"model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv": "model.layers.0.mlp.gate_proj.weight_scale_inv", # noqa: E501
|
||||||
"model.mtp.layers.0.transformer_layer.mlp.up_proj.weight": "model.layers.0.mlp.up_proj.weight",
|
"model.mtp.layers.0.transformer_layer.mlp.up_proj.weight": "model.layers.0.mlp.up_proj.weight", # noqa: E501
|
||||||
"model.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_inv": "model.layers.0.mlp.up_proj.weight_scale_inv",
|
"model.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_inv": "model.layers.0.mlp.up_proj.weight_scale_inv", # noqa: E501
|
||||||
"model.mtp.norm.weight": "final_layernorm.weight",
|
"model.mtp.norm.weight": "final_layernorm.weight",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1000,8 +1000,8 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
|
|||||||
"base_layer.": "",
|
"base_layer.": "",
|
||||||
},
|
},
|
||||||
orig_to_new_prefix={
|
orig_to_new_prefix={
|
||||||
"model.embed_tokens_extend.audio_embed.audio_projection.vision.": "embed_tokens_extend.audio_projection_for_vision.",
|
"model.embed_tokens_extend.audio_embed.audio_projection.vision.": "embed_tokens_extend.audio_projection_for_vision.", # noqa: E501
|
||||||
"model.embed_tokens_extend.audio_embed.audio_projection.speech.": "embed_tokens_extend.audio_projection.",
|
"model.embed_tokens_extend.audio_embed.audio_projection.speech.": "embed_tokens_extend.audio_projection.", # noqa: E501
|
||||||
"model.embed_tokens_extend.audio_embed.": "embed_tokens_extend.",
|
"model.embed_tokens_extend.audio_embed.": "embed_tokens_extend.",
|
||||||
"model.embed_tokens_extend.image_embed.": "vision_encoder.",
|
"model.embed_tokens_extend.image_embed.": "vision_encoder.",
|
||||||
},
|
},
|
||||||
|
@ -916,8 +916,9 @@ class Qwen3NextDecoderLayer(nn.Module):
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
assert len(hidden_states.shape) == len(self.ffn_layer_scale.shape), (
|
assert len(hidden_states.shape) == len(self.ffn_layer_scale.shape), (
|
||||||
f"shape must be the same {len(hidden_states.shape)}, {len(self.ffn_layer_scale.shape)}"
|
f"shape must be the same {len(hidden_states.shape)}, "
|
||||||
) # noqa: E501
|
f"{len(self.ffn_layer_scale.shape)}"
|
||||||
|
)
|
||||||
hidden_states = hidden_states * (
|
hidden_states = hidden_states * (
|
||||||
self.ffn_layer_scale.to(hidden_states.dtype) + 1
|
self.ffn_layer_scale.to(hidden_states.dtype) + 1
|
||||||
)
|
)
|
||||||
|
@ -255,8 +255,8 @@ def is_rocm_aiter_fp8bmm_enabled() -> bool:
|
|||||||
|
|
||||||
|
|
||||||
if is_rocm_aiter_fp8bmm_enabled():
|
if is_rocm_aiter_fp8bmm_enabled():
|
||||||
from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import ( # noqa: E501 # isort: skip
|
from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import ( # noqa: E501
|
||||||
batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_triton_fp8_bmm,
|
batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_triton_fp8_bmm, # noqa: E501
|
||||||
)
|
)
|
||||||
|
|
||||||
def dynamic_per_batched_tensor_quant(
|
def dynamic_per_batched_tensor_quant(
|
||||||
@ -1284,8 +1284,10 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
|
|||||||
actual_seq_lens_q=prefill.query_seq_lens.view(-1, 1, 1, 1),
|
actual_seq_lens_q=prefill.query_seq_lens.view(-1, 1, 1, 1),
|
||||||
actual_seq_lens_kv=prefill.query_seq_lens.view(-1, 1, 1, 1),
|
actual_seq_lens_kv=prefill.query_seq_lens.view(-1, 1, 1, 1),
|
||||||
causal=True,
|
causal=True,
|
||||||
return_lse=True, # do not support False for now
|
# Do not support False for now
|
||||||
is_cuda_graph_compatible=True, # Indicates actual_seq_lens are on GPU or CPU.
|
return_lse=True,
|
||||||
|
# Indicates actual_seq_lens are on GPU or CPU.
|
||||||
|
is_cuda_graph_compatible=True,
|
||||||
)
|
)
|
||||||
if return_softmax_lse:
|
if return_softmax_lse:
|
||||||
return output, lse
|
return output, lse
|
||||||
@ -1342,7 +1344,8 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
|
|||||||
),
|
),
|
||||||
causal=False,
|
causal=False,
|
||||||
return_lse=True,
|
return_lse=True,
|
||||||
is_cuda_graph_compatible=True, # Indicates actual_seq_lens are on GPU or CPU.
|
# Indicates actual_seq_lens are on GPU or CPU.
|
||||||
|
is_cuda_graph_compatible=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
def process_weights_after_loading(self, act_dtype: torch.dtype):
|
def process_weights_after_loading(self, act_dtype: torch.dtype):
|
||||||
|
@ -872,10 +872,13 @@ def wait_for_engine_startup(
|
|||||||
EngineHandshakeMetadata(
|
EngineHandshakeMetadata(
|
||||||
addresses=addresses,
|
addresses=addresses,
|
||||||
parallel_config={
|
parallel_config={
|
||||||
"data_parallel_master_ip": parallel_config.data_parallel_master_ip,
|
k: getattr(parallel_config, k)
|
||||||
"data_parallel_master_port": parallel_config.data_parallel_master_port,
|
for k in (
|
||||||
"_data_parallel_master_port_list": parallel_config._data_parallel_master_port_list,
|
"data_parallel_master_ip",
|
||||||
"data_parallel_size": parallel_config.data_parallel_size,
|
"data_parallel_master_port",
|
||||||
|
"_data_parallel_master_port_list",
|
||||||
|
"data_parallel_size",
|
||||||
|
)
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
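The wait_for_engine_startup hunk above shortens the over-long dictionary entries by looking the attributes up with getattr over a tuple of field names. A self-contained sketch of the same refactor, using an assumed, simplified ParallelConfig stand-in rather than the real vLLM class:

from dataclasses import dataclass

@dataclass
class ParallelConfig:
    # Simplified stand-in for illustration only.
    data_parallel_master_ip: str = "127.0.0.1"
    data_parallel_master_port: int = 29500
    data_parallel_size: int = 1

parallel_config = ParallelConfig()

# One short comprehension replaces several long "key": value lines.
snapshot = {
    k: getattr(parallel_config, k)
    for k in (
        "data_parallel_master_ip",
        "data_parallel_master_port",
        "data_parallel_size",
    )
}
assert snapshot["data_parallel_size"] == 1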
|
@ -345,13 +345,15 @@ def report_usage_stats(
|
|||||||
|
|
||||||
from vllm.model_executor.model_loader import get_architecture_class_name
|
from vllm.model_executor.model_loader import get_architecture_class_name
|
||||||
|
|
||||||
|
parallel_config = vllm_config.parallel_config
|
||||||
|
|
||||||
usage_message.report_usage(
|
usage_message.report_usage(
|
||||||
get_architecture_class_name(vllm_config.model_config),
|
get_architecture_class_name(vllm_config.model_config),
|
||||||
usage_context,
|
usage_context,
|
||||||
extra_kvs={
|
extra_kvs={
|
||||||
# Common configuration
|
# Common configuration
|
||||||
"dtype": str(vllm_config.model_config.dtype),
|
"dtype": str(vllm_config.model_config.dtype),
|
||||||
"tensor_parallel_size": vllm_config.parallel_config.tensor_parallel_size,
|
"tensor_parallel_size": parallel_config.tensor_parallel_size,
|
||||||
"block_size": vllm_config.cache_config.block_size,
|
"block_size": vllm_config.cache_config.block_size,
|
||||||
"gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization,
|
"gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization,
|
||||||
"kv_cache_memory_bytes": vllm_config.cache_config.kv_cache_memory_bytes,
|
"kv_cache_memory_bytes": vllm_config.cache_config.kv_cache_memory_bytes,
|
||||||
@ -362,7 +364,7 @@ def report_usage_stats(
|
|||||||
"enable_lora": bool(vllm_config.lora_config),
|
"enable_lora": bool(vllm_config.lora_config),
|
||||||
"enable_prefix_caching": vllm_config.cache_config.enable_prefix_caching,
|
"enable_prefix_caching": vllm_config.cache_config.enable_prefix_caching,
|
||||||
"enforce_eager": vllm_config.model_config.enforce_eager,
|
"enforce_eager": vllm_config.model_config.enforce_eager,
|
||||||
"disable_custom_all_reduce": vllm_config.parallel_config.disable_custom_all_reduce,
|
"disable_custom_all_reduce": parallel_config.disable_custom_all_reduce,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -3391,7 +3391,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
attn_metadata[ubid][layer_name] = attn_metadata_i
|
attn_metadata[ubid][layer_name] = attn_metadata_i
|
||||||
else:
|
else:
|
||||||
assert type(attn_metadata) is dict
|
assert type(attn_metadata) is dict
|
||||||
attn_metadata_i = attn_group.get_metadata_builder().build_for_cudagraph_capture(
|
metadata_builder = attn_group.get_metadata_builder()
|
||||||
|
attn_metadata_i = metadata_builder.build_for_cudagraph_capture(
|
||||||
common_attn_metadata
|
common_attn_metadata
|
||||||
)
|
)
|
||||||
for layer_name in attn_group.layer_names:
|
for layer_name in attn_group.layer_names:
|
||||||