Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 23:03:52 +08:00)

Compare commits (8 commits): 01efc7ef78, 26b999c71a, da3fa78dc9, bbb70036cb, 89da8d9d09, 01085b134d, 66160a9943, eaca762c18
@@ -1,24 +1,22 @@
 steps:
   # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
   - label: "Build arm64 wheel - CUDA 12.9"
+    depends_on: ~
     id: build-wheel-arm64-cuda-12-9
     agents:
       queue: arm64_cpu_queue_postmerge
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
       # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build CUDA 12.8 wheel"
-    key: block-build-cu128-wheel
-
   - label: "Build wheel - CUDA 12.8"
-    depends_on: block-build-cu128-wheel
+    depends_on: ~
     id: build-wheel-cuda-12-8
     agents:
       queue: cpu_queue_postmerge

@@ -30,12 +28,8 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build CUDA 12.6 wheel"
-    key: block-build-cu126-wheel
-    depends_on: ~
-
   - label: "Build wheel - CUDA 12.6"
-    depends_on: block-build-cu126-wheel
+    depends_on: ~
    id: build-wheel-cuda-12-6
    agents:
      queue: cpu_queue_postmerge

@@ -102,8 +96,6 @@ steps:
     depends_on:
       - create-multi-arch-manifest
       - build-wheel-cuda-12-8
-      - build-wheel-cuda-12-6
-      - build-wheel-cuda-12-9
     id: annotate-release-workflow
     agents:
       queue: cpu_queue_postmerge
@@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 To download the wheel:
 \`\`\`
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`
 
 To download and upload the image:
 
 \`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
-docker tag vllm/vllm-openai vllm/vllm-openai:latest
-docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
-docker push vllm/vllm-openai:latest
-docker push vllm/vllm-openai:v${RELEASE_VERSION}
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai:latest-x86_64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker push vllm/vllm-openai:latest-aarch64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest push vllm/vllm-openai:latest
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
 EOF
@@ -43,6 +43,7 @@ void sm100_cutlass_mla_decode(
     torch::Tensor const& seq_lens,
     torch::Tensor const& page_table,
     torch::Tensor const& workspace,
+    double sm_scale,
     int64_t num_kv_splits) {
   TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_decode");
 }
@@ -196,6 +196,7 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
 
 # Flag to control whether to use pre-built vLLM wheels
 ARG VLLM_USE_PRECOMPILED=""
+ARG VLLM_MAIN_CUDA_VERSION=""
 
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/uv \

@@ -213,6 +214,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         && export SCCACHE_IDLE_TIMEOUT=0 \
         && export CMAKE_BUILD_TYPE=Release \
         && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
+        && export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
         && export VLLM_DOCKER_BUILD_CONTEXT=1 \
         && sccache --show-stats \
         && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
setup.py (6 changes)

@@ -56,8 +56,6 @@ elif (sys.platform.startswith("linux") and torch.version.cuda is None
     # fallback to cpu
     VLLM_TARGET_DEVICE = "cpu"
 
-MAIN_CUDA_VERSION = "12.8"
-
 
 def is_sccache_available() -> bool:
     return which("sccache") is not None and \

@@ -507,7 +505,7 @@ def get_vllm_version() -> str:
             version += f"{sep}precompiled"
         else:
             cuda_version = str(get_nvcc_cuda_version())
-            if cuda_version != MAIN_CUDA_VERSION:
+            if cuda_version != envs.VLLM_MAIN_CUDA_VERSION:
                 cuda_version_str = cuda_version.replace(".", "")[:3]
                 # skip this for source tarball, required for pypi
                 if "sdist" not in sys.argv:

@@ -515,7 +513,7 @@ def get_vllm_version() -> str:
     elif _is_hip():
         # Get the Rocm Version
         rocm_version = get_rocm_version() or torch.version.hip
-        if rocm_version and rocm_version != MAIN_CUDA_VERSION:
+        if rocm_version and rocm_version != envs.VLLM_MAIN_CUDA_VERSION:
             version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
     elif _is_tpu():
         version += f"{sep}tpu"
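The switch from the hard-coded MAIN_CUDA_VERSION constant to envs.VLLM_MAIN_CUDA_VERSION means the wheel's local version suffix now follows a configurable "main" CUDA version. A minimal sketch of the suffix rule, with hypothetical stand-in values for what get_vllm_version() computes:

# Sketch of the version-suffix rule; base_version and cuda_version here
# are hypothetical stand-ins, not vLLM's real computed values.
import os

main_cuda_version = os.getenv("VLLM_MAIN_CUDA_VERSION", "") or "12.8"

base_version = "0.11.0"   # hypothetical release version
cuda_version = "12.9"     # hypothetical nvcc version on the build machine

if cuda_version != main_cuda_version:
    # e.g. "12.9" -> "cu129": wheels built against a non-default CUDA
    # carry a +cuXYZ local version suffix, as in the S3 wheel paths above.
    version = f"{base_version}+cu{cuda_version.replace('.', '')[:3]}"
else:
    version = base_version  # default CUDA: no suffix

print(version)  # 0.11.0+cu129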
@@ -771,11 +771,11 @@ def test_flashinfer_cutlass_mxfp4_mxfp8_fused_moe(
     w13_ref = dequant_mxfp4_batches(
         w13_q.view(torch.uint8),
         w13_scale.view(torch.uint8).reshape(-1)).to(torch.float32).reshape(
-            num_experts, 2 * intermediate_size, hidden_size)
+            num_experts, 2 * intermediate_size, hidden_size).to(device)
     w2_ref = dequant_mxfp4_batches(
         w2_q.view(torch.uint8),
         w2_scale.view(torch.uint8).reshape(-1)).to(torch.float32).reshape(
-            num_experts, hidden_size, intermediate_size)
+            num_experts, hidden_size, intermediate_size).to(device)
 
     # Quantize activations for SM100 path and dequantize for reference
     hidden_states_q, hidden_states_sf = mxfp8_quantize(hidden_states, True, 32)
tests/models/language/pooling/test_mm_classifier_conversion.py (new file, 114 lines)

@@ -0,0 +1,114 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm.platforms import current_platform
+
+
+def test_idefics_multimodal(
+    vllm_runner,
+    monkeypatch,
+) -> None:
+    if current_platform.is_rocm():
+        # ROCm Triton FA does not currently support sliding window attention
+        # switch to use ROCm CK FA backend
+        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    with vllm_runner(model_name="HuggingFaceM4/Idefics3-8B-Llama3",
+                     runner="pooling",
+                     task="classify",
+                     convert="classify",
+                     load_format="dummy",
+                     max_model_len=512,
+                     enforce_eager=True,
+                     tensor_parallel_size=1,
+                     disable_log_stats=True,
+                     dtype="bfloat16") as vllm_model:
+        llm = vllm_model.get_llm()
+        outputs = llm.classify(prompts)
+        for output in outputs:
+            assert len(output.outputs.probs) == 2
+
+
+def update_config(config):
+    config.text_config.update({
+        "architectures": ["Gemma3ForSequenceClassification"],
+        "classifier_from_token": ["A", "B", "C", "D", "E"],
+        "method": "no_post_processing",
+        "id2label": {
+            "A": "Chair",
+            "B": "Couch",
+            "C": "Table",
+            "D": "Bed",
+            "E": "Cupboard"
+        },
+    })
+    return config
+
+
+def test_gemma_multimodal(
+    vllm_runner,
+    monkeypatch,
+) -> None:
+    if current_platform.is_rocm():
+        # ROCm Triton FA does not currently support sliding window attention
+        # switch to use ROCm CK FA backend
+        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
+
+    messages = [{
+        "role": "system",
+        "content": """
+        You are a helpful assistant. You will be given a product description
+        which may also include an image. Classify the following product into
+        one of the categories:
+
+        A = chair
+        B = couch
+        C = table
+        D = bed
+        E = cupboard
+
+        You'll answer with exactly one letter (A, B, C, D, or E)."""
+    }, {
+        "role": "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url":
+                "https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg"
+            }
+        }, {
+            "type": "text",
+            "text": "A fine 19th century piece of furniture."
+        }]
+    }]
+
+    with vllm_runner(model_name="google/gemma-3-4b-it",
+                     runner="pooling",
+                     task="classify",
+                     convert="classify",
+                     load_format="auto",
+                     hf_overrides=update_config,
+                     override_pooler_config={"pooling_type": "LAST"},
+                     max_model_len=512,
+                     enforce_eager=True,
+                     tensor_parallel_size=1,
+                     disable_log_stats=True,
+                     dtype="bfloat16") as vllm_model:
+
+        llm = vllm_model.get_llm()
+        prompts = llm.preprocess_chat(messages)
+
+        result = llm.classify(prompts)
+        assert result[0].outputs.probs[0] > 0.95
+        assert all(c < 0.05 for c in result[0].outputs.probs[1:])
@@ -703,6 +703,106 @@ class LLM:
 
         return outputs
 
+    def preprocess_chat(
+        self,
+        messages: Union[list[ChatCompletionMessageParam],
+                        list[list[ChatCompletionMessageParam]]],
+        lora_request: Optional[LoRARequest] = None,
+        chat_template: Optional[str] = None,
+        chat_template_content_format: ChatTemplateContentFormatOption = "auto",
+        add_generation_prompt: bool = True,
+        continue_final_message: bool = False,
+        tools: Optional[list[dict[str, Any]]] = None,
+        chat_template_kwargs: Optional[dict[str, Any]] = None,
+        mm_processor_kwargs: Optional[dict[str, Any]] = None,
+    ) -> list[TokensPrompt]:
+        """
+        Generate prompt for a chat conversation. The pre-processed
+        prompt can then be used as input for the other LLM methods.
+
+        Refer to `chat` for a complete description of the arguments.
+
+        Returns:
+            A list of `TokensPrompt` objects containing the tokenized
+            prompt after chat template interpolation, and the
+            pre-processed multi-modal inputs.
+        """
+        list_of_messages: list[list[ChatCompletionMessageParam]]
+
+        # Handle multi and single conversations
+        if is_list_of(messages, list):
+            # messages is list[list[...]]
+            list_of_messages = cast(list[list[ChatCompletionMessageParam]],
+                                    messages)
+        else:
+            # messages is list[...]
+            list_of_messages = [
+                cast(list[ChatCompletionMessageParam], messages)
+            ]
+
+        tokenizer = self.get_tokenizer(lora_request)
+        model_config = self.llm_engine.get_model_config()
+        resolved_content_format = resolve_chat_template_content_format(
+            chat_template,
+            tools,
+            chat_template_content_format,
+            tokenizer,
+            model_config=model_config,
+        )
+
+        _chat_template_kwargs: dict[str, Any] = dict(
+            chat_template=chat_template,
+            add_generation_prompt=add_generation_prompt,
+            continue_final_message=continue_final_message,
+            tools=tools,
+        )
+        _chat_template_kwargs.update(chat_template_kwargs or {})
+
+        prompts: list[TokensPrompt] = []
+
+        for msgs in list_of_messages:
+            # NOTE: _parse_chat_message_content_parts() currently doesn't
+            # handle mm_processor_kwargs, since there is no implementation in
+            # the chat message parsing for it.
+            conversation, mm_data, mm_uuids = parse_chat_messages(
+                msgs,
+                model_config,
+                tokenizer,
+                content_format=resolved_content_format,
+            )
+
+            if isinstance(tokenizer, MistralTokenizer):
+                prompt_token_ids = apply_mistral_chat_template(
+                    tokenizer,
+                    messages=msgs,
+                    **_chat_template_kwargs,
+                )
+            else:
+                prompt_str = apply_hf_chat_template(
+                    tokenizer=tokenizer,
+                    conversation=conversation,
+                    model_config=model_config,
+                    **_chat_template_kwargs,
+                )
+                # Special tokens are already included in chat templates so
+                # should not be added by the tokenizer in this case.
+                prompt_token_ids = tokenizer.encode(prompt_str,
+                                                    add_special_tokens=False)
+
+            prompt = TokensPrompt(prompt_token_ids=prompt_token_ids)
+
+            if mm_data is not None:
+                prompt["multi_modal_data"] = mm_data
+
+            if mm_uuids is not None:
+                prompt["multi_modal_uuids"] = mm_uuids
+
+            if mm_processor_kwargs is not None:
+                prompt["mm_processor_kwargs"] = mm_processor_kwargs
+
+            prompts.append(prompt)
+
+        return prompts
+
     def chat(
         self,
         messages: Union[list[ChatCompletionMessageParam],
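A short usage sketch of the new method, mirroring the Gemma test above: the chat messages are pre-processed once into TokensPrompt objects and can then be fed to pooling entry points such as classify(). The model name and messages are illustrative, and passing the runner/convert knobs directly to the LLM constructor is assumed to work as it does through the test runner:

# Sketch only: preprocess a chat conversation once, then reuse the
# resulting TokensPrompt objects with other LLM methods (here: classify).
from vllm import LLM

llm = LLM(model="google/gemma-3-4b-it", runner="pooling", convert="classify")

messages = [{"role": "user", "content": "Classify this product: a chair."}]

# Chat template interpolation and multi-modal preprocessing happen here.
prompts = llm.preprocess_chat(messages)

# The pre-processed prompts go straight into a pooling entry point.
results = llm.classify(prompts)
print(results[0].outputs.probs)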
@@ -769,80 +869,18 @@ class LLM:
             A list of `RequestOutput` objects containing the generated
             responses in the same order as the input messages.
         """
-        list_of_messages: list[list[ChatCompletionMessageParam]]
-
-        # Handle multi and single conversations
-        if is_list_of(messages, list):
-            # messages is list[list[...]]
-            list_of_messages = cast(list[list[ChatCompletionMessageParam]],
-                                    messages)
-        else:
-            # messages is list[...]
-            list_of_messages = [
-                cast(list[ChatCompletionMessageParam], messages)
-            ]
-
-        tokenizer = self.get_tokenizer(lora_request)
-        model_config = self.llm_engine.get_model_config()
-        resolved_content_format = resolve_chat_template_content_format(
-            chat_template,
-            tools,
-            chat_template_content_format,
-            tokenizer,
-            model_config=model_config,
-        )
-
-        _chat_template_kwargs: dict[str, Any] = dict(
+        prompts = self.preprocess_chat(
+            messages=messages,
+            lora_request=lora_request,
             chat_template=chat_template,
+            chat_template_content_format=chat_template_content_format,
             add_generation_prompt=add_generation_prompt,
             continue_final_message=continue_final_message,
             tools=tools,
+            chat_template_kwargs=chat_template_kwargs,
+            mm_processor_kwargs=mm_processor_kwargs,
         )
-        _chat_template_kwargs.update(chat_template_kwargs or {})
-
-        prompts: list[Union[TokensPrompt, TextPrompt]] = []
-
-        for msgs in list_of_messages:
-            # NOTE: _parse_chat_message_content_parts() currently doesn't
-            # handle mm_processor_kwargs, since there is no implementation in
-            # the chat message parsing for it.
-            conversation, mm_data, mm_uuids = parse_chat_messages(
-                msgs,
-                model_config,
-                tokenizer,
-                content_format=resolved_content_format,
-            )
-
-            if isinstance(tokenizer, MistralTokenizer):
-                prompt_token_ids = apply_mistral_chat_template(
-                    tokenizer,
-                    messages=msgs,
-                    **_chat_template_kwargs,
-                )
-            else:
-                prompt_str = apply_hf_chat_template(
-                    tokenizer=tokenizer,
-                    conversation=conversation,
-                    model_config=model_config,
-                    **_chat_template_kwargs,
-                )
-                # Special tokens are already included in chat templates so
-                # should not be added by the tokenizer in this case.
-                prompt_token_ids = tokenizer.encode(prompt_str,
-                                                    add_special_tokens=False)
-
-            prompt = TokensPrompt(prompt_token_ids=prompt_token_ids)
-
-            if mm_data is not None:
-                prompt["multi_modal_data"] = mm_data
-
-            if mm_uuids is not None:
-                prompt["multi_modal_uuids"] = mm_uuids
-
-            if mm_processor_kwargs is not None:
-                prompt["mm_processor_kwargs"] = mm_processor_kwargs
-
-            prompts.append(prompt)
-
         return self.generate(
             prompts,
@@ -70,6 +70,7 @@ if TYPE_CHECKING:
     VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
     VLLM_MM_INPUT_CACHE_GIB: int = 4
     VLLM_TARGET_DEVICE: str = "cuda"
+    VLLM_MAIN_CUDA_VERSION: str = "12.8"
     MAX_JOBS: Optional[str] = None
     NVCC_THREADS: Optional[str] = None
     VLLM_USE_PRECOMPILED: bool = False

@@ -246,6 +247,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_TARGET_DEVICE":
     lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(),
 
+    # Main CUDA version of vLLM, supporting [12.6, 12.8, 12.9],
+    # 12.8 is the default. This follows PyTorch but can be overridden.
+    "VLLM_MAIN_CUDA_VERSION":
+    lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() or "12.8",
+
     # Maximum number of compilation jobs to run in parallel.
     # By default this is the number of CPUs
     "MAX_JOBS":
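The environment_variables table maps each name to a zero-argument lambda, so values are read from the process environment when the attribute is accessed rather than at import time. A toy sketch of that lazy-lookup pattern (the table entry is copied from the diff above; the demo scaffolding is illustrative):

# Toy sketch of the lazy env-lookup pattern used by the table above:
# each entry is a callable, evaluated only when the value is needed.
import os
from typing import Any, Callable

environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_MAIN_CUDA_VERSION":
    lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() or "12.8",
}


def __getattr__(name: str) -> Any:
    # Module-level __getattr__ (PEP 562) resolves the lambda on access,
    # which is how vllm.envs exposes these as plain attributes.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module has no attribute {name!r}")


if __name__ == "__main__":
    os.environ["VLLM_MAIN_CUDA_VERSION"] = "12.9"
    print(environment_variables["VLLM_MAIN_CUDA_VERSION"]())  # "12.9"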
@@ -0,0 +1,146 @@
+{
+    "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
+    "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
+    "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
+    "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5},
+    "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2},
+    "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "48": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2},
+    "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "96": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
+    "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2},
+    "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2},
+    "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
+    "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "1536": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "2048": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3},
+    "3072": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4},
+    "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}
+}
@@ -0,0 +1,146 @@
+{
+    "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "2": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "48": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "96": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "512": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "1536": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "2048": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "3072": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}
+}
@@ -0,0 +1,146 @@
+{
+    "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
+    "2": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
+    "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3},
+    "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
+    "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2},
+    "48": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
+    "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2},
+    "96": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2},
+    "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
+    "512": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2},
+    "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "1536": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
+    "2048": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "3072": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}
+}
@@ -0,0 +1,146 @@
+{
+    "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
+    "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
+    "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "48": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "96": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "512": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
+    "1536": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "2048": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "3072": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}
+}
@@ -0,0 +1,146 @@
+{
+    "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
+    "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2},
+    "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2},
+    "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2},
+    "48": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
+    "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
+    "96": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2},
+    "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
+    "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2},
+    "512": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2},
+    "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
+    "1536": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "2048": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "3072": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}
+}
@@ -0,0 +1,146 @@
+{
+    "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
+    "2": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
+    "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
+    "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "48": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
+    "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "96": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "512": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
+    "1536": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
+    "2048": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
+    "3072": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
+    "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}
+}
@@ -0,0 +1,146 @@
+{
+    "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
+    "2": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
+    "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "48": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "96": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "512": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "1536": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "2048": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "3072": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2},
+    "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}
+}
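These JSON files map a token-batch size M to tuned Triton launch parameters for the fused-MoE kernels. A hedged sketch of how such a table can be consulted at runtime: vLLM's fused-MoE path picks the entry whose benchmarked key is nearest to the actual M. The file name below is a placeholder, not one of the real (unnamed here) files:

# Sketch: choose tuned launch parameters for M tokens by nearest
# benchmarked batch size. "moe_config.json" is a placeholder name.
import json


def load_config(path: str) -> dict[int, dict[str, int]]:
    with open(path) as f:
        return {int(k): v for k, v in json.load(f).items()}


def pick_config(configs: dict[int, dict[str, int]], m: int) -> dict[str, int]:
    # Nearest benchmarked M wins; exact hits fall out naturally.
    best_m = min(configs, key=lambda k: abs(k - m))
    return configs[best_m]


if __name__ == "__main__":
    configs = load_config("moe_config.json")
    print(pick_config(configs, m=100))  # likely the "96" entry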
@@ -19,10 +19,11 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import QKVCrossParallelLinear
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
-from vllm.model_executor.models.adapters import (as_embedding_model,
-                                                 as_reward_model,
-                                                 as_seq_cls_model)
-from vllm.model_executor.models.interfaces import SupportsQuant
+from vllm.model_executor.models.adapters import (
+    as_embedding_model, as_reward_model, as_seq_cls_model,
+    try_create_mm_pooling_model_cls)
+from vllm.model_executor.models.interfaces import (SupportsQuant,
+                                                   supports_multimodal)
 from vllm.utils import is_pin_memory_available
 
 logger = init_logger(__name__)

@@ -183,6 +184,15 @@ def get_model_architecture(
                     "performance may not be optimal.", arch)
 
     convert_type = model_config.convert_type
+    if convert_type != "none" and supports_multimodal(model_cls):
+        logger.debug_once("Detected conversion of Multi Modal model.")
+        converted = try_create_mm_pooling_model_cls(model_cls)
+        if converted is not None:
+            logger.debug_once("Creating wrapper class to forward pooler.")
+            return converted, arch
+        else:
+            logger.debug_once("Attempting direct conversion.")
+
     if convert_type == "none":
         pass
     elif convert_type == "embed":
@@ -1,12 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import ast
+import inspect
 from collections.abc import Iterable
 from typing import TYPE_CHECKING, Any, Optional, TypeVar, cast
 
 import torch
 import torch.nn as nn
 
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.models.config import VerifyAndUpdateConfig

@@ -129,6 +132,41 @@ def _get_pooling_model_name(orig_model_name: str, pooling_suffix: str) -> str:
     return model_name + pooling_suffix
 
 
+def try_create_mm_pooling_model_cls(orig_cls: _T) -> _T:
+
+    class CallVisitor(ast.NodeVisitor):
+
+        def __init__(self):
+            self.calls = []
+
+        def visit_Call(self, node):
+            if isinstance(node.func, ast.Name):
+                self.calls.append(node.func.id)
+            self.generic_visit(node)
+
+    visitor = CallVisitor()
+    visitor.visit(ast.parse(inspect.getsource(orig_cls)))
+    if "init_vllm_registered_model" not in visitor.calls:
+        return None
+
+    class ModelForPooling(orig_cls, VllmModelForPooling):
+
+        is_pooling_model = True
+
+        def __init__(
+            self,
+            *,
+            vllm_config: "VllmConfig",
+            prefix: str = "",
+            **kwargs: Any,
+        ) -> None:
+            super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs)
+
+            self.pooler = self.get_language_model().pooler
+
+    return ModelForPooling  # type: ignore
+
+
 def _create_pooling_model_cls(orig_cls: _T) -> _T:
     # Lazy import
     from .utils import AutoWeightsLoader, WeightsMapper
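The detection in try_create_mm_pooling_model_cls works purely on source text: it parses the class's source with ast, records every plain-name call, and checks whether init_vllm_registered_model appears. A self-contained toy run of the same visitor pattern (the Demo class is illustrative):

# Toy demonstration of the CallVisitor pattern above: collect the names
# of all plain-name calls appearing in a class's source. Save to a file
# and run it; inspect.getsource needs source on disk.
import ast
import inspect


class Demo:

    def build(self):
        # Never called, so the unresolved name is harmless here.
        return init_vllm_registered_model()  # noqa: F821


class CallVisitor(ast.NodeVisitor):

    def __init__(self):
        self.calls = []

    def visit_Call(self, node):
        if isinstance(node.func, ast.Name):
            self.calls.append(node.func.id)
        self.generic_visit(node)


visitor = CallVisitor()
visitor.visit(ast.parse(inspect.getsource(Demo)))
print("init_vllm_registered_model" in visitor.calls)  # True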
@@ -399,6 +437,7 @@ def load_weights_using_from_2_way_softmax(
     from vllm.model_executor.models.utils import AutoWeightsLoader
 
     model_config = model.vllm_config.model_config
+
     tokens = getattr(model.config, "classifier_from_token", [])
     tokens = cast(list[int], tokens)
     assert len(tokens) == 2

@@ -406,9 +445,10 @@ def load_weights_using_from_2_way_softmax(
     if model.config.tie_word_embeddings:
         model.lm_head = model.model.embed_tokens
     else:
+        quant_config = model.vllm_config.quant_config
         model.lm_head = ParallelLMHead(model.config.vocab_size,
                                        model.config.hidden_size,
-                                       quant_config=model.quant_config)
+                                       quant_config=quant_config)
 
     loader = AutoWeightsLoader(model)
     loaded_weights = loader.load_weights(weights)

@@ -452,9 +492,10 @@ def load_weights_no_post_processing(model,
     if model.config.tie_word_embeddings:
         model.lm_head = model.model.embed_tokens
     else:
+        quant_config = model.vllm_config.quant_config
         model.lm_head = ParallelLMHead(model.config.vocab_size,
                                        model.config.hidden_size,
-                                       quant_config=model.quant_config)
+                                       quant_config=quant_config)
 
     loader = AutoWeightsLoader(model)
     loaded_weights = loader.load_weights(weights)
@@ -512,7 +512,11 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
             architectures=["Gemma3ForCausalLM"],
         )
         logit_scale = getattr(config, "logit_scale", 1.0)
-        self.language_model.logits_processor.scale *= logit_scale
+
+        if hasattr(self.language_model, "logits_processor"):
+            # The logits processor can be unset if we're using
+            # automatic conversion to pooling model.
+            self.language_model.logits_processor.scale *= logit_scale
 
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
@@ -170,8 +170,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         return quant_config
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        # NOTE: hidden_states can have either 1D or 2D shape.
-        orig_shape = hidden_states.shape
+        assert hidden_states.dim(
+        ) <= 2, "Qwen3MoeSparseMoeBlock only supports 1D or 2D inputs"
+        is_input_1d = hidden_states.dim() == 1
         hidden_dim = hidden_states.shape[-1]
         hidden_states = hidden_states.view(-1, hidden_dim)
 

@@ -180,7 +181,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         final_hidden_states = self.experts(hidden_states=hidden_states,
                                            router_logits=router_logits)
 
-        return final_hidden_states.view(orig_shape)
+        # return to 1d if input is 1d
+        return final_hidden_states.squeeze(0) if is_input_1d else \
+            final_hidden_states
 
 
 class Qwen3MoeAttention(nn.Module):
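The returned shape now mirrors the input rank instead of relying on a saved orig_shape. A tiny sketch of the contract: a 1D (hidden,) input is flattened to (1, hidden) internally and squeezed back on return (the stand-in below skips the actual expert mixing):

import torch

# Shape round-trip used by the new forward(): 1D in -> 2D internally -> 1D out.
hidden = 8
x = torch.randn(hidden)          # 1D input, shape (8,)
is_input_1d = x.dim() == 1

h = x.view(-1, hidden)           # internal 2D view, shape (1, 8)
out = h                          # stand-in for the expert-mixing result

out = out.squeeze(0) if is_input_1d else out
assert out.shape == x.shape      # 1D in, 1D out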
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Inference-only Qwen3Next model."""
 from collections.abc import Iterable
+from itertools import islice
 from typing import Optional
 
 import torch

@@ -917,8 +918,11 @@ class Qwen3NextModel(nn.Module):
             make_empty_intermediate_tensors_factory(
                 ["hidden_states", "residual"], config.hidden_size))
 
-        self.norm = Qwen3NextRMSNorm(config.hidden_size,
-                                     eps=config.rms_norm_eps)
+        if get_pp_group().is_last_rank:
+            self.norm = Qwen3NextRMSNorm(config.hidden_size,
+                                         eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)

@@ -941,7 +945,7 @@ class Qwen3NextModel(nn.Module):
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
 
-        for layer in self.layers:
+        for layer in islice(self.layers, self.start_layer, self.end_layer):
             hidden_states, residual = layer(
                 positions=positions,
                 hidden_states=hidden_states,
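With pipeline parallelism, each rank only materializes its own slice of the layer stack, so iterating self.layers in full would hit placeholder layers on non-owning ranks; islice restricts the loop to the rank's [start_layer, end_layer) window. A minimal sketch with hypothetical bounds:

from itertools import islice

# Hypothetical 8-layer stack split across PP ranks; this rank owns [4, 8).
layers = [f"layer_{i}" for i in range(8)]
start_layer, end_layer = 4, 8  # hypothetical per-rank bounds

for layer in islice(layers, start_layer, end_layer):
    print(layer)  # layer_4 ... layer_7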
@@ -209,7 +209,8 @@ class GDNAttentionMetadataBuilder(
 
         # prepare tensors for cudagraph
         if (self.use_full_cuda_graph and num_prefills == 0 and num_decodes == 0
-                and num_spec_decodes <= self.decode_cudagraph_max_bs):
+                and num_spec_decodes <= self.decode_cudagraph_max_bs
+                and m.num_actual_tokens <= self.decode_cudagraph_max_bs):
             num_total_tokens = self.vllm_config.pad_for_cudagraph(
                 m.num_actual_tokens)
             batch_size = num_total_tokens // (self.num_spec + 1)