Mirror of https://github.com/vllm-project/vllm.git (synced 2025-11-01 15:24:47 +08:00)

Compare commits: amd_dev ... revert-276 (131 commits)
Commits in this range (SHA1):
f68f7ee030, 141e6a0505, 130aa8cbcf, e3d8186666, f5710ef02a, a8c02fb5bf, 02af36df36, e88bdd60d9, 05e034f085, 936643a868,
b186149e8e, 2abbd351ef, 446912d1cb, a00d6254e9, 05181cc57f, 259504e147, 0484b64248, f58d9b6404, 44b5ce956d, 7a865f2325,
2fa90bda27, 0291fbf65c, b46e4a06f1, d34f5fe939, bdb01a38fe, 5b3c35a68e, 61fbfe5274, 255e34ca50, a8d2e326ec, 53a56e658b,
69f064062b, 921e78f4bb, 6ebffafbb6, 3b96f85c36, 23ad820553, 5d3be3ba4c, 4f882be4a0, 9273754222, f4e8154076, a663f6ae64,
a4fc21895e, a3e8611da5, 7c2bdb83dc, 9932ed6a83, 2d631d28c6, b368382964, a806c14cc7, 181bf5bbde, cbd5e07a51, 63b22e0dbb,
5980604c44, 361a7463d3, 720af6ab79, 55cba4a05c, c7abff2990, 71b1c8b667, 8fb7b2fab9, be7b55a83d, 315b860abe, 87c41c26ad,
65d2cf9511, d63cd9ff10, 66a168a197, a99564ac5b, 4c5f632165, b853540388, 56ed7609a9, 29c9cb8007, 83f478bb19, 269c4db0a4,
52efc34ebf, d95d0f4b98, 0402428200, 17af6aa0da, fc168c33f3, acc78aeb88, 0f67d4d962, 7e1d697b56, 699d62e6cf, cd390b609d,
2080b05099, 6454afec90, 41a62564a7, 284cc92275, 435be10db9, b7030d962b, 3567816932, e0ef8a2920, 42efe609ba, 88d3141ec6,
09a6a49eaf, 074475541a, d4c574c39f, c528b9006a, 85fee74b33, 8dbe0c527f, 5cc6bddb6e, 1f9460c4c1, 70022ffc00, f417746ad7,
0552cfb195, 51dd14ac2b, dbfbf9f324, ca76486a16, a9f55dc588, 81d5bb765a, 0825197bee, 9ef3d5b875, 295c7f0267, 3fa2c12185,
fe2016de2d, 237cf6d32a, faee3ccdc2, 570c3e1cd4, 3a4255c7c4, 61089465a6, 88afa11010, d00ce29d89, 3b7bdf983b, 50b788a17a,
fc059c7061, bfb240cc49, e255d92990, 3729ed00ba, 6644796bf4, ff93cc8c84, 243ed7d32e, 7e0941055f, 6738e4a093, 2566dca2a9,
b4fda58a2d
@@ -7,6 +7,7 @@ from importlib import util

import pandas as pd

pd.options.display.float_format = "{:.2f}".format
plotly_found = util.find_spec("plotly.express") is not None

@@ -109,7 +110,10 @@ def compare_data_columns(
    if len(compare_frames) >= 2:
        base = compare_frames[0]
        current = compare_frames[-1]
        ratio = current / base
        if "P99" in data_column or "Median" in data_column:
            ratio = base / current  # for latency
        else:
            ratio = current / base
        ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
        ratio.name = f"Ratio 1 vs {len(compare_frames)}"
        frames.append(ratio)
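(Aside, not part of the diff: a minimal pandas sketch of the masking idiom above, with made-up numbers.)

```python
import pandas as pd

base = pd.Series([10.0, 0.0, 5.0])     # baseline metric per configuration
current = pd.Series([12.0, 3.0, 5.0])  # candidate metric per configuration

# Division by a zero baseline would produce inf; mask() turns those cells into NaN.
ratio = (current / base).mask(base == 0)
ratio.name = "Ratio 1 vs 2"
print(ratio.tolist())  # [1.2, nan, 1.0]
```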
@@ -199,6 +203,71 @@ def split_json_by_tp_pp(
    return saved_paths


def _add_limit_line(fig, y_value, label):
    # Visible dashed line + annotation
    fig.add_hline(
        y=y_value,
        line_dash="dash",
        line_color="red" if "ttft" in label.lower() else "blue",
        annotation_text=f"{label}: {y_value} ms",
        annotation_position="top left",
    )
    # Optional: add a legend item (as a transparent helper trace)
    if plot and plotly_found:
        import plotly.graph_objects as go

        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="lines",
                line=dict(
                    dash="dash", color="red" if "ttft" in label.lower() else "blue"
                ),
                name=f"{label}",
            )
        )
def _find_concurrency_col(df: pd.DataFrame) -> str:
    for c in [
        "# of max concurrency.",
        "# of max concurrency",
        "Max Concurrency",
        "max_concurrency",
        "Concurrency",
    ]:
        if c in df.columns:
            return c
    # Fallback: guess an integer-like column (harmless if unused)
    for c in df.columns:
        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
            return c
    return "# of max concurrency."


def _highlight_threshold(
    df: pd.DataFrame, threshold: float
) -> "pd.io.formats.style.Styler":
    """Highlight numeric per-configuration columns with value <= threshold."""
    conc_col = _find_concurrency_col(df)
    key_cols = [
        c
        for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
        if c in df.columns
    ]
    conf_cols = [
        c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
    ]
    conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
    return df.style.map(
        lambda v: "background-color:#e6ffe6;font-weight:bold;"
        if pd.notna(v) and v <= threshold
        else "",
        subset=conf_cols,
    )
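(Aside, not part of the diff: a hypothetical usage sketch of the `_highlight_threshold` helper above; the column names and values below are invented for illustration.)

```python
import pandas as pd

# Toy comparison table: key columns plus two per-configuration metric columns.
df = pd.DataFrame(
    {
        "Model": ["llama-8B", "llama-8B"],
        "# of max concurrency.": [32, 64],
        "cfg_tp1": [2100.0, 3500.0],  # hypothetical P99 TTFT (ms) per config
        "cfg_tp2": [1800.0, 2900.0],
    }
)
# Cells at or below the limit get highlighted; key columns are left untouched.
styler = _highlight_threshold(df, threshold=3000.0).format(
    {c: "{:.2f}" for c in df.select_dtypes("number").columns}, na_rep="—"
)
html = styler.to_html(table_attributes='border="1" class="dataframe"')
```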
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(

@@ -220,6 +289,26 @@ if __name__ == "__main__":
        default="# of max concurrency.",
        help="column name to use as X Axis in comparison graph",
    )
    parser.add_argument(
        "-l",
        "--latency",
        type=str,
        default="p99",
        help="take median|p99 for latency like TTFT/TPOT",
    )
    parser.add_argument(
        "--ttft-max-ms",
        type=float,
        default=3000.0,
        help="Reference limit for TTFT plots (ms)",
    )
    parser.add_argument(
        "--tpot-max-ms",
        type=float,
        default=100.0,
        help="Reference limit for TPOT plots (ms)",
    )

    args = parser.parse_args()

    drop_column = "P99"

@@ -234,12 +323,22 @@ if __name__ == "__main__":
        "# of max concurrency.",
        "qps",
    ]
    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
    html_msgs_for_data_cols = [
        "Compare Output Tokens /n",
        "Median TTFT /n",
        "Median TPOT /n",
    ]

    if "median" in args.latency:
        data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
        html_msgs_for_data_cols = [
            "Compare Output Tokens /n",
            "Median TTFT /n",
            "Median TPOT /n",
        ]
        drop_column = "P99"
    elif "p99" in args.latency:
        data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
        html_msgs_for_data_cols = [
            "Compare Output Tokens /n",
            "P99 TTFT /n",
            "P99 TPOT /n",
        ]

    if len(args.file) == 1:
        files = split_json_by_tp_pp(args.file[0], output_root="splits")
@@ -275,33 +374,83 @@ if __name__ == "__main__":
                f"Expected subset: {filtered_info_cols}, "
                f"but DataFrame has: {list(output_df.columns)}"
            )
        output_df_sorted = output_df.sort_values(by=existing_group_cols)
        # output_df_sorted = output_df.sort_values(by=existing_group_cols)
        output_df_sorted = output_df.sort_values(by=args.xaxis)
        output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
        for name, group in output_groups:
            html = group.to_html()
            group_name = (
                ",".join(map(str, name)).replace(",", "_").replace("/", "-")
            )
            group_html_name = "perf_comparison_" + group_name + ".html"

            metric_name = str(data_cols_to_compare[i]).lower()
            if "tok/s" in metric_name:
                html = group.to_html()
            elif "ttft" in metric_name:
                styler = _highlight_threshold(group, args.ttft_max_ms).format(
                    {c: "{:.2f}" for c in group.select_dtypes("number").columns},
                    na_rep="—",
                )
                html = styler.to_html(
                    table_attributes='border="1" class="dataframe"'
                )
            elif (
                "tpot" in metric_name
                or "median" in metric_name
                or "p99" in metric_name
            ):
                styler = _highlight_threshold(group, args.tpot_max_ms).format(
                    {c: "{:.2f}" for c in group.select_dtypes("number").columns},
                    na_rep="—",
                )
                html = styler.to_html(
                    table_attributes='border="1" class="dataframe"'
                )

            text_file.write(html_msgs_for_data_cols[i])
            text_file.write(html)
            with open(group_html_name, "a+") as sub_text_file:
                sub_text_file.write(html_msgs_for_data_cols[i])
                sub_text_file.write(html)

                if plot and plotly_found:
                    import plotly.express as px
                if plot and plotly_found:
                    import plotly.express as px

                    df = group[raw_data_cols]
                    df_sorted = df.sort_values(by=info_cols[y_axis_index])
                    # Melt DataFrame for plotting
                    df_melted = df_sorted.melt(
                        id_vars=info_cols[y_axis_index],
                        var_name="Configuration",
                        value_name=data_cols_to_compare[i],
                    )
                    title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
                    # Create Plotly line chart
                    fig = px.line(
                        df_melted,
                        x=info_cols[y_axis_index],
                        y=data_cols_to_compare[i],
                        color="Configuration",
                        title=title,
                        markers=True,
                    )
                    # Export to HTML
                    text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
                    df = group[raw_data_cols]
                    df_sorted = df.sort_values(by=info_cols[y_axis_index])
                    # Melt DataFrame for plotting
                    df_melted = df_sorted.melt(
                        id_vars=info_cols[y_axis_index],
                        var_name="Configuration",
                        value_name=data_cols_to_compare[i],
                    )
                    title = (
                        data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
                    )
                    # Create Plotly line chart
                    fig = px.line(
                        df_melted,
                        x=info_cols[y_axis_index],
                        y=data_cols_to_compare[i],
                        color="Configuration",
                        title=title,
                        markers=True,
                    )

                    # ---- Add threshold lines based on metric name ----
                    if "ttft" in metric_name:
                        _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
                    elif (
                        "tpot" in metric_name
                        or "median" in metric_name
                        or "p99" in metric_name
                    ):
                        _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")

                    # Export to HTML
                    text_file.write(
                        fig.to_html(full_html=True, include_plotlyjs="cdn")
                    )
                    sub_text_file.write(
                        fig.to_html(full_html=True, include_plotlyjs="cdn")
                    )
@@ -63,9 +63,11 @@ serving_column_mapping = {
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
    "std_ttft_ms": "STD TTFT (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "median_tpot_ms": "Median",
    "p99_tpot_ms": "P99",
    "std_tpot_ms": "STD TPOT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "p99_itl_ms": "P99 ITL (ms)",

@@ -368,7 +370,7 @@ if __name__ == "__main__":
    # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
    # we want to turn it into "8xGPUTYPE"
    df["GPU"] = df["GPU"].apply(
        lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
        lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
    )

    # get markdown tables
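(Aside, not part of the diff: a quick check of the GPU-string normalization above; the GPU name is a made-up example.)

```python
gpu_col = "\n".join(["MI300X"] * 8)  # "MI300X\nMI300X\n..." as reported by the runner
# Both lambda variants collapse the newline-separated list to "<count>x<type>".
print("{}x{}".format(len(gpu_col.split("\n")), gpu_col.split("\n")[0]))  # 8xMI300X
```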
@@ -471,6 +471,11 @@ main() {
  mkdir -p $RESULTS_FOLDER
  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  # dump vllm info via vllm collect-env
  env_output=$(vllm collect-env)

  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"

  # benchmarking
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
@@ -1,28 +1,24 @@
[
    {
        "test_name": "latency_llama8B_tp1",
        "test_name": "latency_llama8B_tp2",
        "environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_SGL_KERNEL": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    },
    {
        "test_name": "latency_llama8B_tp4",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "tensor_parallel_size": 2,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "num_iters_warmup": 5,
            "num_iters": 15
        }
@ -95,6 +95,38 @@
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
@ -233,6 +265,41 @@
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
@ -365,6 +432,38 @@
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp4_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
@ -503,6 +602,41 @@
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp4_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
@ -638,6 +772,39 @@
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp4_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
@ -780,6 +947,42 @@
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -28,13 +28,13 @@
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -60,13 +60,13 @@
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_sharegpt",
|
||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -76,39 +76,7 @@
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_1024_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
@ -124,16 +92,16 @@
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 1024,
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 100
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_pp6_random_1024_128",
|
||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -143,7 +111,7 @@
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"pipeline_parallel_size": 6,
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
@ -159,10 +127,150 @@
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 1024,
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 100
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_2048",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_2048_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
@@ -1,29 +1,24 @@
[
    {
        "test_name": "throughput_llama8B_tp1",
        "test_name": "throughput_llama8B_tp2",
        "environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_SGL_KERNEL": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    },
    {
        "test_name": "throughput_llama8B_tp4",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "tensor_parallel_size": 2,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
@@ -22,7 +22,7 @@ steps:
    agents:
      queue: arm64_cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile.cpu ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
@@ -50,7 +50,7 @@ steps:

- label: Async Engine, Inputs, Utils, Worker Test # 36min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:

@@ -395,7 +395,9 @@ steps:
  - python3 offline_inference/basic/embed.py
  - python3 offline_inference/basic/score.py
  - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
  # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
  #- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048

- label: Platform Tests (CUDA) # 4min
  timeout_in_minutes: 15

@@ -436,7 +438,11 @@ steps:
      --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
      --ignore=lora/test_chatglm3_tp.py \
      --ignore=lora/test_llama_tp.py \
      --ignore=lora/test_llm_with_multi_loras.py
      --ignore=lora/test_llm_with_multi_loras.py \
      --ignore=lora/test_olmoe_tp.py \
      --ignore=lora/test_deepseekv2_tp.py \
      --ignore=lora/test_gptoss.py \
      --ignore=lora/test_qwen3moe_tp.py
  parallelism: 4

- label: PyTorch Compilation Unit Tests # 15min

@@ -555,7 +561,7 @@ steps:

- label: Model Executor Test # 23min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:

@@ -783,8 +789,10 @@ steps:
  - vllm/
  - tests/models/language/generation
  commands:
  # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
  - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
  # Install fast path packages for testing against transformers
  # Note: also needed to run plamo2 model in vLLM
  - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
  - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

- label: Language Models Test (PPL)

@@ -1208,6 +1216,7 @@ steps:
  - pytest -v -s -x lora/test_chatglm3_tp.py
  - pytest -v -s -x lora/test_llama_tp.py
  - pytest -v -s -x lora/test_llm_with_multi_loras.py
  - pytest -v -s -x lora/test_olmoe_tp.py

- label: Weight Loading Multiple GPU Test # 33min
  timeout_in_minutes: 45
@@ -313,6 +313,15 @@ steps:
  - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

- label: V1 Test attention (H100) # 10min
  timeout_in_minutes: 30
  gpu: h100
  source_file_dependencies:
  - vllm/v1/attention
  - tests/v1/attention
  commands:
  - pytest -v -s v1/attention

- label: V1 Test others (CPU) # 5 mins
  source_file_dependencies:
  - vllm/

@@ -435,6 +444,18 @@ steps:
  - pytest -v -s compile/test_full_graph.py
  - pytest -v -s compile/test_fusions_e2e.py

- label: Cudagraph test
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - tests/v1/cudagraph
  - vllm/v1/cudagraph_dispatcher.py
  - vllm/config/compilation.py
  - vllm/compilation
  commands:
  - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
  - pytest -v -s v1/cudagraph/test_cudagraph_mode.py

- label: Kernels Core Operation Test # 48min
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]

@@ -687,8 +708,10 @@ steps:
  - vllm/
  - tests/models/language/generation
  commands:
  # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
  - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
  # Install fast path packages for testing against transformers
  # Note: also needed to run plamo2 model in vLLM
  - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
  - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

- label: Language Models Test (PPL)
@@ -48,8 +48,8 @@ repos:
    entry: python tools/generate_nightly_torch_test.py
    files: ^requirements/test\.(in|txt)$
  - id: mypy-local
    name: Run mypy for local Python installation
    entry: python tools/pre_commit/mypy.py 0 "local"
    name: Run mypy locally for lowest supported Python version
    entry: python tools/pre_commit/mypy.py 0 "3.10"
    stages: [pre-commit] # Don't run in CI
    <<: &mypy_common
      language: python
@@ -5,7 +5,7 @@ import gc

from benchmark_utils import TimeCollector
from tabulate import tabulate

from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool
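(Aside, not part of the diff: most of the following hunks repeat this one-line import rename. A benchmark script that has to run against both older and newer vLLM versions could guard the import, mirroring the try/except fallback that one of the files in this diff already uses; this is a suggestion, not vLLM's own code.)

```python
try:
    # vLLM after the utils split
    from vllm.utils.argparse_utils import FlexibleArgumentParser
except ImportError:
    # older vLLM releases
    from vllm.utils import FlexibleArgumentParser
```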
@ -46,7 +46,7 @@ import time
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
|
||||
|
||||
@ -19,7 +19,7 @@ from vllm.config import (
|
||||
VllmConfig,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
|
||||
from vllm.v1.worker.gpu_input_batch import InputBatch
|
||||
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
|
||||
|
||||
@ -37,7 +37,7 @@ from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
try:
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
@ -11,7 +11,7 @@ import time
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
# Select a equi-probable random priority
|
||||
|
||||
@ -51,7 +51,7 @@ except ImportError:
|
||||
from backend_request_func import get_tokenizer
|
||||
|
||||
try:
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
except ImportError:
|
||||
from argparse import ArgumentParser as FlexibleArgumentParser
|
||||
|
||||
|
||||
@ -15,7 +15,7 @@ from utils import make_rand_sparse_tensors
|
||||
from weight_shapes import WEIGHT_SHAPES
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
|
||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
||||
|
||||
@ -18,7 +18,8 @@ from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
w8a8_triton_block_scaled_mm,
|
||||
)
|
||||
from vllm.utils import FlexibleArgumentParser, cdiv
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.math_utils import cdiv
|
||||
|
||||
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
|
||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
||||
|
||||
@ -10,7 +10,7 @@ import torch
|
||||
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
|
||||
|
||||
@ -10,7 +10,7 @@ import vllm.model_executor.layers.activation # noqa F401
|
||||
from vllm.model_executor.custom_op import CustomOp
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
batch_size_range = [1, 16, 32, 64, 128]
|
||||
|
||||
@ -28,7 +28,7 @@ except ImportError as e:
|
||||
|
||||
from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
|
||||
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark BitBLAS int4 on a specific target."
|
||||
|
||||
@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.config import (
|
||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
|
||||
from vllm.scalar_type import scalar_types
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
WEIGHT_SHAPES_MOE = {
|
||||
"nvidia/DeepSeek-R1-FP4": [
|
||||
|
||||
@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_confi
|
||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
# Weight shapes for different models: [num_experts, topk, hidden_size,
|
||||
# intermediate_size]
|
||||
|
||||
@ -39,7 +39,7 @@ from vllm.distributed.device_communicators.pynccl_allocator import (
|
||||
)
|
||||
from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
|
||||
fused_experts,
|
||||
fused_topk,
|
||||
)
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
DEFAULT_MODELS = [
|
||||
"nm-testing/Mixtral-8x7B-Instruct-v0.1",
|
||||
|
||||
@ -7,7 +7,7 @@ import torch
|
||||
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
|
||||
|
||||
@ -25,7 +25,7 @@ if HAS_TRITON:
|
||||
from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
|
||||
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
|
||||
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
|
||||
DEFAULT_TP_SIZES = [1]
|
||||
|
||||
@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
quantize_weights,
|
||||
)
|
||||
from vllm.scalar_type import ScalarType, scalar_types
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
|
||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
|
||||
|
||||
@ -44,7 +44,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
sort_weights,
|
||||
)
|
||||
from vllm.scalar_type import ScalarType, scalar_types
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
|
||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
|
||||
|
||||
@ -22,7 +22,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.config import get_config
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
FP8_DTYPE = current_platform.fp8_dtype()
|
||||
|
||||
|
||||
@ -17,7 +17,7 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
FP8_DTYPE = current_platform.fp8_dtype()
|
||||
|
||||
|
||||
@ -39,7 +39,7 @@ import torch
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.config import get_config
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
|
||||
@ -9,7 +9,7 @@ import torch
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import (
|
||||
STR_DTYPE_TO_TORCH_DTYPE,
|
||||
create_kv_caches_with_random,
|
||||
|
||||
@ -7,7 +7,7 @@ import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
|
||||
|
||||
@ -9,7 +9,7 @@ from tabulate import tabulate
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import (
|
||||
STR_DTYPE_TO_TORCH_DTYPE,
|
||||
create_kv_caches_with_random,
|
||||
|
||||
@ -12,7 +12,7 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import (
|
||||
STR_DTYPE_TO_TORCH_DTYPE,
|
||||
create_kv_caches_with_random_flash,
|
||||
|
||||
@ -8,7 +8,7 @@ import torch
|
||||
|
||||
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
def benchmark_rope_kernels_multi_lora(
|
||||
|
||||
@ -8,7 +8,7 @@ from datetime import datetime
|
||||
import flashinfer
|
||||
import torch
|
||||
|
||||
from vllm.utils import round_up
|
||||
from vllm.utils.math_utils import round_up
|
||||
|
||||
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
|
||||
FP8_DTYPE = torch.float8_e4m3fn
|
||||
|
||||
@ -8,7 +8,7 @@ from datetime import datetime
|
||||
import flashinfer
|
||||
import torch
|
||||
|
||||
from vllm.utils import round_up
|
||||
from vllm.utils.math_utils import round_up
|
||||
|
||||
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
|
||||
FP8_DTYPE = torch.float8_e4m3fn
|
||||
|
||||
@ -18,7 +18,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
mp.set_start_method("spawn", force=True)
|
||||
|
||||
|
||||
@ -11,7 +11,7 @@ import regex as re
|
||||
import seaborn as sns
|
||||
from torch.utils.benchmark import Measurement as TMeasurement
|
||||
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(
|
||||
|
||||
@ -5,7 +5,7 @@ import cProfile
|
||||
import pstats
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
# A very long prompt, total number of tokens is about 15k.
|
||||
LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000
|
||||
|
||||
@@ -212,11 +212,24 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
  # Build ACL with scons
  include(ProcessorCount)
  ProcessorCount(_NPROC)
  set(_scons_cmd
    scons -j${_NPROC}
    Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
    arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
    multi_isa=1 openmp=1 cppthreads=0
  )

  # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
  # and create a local shim dir with it
  include("${CMAKE_CURRENT_LIST_DIR}/utils.cmake")
  vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)

  if(NOT VLLM_TORCH_GOMP_SHIM_DIR STREQUAL "")
    list(APPEND _scons_cmd extra_link_flags=-L${VLLM_TORCH_GOMP_SHIM_DIR})
  endif()

  execute_process(
    COMMAND scons -j${_NPROC}
      Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
      arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
      multi_isa=1 openmp=1 cppthreads=0
    COMMAND ${_scons_cmd}
    WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
    RESULT_VARIABLE _acl_rc
  )
@@ -19,7 +19,7 @@ else()
    FetchContent_Declare(
        flashmla
        GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
        GIT_TAG 28417e516fcbf6257a422ba117ef5b6f44da5682
        GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95
        GIT_PROGRESS TRUE
        CONFIGURE_COMMAND ""
        BUILD_COMMAND ""
@@ -129,6 +129,44 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
  set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
endfunction()

# Find libgomp that gets shipped with PyTorch wheel and create a shim dir with:
#   libgomp.so -> libgomp-<hash>.so...
#   libgomp.so.1 -> libgomp-<hash>.so...
# OUTPUT: TORCH_GOMP_SHIM_DIR ("" if not found)
function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
  set(${TORCH_GOMP_SHIM_DIR} "" PARENT_SCOPE)

  # Use run_python to locate vendored libgomp; never throw on failure.
  run_python(_VLLM_TORCH_GOMP_PATH
    "
import os, glob
try:
    import torch
    torch_pkg = os.path.dirname(torch.__file__)
    site_root = os.path.dirname(torch_pkg)
    torch_libs = os.path.join(site_root, 'torch.libs')
    print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
except:
    print('')
"
    "failed to probe torch.libs for libgomp")

  if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
    return()
  endif()

  # Create shim under the build tree
  set(_shim "${CMAKE_BINARY_DIR}/gomp_shim")
  file(MAKE_DIRECTORY "${_shim}")

  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so")
  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so.1")
  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so")
  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so.1")

  set(${TORCH_GOMP_SHIM_DIR} "${_shim}" PARENT_SCOPE)
endfunction()

# Macro for converting a `gencode` version number to a cmake version number.
macro(string_to_ver OUT_VER IN_STR)
  string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
@ -187,7 +187,8 @@ template <>
struct hash<MatMulPrimitiveHandler::ClassMatmulCacheKey> {
  size_t operator()(
      const MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
    return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size);
    return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size) ^
           hash<int>()(static_cast<int>(val.b_type));
  }
};

@ -216,7 +217,8 @@ bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l,

bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
                const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
  return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size;
  return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size &&
         l.b_type == r.b_type;
}

bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l,
@ -493,8 +495,10 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(
    const MSizeCacheKey& key) {
  if (m_size_cache_.get() == nullptr) {
    ClassMatmulCacheKey key = {.b_n_size = b_n_size_, .b_k_size = b_k_size_};
    m_size_cache_ = get_matul_class_primitive_cache(key, primitive_cache_size_);
    ClassMatmulCacheKey class_key = {
        .b_n_size = b_n_size_, .b_k_size = b_k_size_, .b_type = b_type_};
    m_size_cache_ =
        get_matul_class_primitive_cache(class_key, primitive_cache_size_);
  }
  return m_size_cache_->get_or_create(key, [&]() {
    dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);

@ -199,6 +199,7 @@ class MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
  struct ClassMatmulCacheKey {
    dnnl_dim_t b_n_size;
    dnnl_dim_t b_k_size;
    dnnl::memory::data_type b_type;

    friend bool operator==(const ClassMatmulCacheKey& l,
                           const ClassMatmulCacheKey& r);

@ -307,7 +307,7 @@ void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                        torch::Tensor b_gptq_qzeros,
                        torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
                        bool use_exllama, int64_t bit);
                        bool use_exllama, bool use_v2_format, int64_t bit);

void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);

@ -185,7 +185,7 @@ typedef void (*fp_gemm_half_q_half_gptq_kernel)(const half*, const uint32_t*,
|
||||
const uint32_t*, const half*,
|
||||
half*, const int, const int,
|
||||
const int, const int,
|
||||
const int*);
|
||||
const bool, const int*);
|
||||
|
||||
template <bool first_block, int m_count>
|
||||
__global__ void gemm_half_q_half_gptq_4bit_kernel(
|
||||
@ -193,12 +193,15 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
|
||||
const uint32_t* __restrict__ b_gptq_qzeros,
|
||||
const half* __restrict__ b_gptq_scales, half* __restrict__ c,
|
||||
const int size_m, const int size_n, const int size_k, const int groups,
|
||||
const int* __restrict__ b_q_perm) {
|
||||
const bool use_v2_format, const int* __restrict__ b_q_perm) {
|
||||
MatrixView_half a_(a, size_m, size_k);
|
||||
MatrixView_half_rw c_(c, size_m, size_n);
|
||||
MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
// GPTQv2 and GPTQv1 handles zero points differently
|
||||
int zero_offset = use_v2_format ? 0 : 1;
|
||||
|
||||
auto t = threadIdx.x;
|
||||
|
||||
// Block
|
||||
@ -256,10 +259,10 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
|
||||
half2 y1y16[4][2];
|
||||
b_gptq_qzeros_.item4(zeros, group, n);
|
||||
b_gptq_scales_.item4_f(scales, group, n);
|
||||
dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
|
||||
dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
|
||||
dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
|
||||
dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
|
||||
dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
|
||||
dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
|
||||
dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
|
||||
dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
|
||||
|
||||
// Column result
|
||||
float block_c[m_count][4] = {};
|
||||
@ -272,10 +275,10 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
|
||||
nextgroup += groupsize;
|
||||
b_gptq_qzeros_.item4(zeros, group, n);
|
||||
b_gptq_scales_.item4_f(scales, group, n);
|
||||
dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
|
||||
dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
|
||||
dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
|
||||
dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
|
||||
dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
|
||||
dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
|
||||
dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
|
||||
dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
@ -329,12 +332,15 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
|
||||
const uint32_t* __restrict__ b_gptq_qzeros,
|
||||
const half* __restrict__ b_gptq_scales, half* __restrict__ c,
|
||||
const int size_m, const int size_n, const int size_k, const int groups,
|
||||
const int* __restrict__ b_q_perm) {
|
||||
const bool use_v2_format, const int* __restrict__ b_q_perm) {
|
||||
MatrixView_half a_(a, size_m, size_k);
|
||||
MatrixView_half_rw c_(c, size_m, size_n);
|
||||
MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
// GPTQv2 and GPTQv1 handles zero points differently
|
||||
int zero_offset = use_v2_format ? 0 : 1;
|
||||
|
||||
auto t = threadIdx.x;
|
||||
|
||||
// Block
|
||||
@ -409,10 +415,10 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
|
||||
int4 load_int4 = *b_ptr4;
|
||||
|
||||
half2 dq[4][8];
|
||||
dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1);
|
||||
dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1);
|
||||
dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1);
|
||||
dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1);
|
||||
dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + zero_offset);
|
||||
dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + zero_offset);
|
||||
dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + zero_offset);
|
||||
dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + zero_offset);
|
||||
|
||||
#pragma unroll
|
||||
for (int m = 0; m < m_count; m++) {
|
||||
@ -448,12 +454,15 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
|
||||
const uint32_t* __restrict__ b_gptq_qzeros,
|
||||
const half* __restrict__ b_gptq_scales, half* __restrict__ c,
|
||||
const int size_m, const int size_n, const int size_k, const int groups,
|
||||
const int* __restrict__ b_q_perm) {
|
||||
const bool use_v2_format, const int* __restrict__ b_q_perm) {
|
||||
MatrixView_half a_(a, size_m, size_k);
|
||||
MatrixView_half_rw c_(c, size_m, size_n);
|
||||
MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
// GPTQv2 and GPTQv1 handles zero points differently
|
||||
int zero_offset = use_v2_format ? 0 : 1;
|
||||
|
||||
auto t = threadIdx.x;
|
||||
|
||||
// Block
|
||||
@ -534,13 +543,13 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
|
||||
|
||||
half2 dq[4][16];
|
||||
dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0],
|
||||
size_n, zeros[0] + 1);
|
||||
size_n, zeros[0] + zero_offset);
|
||||
dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1],
|
||||
size_n, zeros[1] + 1);
|
||||
size_n, zeros[1] + zero_offset);
|
||||
dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2],
|
||||
size_n, zeros[2] + 1);
|
||||
size_n, zeros[2] + zero_offset);
|
||||
dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3],
|
||||
size_n, zeros[3] + 1);
|
||||
size_n, zeros[3] + zero_offset);
|
||||
|
||||
#pragma unroll
|
||||
for (int m = 0; m < m_count; m++) {
|
||||
@ -574,12 +583,15 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
|
||||
const uint32_t* __restrict__ b_gptq_qzeros,
|
||||
const half* __restrict__ b_gptq_scales, half* __restrict__ c,
|
||||
const int size_m, const int size_n, const int size_k, const int groups,
|
||||
const int* __restrict__ b_q_perm) {
|
||||
const bool use_v2_format, const int* __restrict__ b_q_perm) {
|
||||
MatrixView_half a_(a, size_m, size_k);
|
||||
MatrixView_half_rw c_(c, size_m, size_n);
|
||||
MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
// GPTQv2 and GPTQv1 handles zero points differently
|
||||
int zero_offset = use_v2_format ? 0 : 1;
|
||||
|
||||
auto t = threadIdx.x;
|
||||
|
||||
// Block
|
||||
@ -658,13 +670,13 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
|
||||
|
||||
half2 dq[4][4];
|
||||
dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n,
|
||||
zeros[0] + 1);
|
||||
zeros[0] + zero_offset);
|
||||
dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n,
|
||||
zeros[1] + 1);
|
||||
zeros[1] + zero_offset);
|
||||
dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n,
|
||||
zeros[2] + 1);
|
||||
zeros[2] + zero_offset);
|
||||
dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n,
|
||||
zeros[3] + 1);
|
||||
zeros[3] + zero_offset);
|
||||
|
||||
for (int m = 0; m < m_count; m++) {
|
||||
block_c[m][0] =
|
||||
@ -730,7 +742,8 @@ void gemm_half_q_half_cuda_part(const half* a, const uint32_t* b_q_weight,
|
||||
const uint32_t* b_gptq_qzeros,
|
||||
const half* b_gptq_scales, const int* b_q_perm,
|
||||
half* c, int size_m, int size_n, int size_k,
|
||||
int m_count, int groups, int bit) {
|
||||
int m_count, int groups, bool use_v2_format,
|
||||
int bit) {
|
||||
dim3 blockDim, gridDim;
|
||||
blockDim.x = BLOCK_KN_SIZE;
|
||||
blockDim.y = 1;
|
||||
@ -743,20 +756,23 @@ void gemm_half_q_half_cuda_part(const half* a, const uint32_t* b_q_weight,
|
||||
pick_gemm_half_q_half_gptq_kernel(true, m_count, bit);
|
||||
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
kernel<<<gridDim, blockDim, 0, stream>>>(a, b_q_weight, b_gptq_qzeros,
|
||||
b_gptq_scales, c, size_m, size_n,
|
||||
size_k, groups, b_q_perm);
|
||||
kernel<<<gridDim, blockDim, 0, stream>>>(
|
||||
a, b_q_weight, b_gptq_qzeros, b_gptq_scales, c, size_m, size_n, size_k,
|
||||
groups, use_v2_format, b_q_perm);
|
||||
}
|
||||
|
||||
__global__ void reconstruct_exllama_8bit_kernel(
|
||||
const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
|
||||
const uint32_t* __restrict__ b_gptq_qzeros,
|
||||
const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
|
||||
const int groups, half* __restrict__ b) {
|
||||
const int groups, const bool use_v2_format, half* __restrict__ b) {
|
||||
MatrixView_half_rw b_(b, size_k, size_n);
|
||||
MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
// GPTQv2 and GPTQv1 handles zero points differently
|
||||
int zero_offset = use_v2_format ? 0 : 1;
|
||||
|
||||
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
|
||||
@ -812,13 +828,13 @@ __global__ void reconstruct_exllama_8bit_kernel(
|
||||
|
||||
half2 dq[4][4];
|
||||
dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n,
|
||||
zeros[0] + 1);
|
||||
zeros[0] + zero_offset);
|
||||
dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n,
|
||||
zeros[1] + 1);
|
||||
zeros[1] + zero_offset);
|
||||
dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n,
|
||||
zeros[2] + 1);
|
||||
zeros[2] + zero_offset);
|
||||
dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n,
|
||||
zeros[3] + 1);
|
||||
zeros[3] + zero_offset);
|
||||
|
||||
// half* dqh = (half*)dq;
|
||||
if (b_q_perm) {
|
||||
@ -849,11 +865,14 @@ __global__ void reconstruct_exllama_4bit_kernel(
|
||||
const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
|
||||
const uint32_t* __restrict__ b_gptq_qzeros,
|
||||
const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
|
||||
const int groups, half* __restrict__ b) {
|
||||
const int groups, const bool use_v2_format, half* __restrict__ b) {
|
||||
MatrixView_half_rw b_(b, size_k, size_n);
|
||||
MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
// GPTQv2 and GPTQv1 handles zero points differently
|
||||
int zero_offset = use_v2_format ? 0 : 1;
|
||||
|
||||
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
|
||||
@ -888,10 +907,10 @@ __global__ void reconstruct_exllama_4bit_kernel(
|
||||
half2 y1y16[4][2];
|
||||
b_gptq_qzeros_.item4(zeros, group, n);
|
||||
b_gptq_scales_.item4_h2(scales, group, n);
|
||||
dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
|
||||
dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
|
||||
dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
|
||||
dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
|
||||
dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
|
||||
dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
|
||||
dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
|
||||
dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@ -904,10 +923,10 @@ __global__ void reconstruct_exllama_4bit_kernel(
|
||||
nextgroup += groupsize;
|
||||
b_gptq_qzeros_.item4(zeros, group, n);
|
||||
b_gptq_scales_.item4_h2(scales, group, n);
|
||||
dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
|
||||
dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
|
||||
dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
|
||||
dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
|
||||
dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
|
||||
dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
|
||||
dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
|
||||
dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
|
||||
}
|
||||
|
||||
for (int p = 0; p < 4; p++) {
|
||||
@ -954,11 +973,14 @@ __global__ void reconstruct_exllama_3bit_kernel(
|
||||
const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
|
||||
const uint32_t* __restrict__ b_gptq_qzeros,
|
||||
const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
|
||||
const int groups, half* __restrict__ b) {
|
||||
const int groups, const bool use_v2_format, half* __restrict__ b) {
|
||||
MatrixView_half_rw b_(b, size_k, size_n);
|
||||
MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
// GPTQv2 and GPTQv1 handles zero points differently
|
||||
int zero_offset = use_v2_format ? 0 : 1;
|
||||
|
||||
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
|
||||
@ -1016,13 +1038,13 @@ __global__ void reconstruct_exllama_3bit_kernel(
|
||||
|
||||
half2 dq[4][16];
|
||||
dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0],
|
||||
size_n, zeros[0] + 1);
|
||||
size_n, zeros[0] + zero_offset);
|
||||
dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1],
|
||||
size_n, zeros[1] + 1);
|
||||
size_n, zeros[1] + zero_offset);
|
||||
dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2],
|
||||
size_n, zeros[2] + 1);
|
||||
size_n, zeros[2] + zero_offset);
|
||||
dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3],
|
||||
size_n, zeros[3] + 1);
|
||||
size_n, zeros[3] + zero_offset);
|
||||
|
||||
if (b_q_perm) {
|
||||
for (int j = 0; j < 16; j++) {
|
||||
@ -1052,11 +1074,14 @@ __global__ void reconstruct_exllama_2bit_kernel(
|
||||
const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
|
||||
const uint32_t* __restrict__ b_gptq_qzeros,
|
||||
const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
|
||||
const int groups, half* __restrict__ b) {
|
||||
const int groups, const bool use_v2_format, half* __restrict__ b) {
|
||||
MatrixView_half_rw b_(b, size_k, size_n);
|
||||
MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
// GPTQv2 and GPTQv1 handles zero points differently
|
||||
int zero_offset = use_v2_format ? 0 : 1;
|
||||
|
||||
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
|
||||
@ -1108,10 +1133,10 @@ __global__ void reconstruct_exllama_2bit_kernel(
|
||||
int4 load_int4 = *b_ptr4;
|
||||
|
||||
half2 dq[4][8];
|
||||
dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1);
|
||||
dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1);
|
||||
dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1);
|
||||
dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1);
|
||||
dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + zero_offset);
|
||||
dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + zero_offset);
|
||||
dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + zero_offset);
|
||||
dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + zero_offset);
|
||||
|
||||
b_ptr += size_n;
|
||||
// half* dqh = (half*)dq;
|
||||
@ -1143,7 +1168,7 @@ void reconstruct_exllama(const uint32_t* b_q_weight,
|
||||
const uint32_t* b_gptq_qzeros,
|
||||
const half* b_gptq_scales, const int* b_q_perm,
|
||||
half* out, int height, int width, int groups,
|
||||
int bit) {
|
||||
bool use_v2_format, int bit) {
|
||||
dim3 blockDim, gridDim;
|
||||
blockDim.x = BLOCK_KN_SIZE;
|
||||
blockDim.y = 1;
|
||||
@ -1162,14 +1187,14 @@ void reconstruct_exllama(const uint32_t* b_q_weight,
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
reconstruct_exllama_kernel<<<gridDim, blockDim, 0, stream>>>(
|
||||
b_q_weight, b_q_perm, b_gptq_qzeros, b_gptq_scales, height, width, groups,
|
||||
out);
|
||||
use_v2_format, out);
|
||||
}
|
||||
|
||||
__global__ void gemm_half_q_half_alt_4bit_kernel(
|
||||
const half2* __restrict__ vec, const uint32_t* __restrict__ mat,
|
||||
half* __restrict__ mul, const half* __restrict__ scales,
|
||||
const uint32_t* __restrict__ zeros, const int* __restrict__ g_idx,
|
||||
int batch, int height, int width) {
|
||||
int batch, int height, int width, bool use_v2_format) {
|
||||
int zero_width = width / 8;
|
||||
int vec_height = height * 4;
|
||||
const int blockwidth2 = BLOCK_KN_SIZE / 2;
|
||||
@ -1179,6 +1204,9 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
|
||||
int h_end = min(BLOCK_KN_SIZE / 8, height - h) * 4;
|
||||
auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
|
||||
// GPTQv2 and GPTQv1 handles zero points differently
|
||||
int zero_offset = use_v2_format ? 0 : 1;
|
||||
|
||||
__shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
|
||||
if (threadIdx.x < h_end) {
|
||||
for (int m = 0; m < b_end; ++m) {
|
||||
@ -1223,10 +1251,11 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
|
||||
half2 zero = __halves2half2(
|
||||
__hmul(scale_f,
|
||||
__int2half_rn(-((zeros[g * zero_width + z_w] >> z_mod) & 0xF) -
|
||||
1)),
|
||||
__hmul(scale_f2,
|
||||
__int2half_rn(
|
||||
-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xF) - 1)));
|
||||
zero_offset)),
|
||||
__hmul(
|
||||
scale_f2,
|
||||
__int2half_rn(-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xF) -
|
||||
zero_offset)));
|
||||
scales_tmp[tmp_k] = scale;
|
||||
zeros_tmp[tmp_k] = zero;
|
||||
}
|
||||
@ -1268,7 +1297,7 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
|
||||
const half2* __restrict__ vec, const uint32_t* __restrict__ mat,
|
||||
half* __restrict__ mul, const half* __restrict__ scales,
|
||||
const uint32_t* __restrict__ zeros, const int* __restrict__ g_idx,
|
||||
int batch, int height, int width) {
|
||||
int batch, int height, int width, bool use_v2_format) {
|
||||
int zero_width = width / 4;
|
||||
int vec_height = height * 2;
|
||||
const int blockwidth2 = BLOCK_KN_SIZE / 2;
|
||||
@ -1278,6 +1307,9 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
|
||||
int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2;
|
||||
auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
|
||||
// GPTQv2 and GPTQv1 handles zero points differently
|
||||
int zero_offset = use_v2_format ? 0 : 1;
|
||||
|
||||
__shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
|
||||
if (threadIdx.x < h_end) {
|
||||
for (int m = 0; m < b_end; ++m) {
|
||||
@ -1312,12 +1344,13 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
|
||||
half scale_f2 = scales[g2 * width + w];
|
||||
half2 scale = __halves2half2(scale_f, scale_f2);
|
||||
half2 zero = __halves2half2(
|
||||
__hmul(scale_f,
|
||||
__int2half_rn(
|
||||
-((zeros[g * zero_width + z_w] >> z_mod) & 0xff) - 1)),
|
||||
__hmul(scale_f2,
|
||||
__int2half_rn(
|
||||
-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xff) - 1)));
|
||||
__hmul(scale_f, __int2half_rn(
|
||||
-((zeros[g * zero_width + z_w] >> z_mod) & 0xff) -
|
||||
zero_offset)),
|
||||
__hmul(
|
||||
scale_f2,
|
||||
__int2half_rn(-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xff) -
|
||||
zero_offset)));
|
||||
scales_tmp[tmp_k] = scale;
|
||||
zeros_tmp[tmp_k] = zero;
|
||||
}
|
||||
@ -1355,7 +1388,7 @@ void gemm_half_q_half_alt(const half* a, const uint32_t* b_q_weight,
|
||||
const uint32_t* b_gptq_qzeros,
|
||||
const half* b_gptq_scales, const int* b_g_idx,
|
||||
half* c, int size_m, int size_n, int size_k,
|
||||
int bit) {
|
||||
bool use_v2_format, int bit) {
|
||||
dim3 blockDim, gridDim;
|
||||
blockDim.x = BLOCK_KN_SIZE;
|
||||
blockDim.y = 1;
|
||||
@ -1372,17 +1405,15 @@ void gemm_half_q_half_alt(const half* a, const uint32_t* b_q_weight,
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
kernel<<<gridDim, blockDim, 0, stream>>>(
|
||||
(const half2*)a, b_q_weight, c, b_gptq_scales, b_gptq_qzeros, b_g_idx,
|
||||
size_m, size_k / 32 * bit, size_n);
|
||||
size_m, size_k / 32 * bit, size_n, use_v2_format);
|
||||
}
|
||||
|
||||
template <class T, int bit>
|
||||
__global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
|
||||
const half* __restrict__ w_scales,
|
||||
const uint32_t* __restrict__ w_zeros,
|
||||
const int* __restrict__ g_idx,
|
||||
const int height, const int width,
|
||||
const int group,
|
||||
half* __restrict__ out) {
|
||||
__global__ void reconstruct_gptq_kernel(
|
||||
const uint32_t* __restrict__ w, const half* __restrict__ w_scales,
|
||||
const uint32_t* __restrict__ w_zeros, const int* __restrict__ g_idx,
|
||||
const int height, const int width, const int group,
|
||||
const bool use_v2_format, half* __restrict__ out) {
|
||||
// Start of block
|
||||
|
||||
auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
@ -1395,6 +1426,9 @@ __global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
|
||||
MatrixView_half w_scales_(w_scales, group, width);
|
||||
T w_zeros_(w_zeros, group, width);
|
||||
|
||||
// GPTQv2 and GPTQv1 handles zero points differently
|
||||
int zero_offset = use_v2_format ? 0 : 1;
|
||||
|
||||
uint32_t w_read = w[blockIdx.y * width + column];
|
||||
half* out_ptr = out_.item_ptr(row, column);
|
||||
|
||||
@ -1402,7 +1436,7 @@ __global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
|
||||
for (int s = 0; s < 32; s += bit) {
|
||||
int group = g_idx[row + s / bit];
|
||||
half w_scale = w_scales_.item(group, column);
|
||||
uint32_t w_zero = w_zeros_.item(group, column) + 1;
|
||||
uint32_t w_zero = w_zeros_.item(group, column) + zero_offset;
|
||||
half w_item =
|
||||
__hmul(__int2half_rn((int)((w_read >> s) & ((1 << bit) - 1)) - w_zero),
|
||||
w_scale);
|
||||
@ -1415,7 +1449,7 @@ __global__ void reconstruct_gptq_3bit_kernel(
|
||||
const uint32_t* __restrict__ w, const half* __restrict__ w_scales,
|
||||
const uint32_t* __restrict__ w_zeros, const int* __restrict__ g_idx,
|
||||
const int height, const int width, const int group,
|
||||
half* __restrict__ out) {
|
||||
const bool use_v2_format, half* __restrict__ out) {
|
||||
// Start of block
|
||||
auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
auto row = blockIdx.y * 32;
|
||||
@ -1427,6 +1461,9 @@ __global__ void reconstruct_gptq_3bit_kernel(
|
||||
MatrixView_half w_scales_(w_scales, group, width);
|
||||
MatrixView_q3_row w_zeros_(w_zeros, group, width);
|
||||
|
||||
// GPTQv2 and GPTQv1 handles zero points differently
|
||||
int zero_offset = use_v2_format ? 0 : 1;
|
||||
|
||||
uint32_t w1 = w[(blockIdx.y * 3) * width + column];
|
||||
uint32_t w2 = w[(blockIdx.y * 3 + 1) * width + column];
|
||||
uint32_t w3 = w[(blockIdx.y * 3 + 2) * width + column];
|
||||
@ -1436,7 +1473,7 @@ __global__ void reconstruct_gptq_3bit_kernel(
|
||||
for (int i = 0; i < 32; i += 1) {
|
||||
int group = g_idx[row + i];
|
||||
half w_scale = w_scales_.item(group, column);
|
||||
uint32_t w_zero = w_zeros_.item(group, column) + 1;
|
||||
uint32_t w_zero = w_zeros_.item(group, column) + zero_offset;
|
||||
int w_item;
|
||||
if (i == 10) {
|
||||
w_item = (w1 >> 30) | ((w2 << 2) & 0x4);
|
||||
@ -1456,7 +1493,8 @@ __global__ void reconstruct_gptq_3bit_kernel(
|
||||
|
||||
void reconstruct_gptq(const uint32_t* b_q_weight, const uint32_t* b_gptq_qzeros,
|
||||
const half* b_gptq_scales, const int* b_g_idx, half* out,
|
||||
int height, int width, int groups, int bit) {
|
||||
int height, int width, int groups, bool use_v2_format,
|
||||
int bit) {
|
||||
dim3 blockDim, gridDim;
|
||||
blockDim.x = BLOCK_KN_SIZE;
|
||||
blockDim.y = 1;
|
||||
@ -1476,7 +1514,7 @@ void reconstruct_gptq(const uint32_t* b_q_weight, const uint32_t* b_gptq_qzeros,
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
kernel<<<gridDim, blockDim, 0, stream>>>(b_q_weight, b_gptq_scales,
|
||||
b_gptq_qzeros, b_g_idx, height,
|
||||
width, groups, out);
|
||||
width, groups, use_v2_format, out);
|
||||
}
|
||||
|
||||
void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
|
||||
@ -1484,7 +1522,8 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
|
||||
const uint32_t* b_gptq_qzeros,
|
||||
const half* b_gptq_scales, const int* b_g_idx,
|
||||
half* c, half* temp_dq, int size_m, int size_n,
|
||||
int size_k, int groups, bool use_exllama, int bit) {
|
||||
int size_k, int groups, bool use_exllama,
|
||||
bool use_v2_format, int bit) {
|
||||
bool use_reconstruct;
|
||||
if (use_exllama) {
|
||||
use_reconstruct = ((bit == 8 && size_m > MAX_Q_GEMM_ROWS_8BIT) ||
|
||||
@ -1498,10 +1537,10 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
|
||||
// Reconstruct FP16 matrix, then cuBLAS
|
||||
if (use_exllama) {
|
||||
reconstruct_exllama(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx,
|
||||
temp_dq, size_k, size_n, groups, bit);
|
||||
temp_dq, size_k, size_n, groups, use_v2_format, bit);
|
||||
} else {
|
||||
reconstruct_gptq(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx,
|
||||
temp_dq, size_k, size_n, groups, bit);
|
||||
temp_dq, size_k, size_n, groups, use_v2_format, bit);
|
||||
}
|
||||
|
||||
const half alpha = __float2half(1.0f);
|
||||
@ -1517,18 +1556,18 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
|
||||
if (max_chunks) {
|
||||
gemm_half_q_half_cuda_part(a, b_q_weight, b_gptq_qzeros, b_gptq_scales,
|
||||
b_g_idx, c, last_chunk, size_n, size_k,
|
||||
BLOCK_M_SIZE_MAX, groups, bit);
|
||||
BLOCK_M_SIZE_MAX, groups, use_v2_format, bit);
|
||||
}
|
||||
|
||||
if (last_chunk_size) {
|
||||
gemm_half_q_half_cuda_part(a + last_chunk * size_k, b_q_weight,
|
||||
b_gptq_qzeros, b_gptq_scales, b_g_idx,
|
||||
c + last_chunk * size_n, last_chunk_size,
|
||||
size_n, size_k, last_chunk_size, groups, bit);
|
||||
gemm_half_q_half_cuda_part(
|
||||
a + last_chunk * size_k, b_q_weight, b_gptq_qzeros, b_gptq_scales,
|
||||
b_g_idx, c + last_chunk * size_n, last_chunk_size, size_n, size_k,
|
||||
last_chunk_size, groups, use_v2_format, bit);
|
||||
}
|
||||
} else {
|
||||
gemm_half_q_half_alt(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx,
|
||||
c, size_m, size_n, size_k, bit);
|
||||
c, size_m, size_n, size_k, use_v2_format, bit);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1815,7 +1854,7 @@ void shuffle_exllama_weight(uint32_t* q_weight, int* q_perm, int height,
|
||||
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
|
||||
torch::Tensor b_gptq_qzeros,
|
||||
torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
|
||||
bool use_exllama, int64_t bit) {
|
||||
bool use_exllama, bool use_v2_format, int64_t bit) {
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
|
||||
auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
|
||||
at::Tensor c = torch::empty({a.size(0), b_q_weight.size(1)}, options);
|
||||
@ -1833,7 +1872,7 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
|
||||
c.size(1), // n
|
||||
a.size(1), // k
|
||||
b_gptq_qzeros.size(0), // group number
|
||||
use_exllama, bit);
|
||||
use_exllama, use_v2_format, bit);
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
@ -557,7 +557,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
// to prevent the meta function registry.
|
||||
ops.def(
|
||||
"gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
|
||||
"Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) "
|
||||
"Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, bool "
|
||||
"use_v2_format, int bit) "
|
||||
"-> Tensor",
|
||||
{stride_tag});
|
||||
ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
|
||||
|
||||
@ -79,6 +79,9 @@ RUN echo 'ulimit -c 0' >> ~/.bashrc
|
||||
######################### BUILD IMAGE #########################
|
||||
FROM base AS vllm-build
|
||||
|
||||
ARG max_jobs=32
|
||||
ENV MAX_JOBS=${max_jobs}
|
||||
|
||||
ARG GIT_REPO_CHECK=0
|
||||
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
|
||||
ARG VLLM_CPU_DISABLE_AVX512=0
|
||||
@ -104,95 +107,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
|
||||
|
||||
#################### WHEEL BUILD IMAGE ####################
|
||||
FROM base AS build
|
||||
ARG TARGETPLATFORM
|
||||
|
||||
ARG PIP_INDEX_URL UV_INDEX_URL
|
||||
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||
|
||||
# install build dependencies
|
||||
COPY requirements/build.txt requirements/build.txt
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
# max jobs used by Ninja to build extensions
|
||||
ARG max_jobs=2
|
||||
ENV MAX_JOBS=${max_jobs}
|
||||
|
||||
ARG USE_SCCACHE
|
||||
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
|
||||
ARG SCCACHE_ENDPOINT
|
||||
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
|
||||
ARG SCCACHE_REGION_NAME=us-west-2
|
||||
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||
|
||||
# Flag to control whether to use pre-built vLLM wheels
|
||||
ARG VLLM_USE_PRECOMPILED=""
|
||||
|
||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
echo "Installing sccache..." \
|
||||
&& curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
|
||||
&& tar -xzf sccache.tar.gz \
|
||||
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
|
||||
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
|
||||
&& if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
|
||||
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
|
||||
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
|
||||
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
|
||||
&& export SCCACHE_IDLE_TIMEOUT=0 \
|
||||
&& export CMAKE_BUILD_TYPE=Release \
|
||||
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
|
||||
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
|
||||
&& sccache --show-stats \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
|
||||
&& sccache --show-stats; \
|
||||
fi
|
||||
|
||||
ARG vllm_target_device="cpu"
|
||||
ENV VLLM_TARGET_DEVICE=${vllm_target_device}
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
if [ "$USE_SCCACHE" != "1" ]; then \
|
||||
# Clean any existing CMake artifacts
|
||||
rm -rf .deps && \
|
||||
mkdir -p .deps && \
|
||||
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
|
||||
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
||||
fi
|
||||
|
||||
# Check the size of the wheel if RUN_WHEEL_CHECK is true
|
||||
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
||||
# sync the default value with .buildkite/check-wheel-size.py
|
||||
ARG VLLM_MAX_SIZE_MB=450
|
||||
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
|
||||
ARG RUN_WHEEL_CHECK=true
|
||||
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
|
||||
python3 check-wheel-size.py dist; \
|
||||
else \
|
||||
echo "Skipping wheel size check."; \
|
||||
fi
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
|
||||
|
||||
######################### TEST DEPS #########################
|
||||
FROM base AS vllm-test-deps
|
||||
|
||||
@ -1,7 +1,7 @@
# default base image
ARG REMOTE_VLLM="0"
ARG COMMON_WORKDIR=/app
ARG BASE_IMAGE=rocm/vllm-dev:base_custom_1020_rc1_20251008_tuned_20251008
ARG BASE_IMAGE=rocm/vllm-dev:base

FROM ${BASE_IMAGE} AS base


@ -7,7 +7,7 @@ ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="eef23c7f"
ARG AITER_BRANCH="9716b1b8"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"

FROM ${BASE_IMAGE} AS base

BIN docs/assets/contributing/load-pattern-examples.png (new binary file, 577 KiB; not shown)
@ -16,9 +16,9 @@ Finally, one of the most impactful ways to support us is by raising awareness ab
Unsure on where to start? Check out the following links for tasks to work on:

- [Good first issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22)
- [Selected onboarding tasks](gh-project:6)
- [Selected onboarding tasks](https://github.com/orgs/vllm-project/projects/6)
- [New model requests](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22new-model%22)
- [Models with multi-modal capabilities](gh-project:10)
- [Models with multi-modal capabilities](https://github.com/orgs/vllm-project/projects/10)

## License


@ -321,6 +321,73 @@ The following arguments can be used to control the ramp-up:
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
- `--ramp-up-end-rps`: The request rate at the end of the benchmark.

##### Load Pattern Configuration

vLLM's benchmark serving script provides sophisticated load pattern simulation capabilities through three key parameters that control request generation and concurrency behavior:

###### Load Pattern Control Parameters

- `--request-rate`: Controls the target request generation rate (requests per second). Set to `inf` for maximum throughput testing or finite values for controlled load simulation.
- `--burstiness`: Controls traffic variability using a Gamma distribution (range: > 0). Lower values create bursty traffic, higher values create uniform traffic.
- `--max-concurrency`: Limits concurrent outstanding requests. If this argument is not provided, concurrency is unlimited. Set a value to simulate backpressure.

These parameters work together to create realistic load patterns with carefully chosen defaults. The `--request-rate` parameter defaults to `inf` (infinite), which sends all requests immediately for maximum throughput testing. When set to finite values, it uses either a Poisson process (default `--burstiness=1.0`) or Gamma distribution for realistic request timing. The `--burstiness` parameter only takes effect when `--request-rate` is not infinite - a value of 1.0 creates natural Poisson traffic, while lower values (0.1-0.5) create bursty patterns and higher values (2.0-5.0) create uniform spacing. The `--max-concurrency` parameter defaults to `None` (unlimited) but can be set to simulate real-world constraints where a load balancer or API gateway limits concurrent connections. When combined, these parameters allow you to simulate everything from unrestricted stress testing (`--request-rate=inf`) to production-like scenarios with realistic arrival patterns and resource constraints.
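
For example, a realistic-traffic run that exercises these parameters might look like the sketch below. The model and dataset here are placeholders; only the load-pattern flags come from the list above.

```bash
# Poisson arrivals at ~10 req/s, no concurrency cap (realistic baseline run).
vllm bench serve \
  --model meta-llama/Llama-3.1-8B-Instruct \
  --dataset-name random \
  --num-prompts 500 \
  --request-rate 10 \
  --burstiness 1.0
```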

The `--burstiness` parameter mathematically controls request arrival patterns using a Gamma distribution where:

- Shape parameter: `burstiness` value
- Coefficient of Variation (CV): $\frac{1}{\sqrt{burstiness}}$
- Traffic characteristics:
    - `burstiness = 0.1`: Highly bursty traffic (CV ≈ 3.16) - stress testing
    - `burstiness = 1.0`: Natural Poisson traffic (CV = 1.0) - realistic simulation
    - `burstiness = 5.0`: Uniform traffic (CV ≈ 0.45) - controlled load testing
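
As an illustration of this arrival-time model (not the benchmark script's actual code), the sketch below samples inter-arrival gaps from a Gamma distribution with shape equal to `burstiness` and mean gap `1 / request_rate`, then prints the resulting coefficient of variation:

```python
import numpy as np

def inter_arrival_times(request_rate: float, burstiness: float, n: int,
                        seed: int = 0) -> np.ndarray:
    """Sample request gaps: shape = burstiness, scale = 1 / (rate * burstiness),
    so the mean gap stays 1 / request_rate while CV = 1 / sqrt(burstiness)."""
    rng = np.random.default_rng(seed)
    return rng.gamma(shape=burstiness,
                     scale=1.0 / (request_rate * burstiness), size=n)

for b in (0.1, 1.0, 5.0):
    gaps = inter_arrival_times(request_rate=10.0, burstiness=b, n=100_000)
    print(f"burstiness={b}: mean gap={gaps.mean():.3f}s, "
          f"CV={gaps.std() / gaps.mean():.2f}")
```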

![Load pattern examples](../../assets/contributing/load-pattern-examples.png)

*Figure: Load pattern examples for each use case. Top row: Request arrival timelines showing cumulative requests over time. Bottom row: Inter-arrival time distributions showing traffic variability patterns. Each column represents a different use case with its specific parameter settings and resulting traffic characteristics.*

Load Pattern Recommendations by Use Case:

| Use Case | Burstiness | Request Rate | Max Concurrency | Description |
| --- | --- | --- | --- | --- |
| Maximum Throughput | N/A | Infinite | Limited | **Most common**: Simulates load balancer/gateway limits with unlimited user demand |
| Realistic Testing | 1.0 | Moderate (5-20) | Infinite | Natural Poisson traffic patterns for baseline performance |
| Stress Testing | 0.1-0.5 | High (20-100) | Infinite | Challenging burst patterns to test resilience |
| Latency Profiling | 2.0-5.0 | Low (1-10) | Infinite | Uniform load for consistent timing analysis |
| Capacity Planning | 1.0 | Variable | Limited | Test resource limits with realistic constraints |
| SLA Validation | 1.0 | Target rate | SLA limit | Production-like constraints for compliance testing |

These load patterns help evaluate different aspects of your vLLM deployment, from basic performance characteristics to resilience under challenging traffic conditions.

The **Maximum Throughput** pattern (`--request-rate=inf --max-concurrency=<limit>`) is the most commonly used configuration for production benchmarking. This simulates real-world deployment architectures where:

- Users send requests as fast as they can (infinite rate)
- A load balancer or API gateway controls the maximum concurrent connections
- The system operates at its concurrency limit, revealing true throughput capacity
- `--burstiness` has no effect since request timing is not controlled when rate is infinite

This pattern helps determine optimal concurrency settings for your production load balancer configuration.
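
A typical invocation of this pattern looks like the sketch below (the model, prompt count, and the concurrency cap of 64 are placeholders; the flags are the ones described above):

```bash
# Maximum-throughput run: unlimited request rate, concurrency capped at 64
# to mimic a gateway limit.
vllm bench serve \
  --model meta-llama/Llama-3.1-8B-Instruct \
  --dataset-name random \
  --num-prompts 1000 \
  --request-rate inf \
  --max-concurrency 64
```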

To effectively configure load patterns, especially for **Capacity Planning** and **SLA Validation** use cases, you need to understand your system's resource limits. During startup, vLLM reports KV cache configuration that directly impacts your load testing parameters:

```text
GPU KV cache size: 15,728,640 tokens
Maximum concurrency for 8,192 tokens per request: 1920
```

Where:

- GPU KV cache size: Total tokens that can be cached across all concurrent requests
- Maximum concurrency: Theoretical maximum concurrent requests for the given `max_model_len`
- Calculation: `max_concurrency = kv_cache_size / max_model_len`
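
Plugging the startup log above into that formula reproduces the reported limit and gives a derived setting for capacity planning (a small worked example, not vLLM output):

```python
kv_cache_size = 15_728_640   # tokens, from the startup log above
max_model_len = 8_192        # tokens per request

max_concurrency = kv_cache_size // max_model_len
print(max_concurrency)              # 1920, matching the reported value
print(int(max_concurrency * 0.85))  # ~1632: an 80-90% cap for --max-concurrency
```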

Using KV cache metrics for load pattern configuration:

- For Capacity Planning: Set `--max-concurrency` to 80-90% of the reported maximum to test realistic resource constraints
- For SLA Validation: Use the reported maximum as your SLA limit to ensure compliance testing matches production capacity
- For Realistic Testing: Monitor memory usage when approaching theoretical limits to understand sustainable request rates
- Request rate guidance: Use the KV cache size to estimate sustainable request rates for your specific workload and sequence lengths

</details>

#### 📈 Offline Throughput Benchmark

@ -41,11 +41,11 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
create a custom Dockerfile on top of the base image with an extra layer that installs them:

```Dockerfile
FROM vllm/vllm-openai:v0.9.0
FROM vllm/vllm-openai:v0.11.0

# e.g. install the `audio` optional dependencies
# NOTE: Make sure the version of vLLM matches the base image!
RUN uv pip install --system vllm[audio]==0.9.0
RUN uv pip install --system vllm[audio]==0.11.0
```
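
Once the Dockerfile is in place, it can be built and run like the stock image; the tag below is arbitrary and the model name is a placeholder:

```bash
docker build -t vllm-openai-audio:v0.11.0 .
docker run --gpus all -p 8000:8000 vllm-openai-audio:v0.11.0 --model <your-model>
```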

!!! tip

@ -167,7 +167,7 @@ class AttentionCGSupport(enum.Enum):
    """NO CUDA Graphs support"""
```

Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code for [this][vllm.v1.worker.gpu_model_runner.GPUModelRunner._check_and_update_cudagraph_mode].
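
The sketch below illustrates that downgrade policy in isolation; it is not the actual `_check_and_update_cudagraph_mode` implementation, and the enum members and ordering are assumed for the example:

```python
from enum import IntEnum

class AttentionCGSupport(IntEnum):   # assumed ordering for this sketch only
    NEVER = 0
    UNIFORM_BATCH = 1
    ALWAYS = 2

def resolve_cudagraph_mode(requested: str,
                           backend_caps: list[AttentionCGSupport]) -> str:
    """Downgrade a requested FULL mode to fit the weakest attention backend."""
    min_cap = min(backend_caps)
    if requested != "FULL":
        return requested
    if min_cap == AttentionCGSupport.NEVER:
        return "PIECEWISE"
    if min_cap == AttentionCGSupport.UNIFORM_BATCH:
        return "FULL_AND_PIECEWISE"
    return "FULL"

print(resolve_cudagraph_mode("FULL", [AttentionCGSupport.ALWAYS,
                                      AttentionCGSupport.UNIFORM_BATCH]))
# FULL_AND_PIECEWISE
```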

The following table lists backends that support full CUDA Graphs at the time of writing.


@ -13,7 +13,6 @@ IOProcessorInput = TypeVar("IOProcessorInput")
IOProcessorOutput = TypeVar("IOProcessorOutput")

class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):

    def __init__(self, vllm_config: VllmConfig):
        self.vllm_config = vllm_config

@ -49,13 +48,24 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
        request_id: str | None = None,
        **kwargs,
    ) -> IOProcessorOutput:
        collected_output = [item async for i, item in model_output]
        # We cannot guarantee outputs are returned in the same order they were
        # fed to vLLM.
        # Let's sort them by id before post_processing
        sorted_output = sorted(
            [(i, item) async for i, item in model_output], key=lambda output: output[0]
        )
        collected_output = [output[1] for output in sorted_output]
        return self.post_process(collected_output, request_id, **kwargs)

    @abstractmethod
    def parse_request(self, request: Any) -> IOProcessorInput:
        raise NotImplementedError

    def validate_or_generate_params(
        self, params: SamplingParams | PoolingParams | None = None
    ) -> SamplingParams | PoolingParams:
        return params or PoolingParams()

    @abstractmethod
    def output_to_response(
        self, plugin_output: IOProcessorOutput
@ -66,10 +76,10 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
The `parse_request` method is used for validating the user prompt and converting it into the input expected by the `pre_process`/`pre_process_async` methods.
The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.

The `validate_or_generate_params` method lets the plugin validate any `SamplingParams`/`PoolingParams` received with the user request, or generate new ones if none are specified. The function always returns the validated/generated parameters.
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/openai/serving_pooling.py).
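
The effect of the ordering fix in `post_process_async` above can be seen with a small self-contained sketch, where a toy async generator stands in for vLLM's model output stream:

```python
import asyncio

async def fake_model_output():
    # results may complete out of order
    for i, item in [(2, "c"), (0, "a"), (1, "b")]:
        yield i, item

async def main():
    # mirror of the fix: collect (index, item) pairs, sort by index, then strip the index
    sorted_output = sorted(
        [(i, item) async for i, item in fake_model_output()],
        key=lambda output: output[0],
    )
    collected_output = [output[1] for output in sorted_output]
    print(collected_output)  # ['a', 'b', 'c']

asyncio.run(main())
```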

An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please, also refer to our online ([examples/online_serving/prithvi_geospatial_mae.py](../../examples/online_serving/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py)) inference examples.
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/online_serving/prithvi_geospatial_mae.py](../../examples/online_serving/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py)) inference examples.

## Using an IO Processor plugin


@ -52,7 +52,7 @@ These are documented under [Inferencing and Serving -> Production Metrics](../us

### Grafana Dashboard

vLLM also provides [a reference example](../examples/online_serving/prometheus_grafana.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
vLLM also provides [a reference example](../../examples/online_serving/prometheus_grafana/README.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.

The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:


@ -52,7 +52,7 @@ th:not(:first-child) {
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](https://github.com/vllm-project/vllm/pull/4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
| best-of | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/7968) | ✅ | ✅ | | |
| beam-search | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/7968) | ❔ | ✅ | ✅ | |
| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](https://github.com/vllm-project/vllm/issues/25096) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |

\* Chunked prefill and prefix caching are only applicable to last-token pooling.
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
@ -75,4 +75,4 @@ th:not(:first-child) {
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/8477) | ✅ | ❌ | ✅ |
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [❌](https://github.com/vllm-project/vllm/issues/25097) | ✅ |
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/25097) | ✅ |

@ -14,11 +14,12 @@ vLLM currently supports the following reasoning models:
| [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ |
| [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
| [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ |
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ |
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ |
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
| [MiniMax-M2](https://huggingface.co/MiniMaxAI/MiniMax-M2) | `minimax_m2_append_think` | `json`, `regex` | ✅ |
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ |
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |

!!! note
    IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.

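With the OpenAI-compatible client this is passed through `extra_body`; the model name and server address below are placeholders for your own deployment:

```python
from openai import OpenAI

# Assumes a vLLM server is already running locally, e.g.
# `vllm serve ibm-granite/granite-3.2-8b-instruct --reasoning-parser granite`.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="ibm-granite/granite-3.2-8b-instruct",
    messages=[{"role": "user", "content": "How many prime numbers are below 30?"}],
    extra_body={"chat_template_kwargs": {"thinking": True}},
)
print(response.choices[0].message)
```
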
@ -321,7 +321,7 @@ Supported models:
Flags:

* For non-reasoning: `--tool-call-parser hunyuan_a13b`
* For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning`
* For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b`

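A complete server launch therefore looks something like the sketch below; the model name is illustrative, and `--enable-auto-tool-choice` is additionally needed so the server can emit tool calls automatically:

```bash
vllm serve tencent/Hunyuan-A13B-Instruct \
  --enable-auto-tool-choice \
  --tool-call-parser hunyuan_a13b \
  --reasoning-parser hunyuan_a13b
```
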
### LongCat-Flash-Chat Models (`longcat`)


@ -1,6 +1,6 @@
# --8<-- [start:installation]

vLLM supports AMD GPUs with ROCm 6.3 or above.
vLLM supports AMD GPUs with ROCm 6.3 or above, and torch 2.8.0 and above.

!!! tip
    [Docker](#set-up-using-docker) is the recommended way to use vLLM on ROCm.
@ -28,57 +28,63 @@ Currently, there are no pre-built ROCm wheels.
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]

!!! tip
    - If you find that the following installation steps do not work for you, please refer to [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base); the Dockerfile itself documents a known-good sequence of installation steps.

0. Install prerequisites (skip if you are already in an environment/docker with the following installed):

    - [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html)
    - [PyTorch](https://pytorch.org/)

    For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.4.3_ubuntu24.04_py3.12_pytorch_release_2.6.0`, `rocm/pytorch-nightly`. If you are using docker image, you can skip to Step 3.
    For installing PyTorch, you can start from a fresh docker image, e.g., `rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0`, `rocm/pytorch-nightly`. If you are using a docker image, you can skip to Step 3.

    Alternatively, you can install PyTorch using PyTorch wheels. You can check the PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/). Example:

    ```bash
    # Install PyTorch
    pip uninstall torch -y
    pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/rocm6.4
    pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm7.0
    ```

1. Install [Triton for ROCm](https://github.com/triton-lang/triton)
1. Install [Triton for ROCm](https://github.com/ROCm/triton.git)

    Install ROCm's Triton (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md)
    Install ROCm's Triton following the instructions from [ROCm/triton](https://github.com/ROCm/triton.git)

    ```bash
    python3 -m pip install ninja cmake wheel pybind11
    pip uninstall -y triton
    git clone https://github.com/triton-lang/triton.git
    git clone https://github.com/ROCm/triton.git
    cd triton
    git checkout e5be006
    # git checkout $TRITON_BRANCH
    git checkout f9e5bf54
    if [ ! -f setup.py ]; then cd python; fi
    python3 setup.py install
    cd ../..
    ```

    !!! note
        If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent.
        - The validated `$TRITON_BRANCH` can be found in the [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base).
        - If you see an HTTP issue related to downloading packages while building triton, please try again, as the HTTP error is intermittent.

2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/Dao-AILab/flash-attention)
2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/Dao-AILab/flash-attention.git)

    Install ROCm's flash attention (v2.7.2) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention#amd-rocm-support)
    Alternatively, wheels intended for vLLM use can be accessed under the releases.
    Install ROCm's flash attention (v2.8.0) following the instructions from [ROCm/flash-attention](https://github.com/Dao-AILab/flash-attention#amd-rocm-support)

    For example, for ROCm 6.3, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`.
    For example, for ROCm 7.0, suppose your gfx arch is `gfx942`. To get your gfx architecture, run `rocminfo |grep gfx`.

    ```bash
    git clone https://github.com/Dao-AILab/flash-attention.git
    cd flash-attention
    git checkout 1a7f4dfa
    # git checkout $FA_BRANCH
    git checkout 0e60e394
    git submodule update --init
    GPU_ARCHS="gfx90a" python3 setup.py install
    GPU_ARCHS="gfx942" python3 setup.py install
    cd ..
    ```
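
    Optionally, the gfx target can be derived from `rocminfo` instead of hard-coded; the same detected value can be reused for `GPU_ARCHS` here and for `PYTORCH_ROCM_ARCH` in step 4 (a small sketch; double-check the detected value before exporting):

    ```bash
    # Pick the first unique gfx target reported by rocminfo.
    export PYTORCH_ROCM_ARCH=$(rocminfo | grep -o 'gfx[0-9a-f]\+' | sort -u | head -n1)
    echo "Building for ${PYTORCH_ROCM_ARCH}"
    ```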

    !!! note
        You might need to downgrade the "ninja" version to 1.10 as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
        - The validated `$FA_BRANCH` can be found in the [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base).


3. If you choose to build AITER yourself to use a certain branch or commit, you can build AITER using the following steps:

@ -92,11 +98,13 @@ Currently, there are no pre-built ROCm wheels.
|
||||
```
|
||||
|
||||
!!! note
|
||||
You will need to config the `$AITER_BRANCH_OR_COMMIT` for your purpose.
|
||||
- You will need to config the `$AITER_BRANCH_OR_COMMIT` for your purpose.
|
||||
- The validated `$AITER_BRANCH_OR_COMMIT` can be found in the [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base).
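    The AITER build commands themselves are elided by the hunk above. As a rough sketch only (the repository URL and exact flags below are assumptions, not taken from this diff), building AITER from a pinned branch or commit usually looks like:

    ```bash
    # Sketch only - consult docker/Dockerfile.rocm_base for the validated steps.
    git clone --recursive https://github.com/ROCm/aiter.git
    cd aiter
    git checkout $AITER_BRANCH_OR_COMMIT
    git submodule update --init --recursive
    python3 setup.py develop
    cd ..
    ```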
4. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps:
4. Build vLLM. For example, vLLM on ROCM 7.0 can be built with the following steps:

    ??? console "Commands"
    ???+ console "Commands"

        ```bash
        pip install --upgrade pip
@@ -109,31 +117,48 @@ Currently, there are no pre-built ROCm wheels.
            scipy \
            huggingface-hub[cli,hf_transfer] \
            setuptools_scm
        pip install "numpy<2"
        pip install -r requirements/rocm.txt

        # Build vLLM for MI210/MI250/MI300.
        export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
        # To build for a single architecture (e.g., MI300) for faster installation (recommended):
        export PYTORCH_ROCM_ARCH="gfx942"

        # To build vLLM for multiple arch MI210/MI250/MI300, use this instead
        # export PYTORCH_ROCM_ARCH="gfx90a;gfx942"

        python3 setup.py develop
        ```

    This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.

    !!! tip
        - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm-up step before collecting perf numbers.
        - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
        - To use CK flash-attention or PyTorch naive attention, please use the flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off Triton flash attention (see the example after this list).
        - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
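    For example, a minimal sketch of disabling Triton flash attention before serving might look like this (the model name below is only an illustrative placeholder, not part of the original instructions):

    ```bash
    # Assumed example: fall back to CK flash-attention or PyTorch naive attention.
    export VLLM_USE_TRITON_FLASH_ATTN=0
    vllm serve meta-llama/Llama-3.1-8B-Instruct --dtype float16
    ```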
    !!! tip
        - For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
          For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
          For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference-optimization/vllm-optimization.html).

# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]

The [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized
docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator.
AMD also offers a nightly prebuilt docker image from [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev), which has vLLM and all its dependencies installed.

???+ console "Commands"
    ```bash
    docker pull rocm/vllm-dev:nightly # to get the latest image
    docker run -it --rm \
        --network=host \
        --group-add=video \
        --ipc=host \
        --cap-add=SYS_PTRACE \
        --security-opt seccomp=unconfined \
        --device /dev/kfd \
        --device /dev/dri \
        -v <path/to/your/models>:/app/models \
        -e HF_HOME="/app/models" \
        rocm/vllm-dev:nightly
    ```

!!! tip
    Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html)
@@ -144,29 +169,29 @@ docker image designed for validating inference performance on the AMD Instinct
Building the Docker image from source is the recommended way to use vLLM with ROCm.

#### (Optional) Build an image with ROCm software stack
??? info "(Optional) Build an image with ROCm software stack"

Build a docker image from [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base) which sets up the ROCm software stack needed by vLLM.
**This step is optional as this rocm_base image is usually prebuilt and stored at [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under tag `rocm/vllm-dev:base` to speed up user experience.**
If you choose to build this rocm_base image yourself, the steps are as follows.
Build a docker image from [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base) which sets up the ROCm software stack needed by vLLM.
**This step is optional as this rocm_base image is usually prebuilt and stored at [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under tag `rocm/vllm-dev:base` to speed up user experience.**
If you choose to build this rocm_base image yourself, the steps are as follows.

It is important that the user kicks off the docker build using buildkit. Either the user puts DOCKER_BUILDKIT=1 as an environment variable when calling the docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
It is important that the user kicks off the docker build using buildkit. Either the user puts DOCKER_BUILDKIT=1 as an environment variable when calling the docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:

```json
{
    "features": {
        "buildkit": true
    }
}
```

```json
{
    "features": {
        "buildkit": true
    }
}
```
To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:
To build vllm on ROCm 7.0 for MI200 and MI300 series, you can use the default:

```bash
DOCKER_BUILDKIT=1 docker build \
    -f docker/Dockerfile.rocm_base \
    -t rocm/vllm-dev:base .
```

```bash
DOCKER_BUILDKIT=1 docker build \
    -f docker/Dockerfile.rocm_base \
    -t rocm/vllm-dev:base .
```

#### Build an image with vLLM

@@ -181,7 +206,7 @@ It is important that the user kicks off the docker build using buildkit. Either
    }
}
```

[docker/Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm) uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
[docker/Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm) uses ROCm 7.0 by default, but also supports ROCm 5.7, 6.0, 6.1, 6.2, 6.3, and 6.4, in older vLLM branches.
It provides flexibility to customize the build of docker image using the following arguments:

- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base)
@@ -189,16 +214,16 @@ It provides flexibility to customize the build of docker image using the followi

Their values can be passed in when running `docker build` with `--build-arg` options.
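For example, a build that overrides the base image could look like the following sketch (the tag values here are only illustrative):

```bash
# Assumed example: pass build arguments such as BASE_IMAGE via --build-arg.
DOCKER_BUILDKIT=1 docker build \
    -f docker/Dockerfile.rocm \
    --build-arg BASE_IMAGE=rocm/vllm-dev:base \
    -t vllm-rocm .
```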
To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:
To build vllm on ROCm 7.0 for MI200 and MI300 series, you can use the default:

```bash
DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm .
```

???+ console "Commands"
    ```bash
    DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm .
    ```

To run the above docker image `vllm-rocm`, use the below command:

??? console "Command"
???+ console "Commands"
    ```bash
    docker run -it \
        --network=host \

@@ -1,4 +1,4 @@
It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands:
On NVIDIA CUDA only, it's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands:

```bash
uv venv --python 3.12 --seed
@@ -65,7 +65,9 @@ ChatCommand = auto_mock("vllm.entrypoints.cli.openai", "ChatCommand")
CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand")
cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
FlexibleArgumentParser = auto_mock("vllm.utils", "FlexibleArgumentParser")
FlexibleArgumentParser = auto_mock(
    "vllm.utils.argparse_utils", "FlexibleArgumentParser"
)


class MarkdownFormatter(HelpFormatter):
@@ -374,8 +374,8 @@ th {
| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ |
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ |
| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | ✅︎ |
| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ |
| `HunYuanDenseV1ForCausalLM` | Hunyuan Dense | `tencent/Hunyuan-7B-Instruct` | ✅︎ | ✅︎ |
| `HunYuanMoEV1ForCausalLM` | Hunyuan-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ |
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | |
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ |
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ |
@@ -390,6 +390,7 @@ th {
| `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ |
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
| `MiniMaxM2ForCausalLM` | MiniMax-M2 | `MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ |
| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ |
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ |
@@ -736,37 +737,6 @@ Some models are supported only via the [Transformers backend](#transformers). Th

!!! note
    To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
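    As a sketch, launching the OpenAI-compatible server with this override might look like the following (only `--hf_overrides` and the model name come from the note above; everything else is assumed):

    ```bash
    # Assumed example: pass the architecture override when starting the server.
    vllm serve TIGER-Lab/Mantis-8B-siglip-llama3 \
        --hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'
    ```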
!!! warning
    The output quality of `AllenAI/Molmo-7B-D-0924` (especially in object localization tasks) has deteriorated in recent updates.

    For the best results, we recommend using the following dependency versions (tested on A10 and L40):

    ??? code "Dependency versions"

        ```text
        # Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40)
        torch==2.5.1
        torchvision==0.20.1
        transformers==4.48.1
        tokenizers==0.21.0
        tiktoken==0.7.0
        vllm==0.7.0

        # Optional but recommended for improved performance and stability
        triton==3.1.0
        xformers==0.0.28.post3
        uvloop==0.21.0
        protobuf==5.29.3
        openai==1.60.2
        opencv-python-headless==4.11.0.86
        pillow==10.4.0

        # Installed FlashAttention (for float16 only)
        flash-attn>=2.5.6  # Not used in float32, but should be documented
        ```

    **Note:** Make sure you understand the security implications of using outdated packages.

!!! note
    The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
    For more details, please see: <https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630>
@@ -805,6 +775,7 @@ The following table lists those that are tested in vLLM.
| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
| `SiglipModel` | SigLIP, SigLIP2 | T / I | `google/siglip-base-patch16-224`, `google/siglip2-base-patch16-224` | | |
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* |

<sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion))
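As a hedged sketch, converting one of the checkpoints marked <sup>C</sup> above into an embedding model might look like this (the exact flag combination is an assumption and may differ between vLLM versions):

```bash
# Assumed example: serve a generative checkpoint converted to an embedding model.
vllm serve royokong/e5-v --runner pooling --convert embed
```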
@@ -5,6 +5,7 @@ Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes
The following open-source RL libraries use vLLM for fast rollouts (sorted alphabetically and non-exhaustive):

- [Cosmos-RL](https://github.com/nvidia-cosmos/cosmos-rl)
- [ms-swift](https://github.com/modelscope/ms-swift/tree/main)
- [NeMo-RL](https://github.com/NVIDIA-NeMo/RL)
- [Open Instruct](https://github.com/allenai/open-instruct)
- [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF)

@@ -18,7 +18,7 @@ from transformers import AutoTokenizer
from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
question_per_audio_count = {
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def create_parser():

@@ -4,7 +4,7 @@
from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def parse_args():

@@ -4,7 +4,7 @@
from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def parse_args():

@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def create_parser():

@@ -4,7 +4,7 @@
from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def parse_args():

@@ -4,7 +4,7 @@
from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def parse_args():

@@ -13,7 +13,7 @@ from typing import NamedTuple

from vllm import LLM, EngineArgs, PromptType, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


class ModelRequestData(NamedTuple):

@@ -8,7 +8,7 @@ for processing prompts with various sampling parameters.
import argparse

from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def create_test_prompts() -> list[tuple[str, SamplingParams]]:

@@ -25,7 +25,7 @@ python load_sharded_state.py \
import dataclasses

from vllm import LLM, EngineArgs, SamplingParams
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def parse_args():

@@ -4,7 +4,7 @@
from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def parse_args():

@@ -4,7 +4,7 @@
from argparse import Namespace

from vllm import LLM, EngineArgs, PoolingParams
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def parse_args():

@@ -4,7 +4,7 @@
from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def parse_args():

@@ -5,7 +5,7 @@
from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def parse_args():
@@ -64,7 +64,7 @@ class PrithviMAE:
        }

        prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}
        outputs = self.model.encode(prompt, use_tqdm=False)
        outputs = self.model.encode(prompt, pooling_task="plugin", use_tqdm=False)

        return outputs[0].outputs.data

@@ -6,14 +6,14 @@ import os
import torch

from vllm import LLM
from vllm.pooling_params import PoolingParams

# This example shows how to perform an offline inference that generates
# multimodal data. In this specific case this example will take a geotiff
# image as input, process it using the multimodal data processor, and
# perform inference.
# Requirement - install plugin at:
# https://github.com/christian-pinto/prithvi_io_processor_plugin
# Requirements:
# - install TerraTorch v1.1 (or later):
#   pip install terratorch>=v1.1


def main():
@@ -36,16 +36,12 @@ def main():
        # to avoid the model going OOM.
        # The maximum number depends on the available GPU memory
        max_num_seqs=32,
        io_processor_plugin="prithvi_to_tiff",
        io_processor_plugin="terratorch_segmentation",
        model_impl="terratorch",
        enable_mm_embeds=True,
    )

    pooling_params = PoolingParams(task="token_classify", activation=False)
    pooler_output = llm.encode(
        img_prompt,
        pooling_params=pooling_params,
    )
    pooler_output = llm.encode(img_prompt, pooling_task="plugin")
    output = pooler_output[0].outputs

    print(output)
@@ -13,7 +13,7 @@ from tqdm import tqdm
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

DURATION_MS = int(os.getenv("VLLM_TPU_PROFILE_DURATION_MS", 3000))
DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0))

@@ -13,7 +13,7 @@ from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.multimodal.image import convert_image_mode
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


class QueryResult(NamedTuple):

@@ -30,7 +30,7 @@ from pathlib import Path

from vllm import LLM, EngineArgs
from vllm.model_executor.model_loader import ShardedStateLoader
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def parse_args():

@@ -9,7 +9,7 @@ from vllm.inputs import TokensPrompt
from vllm.v1.metrics.reader import Counter, Vector

try:
    from vllm.utils import FlexibleArgumentParser
    from vllm.utils.argparse_utils import FlexibleArgumentParser
except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser


@@ -22,7 +22,7 @@ from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import convert_image_mode
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


class ModelRequestData(NamedTuple):

@@ -18,7 +18,7 @@ from transformers import AutoProcessor, AutoTokenizer
from vllm import LLM, EngineArgs, SamplingParams
from vllm.lora.request import LoRARequest
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

QUESTION = "What is the content of each image?"
IMAGE_URLS = [
@@ -44,6 +44,7 @@ class ModelRequestData(NamedTuple):
    stop_token_ids: list[int] | None = None
    chat_template: str | None = None
    lora_requests: list[LoRARequest] | None = None
    sampling_params: SamplingParams | None = None


# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@@ -201,6 +202,46 @@ def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
    )


def load_deepseek_ocr(question: str, image_urls: list[str]) -> ModelRequestData:
    from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor

    model_name = "deepseek-ai/DeepSeek-OCR"

    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        logits_processors=[NGramPerReqLogitsProcessor],
    )

    placeholder = "<image>\n" * len(image_urls)
    prompt = placeholder + question

    # The following sampling params config is taken from
    # the official Deepseek-OCR inference example.
    # (IMPORTANT) Use the custom logits processor and avoid skipping
    # special tokens for this model for the optimal OCR performance.
    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=8192,
        # ngram logit processor args
        extra_args=dict(
            ngram_size=30,
            window_size=90,
            # whitelist: <td>, </td>
            whitelist_token_ids={128821, 128822},
        ),
        skip_special_tokens=False,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
        sampling_params=sampling_params,
    )


def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "google/gemma-3-4b-it"
@@ -1253,6 +1294,7 @@ model_example_map = {
    "bee": load_bee,
    "command_a_vision": load_command_a_vision,
    "deepseek_vl_v2": load_deepseek_vl2,
    "deepseek_ocr": load_deepseek_ocr,
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
    "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
@@ -1325,8 +1367,12 @@ def run_chat(model: str, question: str, image_urls: list[str], seed: int | None)
    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
    sampling_params = (
        SamplingParams(
            temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
        )
        if req_data.sampling_params is None
        else req_data.sampling_params
    )
    outputs = llm.chat(
        [
@@ -18,7 +18,7 @@ from PIL.Image import Image
from vllm import LLM, EngineArgs
from vllm.entrypoints.score_utils import ScoreMultiModalParam
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

ROOT_DIR = Path(__file__).parent.parent.parent
EXAMPLES_DIR = ROOT_DIR / "examples"
@@ -110,6 +110,53 @@ def run_e5_v(query: Query) -> ModelRequestData:
    )


def run_jinavl_reranker(query: Query) -> ModelRequestData:
    if query["modality"] != "text+images":
        raise ValueError(f"Unsupported query modality: '{query['modality']}'")

    engine_args = EngineArgs(
        model="jinaai/jina-reranker-m0",
        runner="pooling",
        max_model_len=32768,
        trust_remote_code=True,
        mm_processor_kwargs={
            "min_pixels": 3136,
            "max_pixels": 602112,
        },
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        query=query["text"],
        documents=query["image"],
    )


def run_siglip(query: Query) -> ModelRequestData:
    if query["modality"] == "text":
        prompt = query["text"]
        image = None
    elif query["modality"] == "image":
        prompt = ""  # For image input, make sure that the prompt text is empty
        image = query["image"]
    else:
        modality = query["modality"]
        raise ValueError(f"Unsupported query modality: '{modality}'")

    engine_args = EngineArgs(
        model="google/siglip-base-patch16-224",
        runner="pooling",
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image=image,
    )


def _get_vlm2vec_prompt_image(query: Query, image_token: str):
    if query["modality"] == "text":
        text = query["text"]
@@ -211,29 +258,6 @@ def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:
    )


def run_jinavl_reranker(query: Query) -> ModelRequestData:
    if query["modality"] != "text+images":
        raise ValueError(f"Unsupported query modality: '{query['modality']}'")

    engine_args = EngineArgs(
        model="jinaai/jina-reranker-m0",
        runner="pooling",
        max_model_len=32768,
        trust_remote_code=True,
        mm_processor_kwargs={
            "min_pixels": 3136,
            "max_pixels": 602112,
        },
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        query=query["text"],
        documents=query["image"],
    )


def get_query(modality: QueryModality):
    if modality == "text":
        return TextQuery(modality="text", text="A dog sitting in the grass")
@@ -328,9 +352,10 @@ def run_score(model: str, modality: QueryModality, seed: int | None):
model_example_map = {
    "clip": run_clip,
    "e5_v": run_e5_v,
    "jinavl_reranker": run_jinavl_reranker,
    "siglip": run_siglip,
    "vlm2vec_phi3v": run_vlm2vec_phi3v,
    "vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
    "jinavl_reranker": run_jinavl_reranker,
}
@@ -530,7 +530,7 @@ spec:
        name: accelerators-thanos-querier-datasource
      # Multiply by 100 so we can read it as a percentage without setting a unit (avoids CUE unit conflicts)
      query: >
        100 * avg(vllm:gpu_cache_usage_perc)
        100 * avg(vllm:kv_cache_usage_perc)

  "18":
    kind: Panel

@@ -98,7 +98,7 @@ spec:
          kind: PrometheusTimeSeriesQuery
          spec:
            datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
            query: avg(vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) or vector(0)
            query: avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) or vector(0)
            minStep: "15s"

  core_running_ts:
@@ -168,7 +168,7 @@ spec:
          spec:
            datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
            # multiply by 100 to present percentage; omit format.unit to avoid schema conflicts
            query: (avg(vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
            query: (avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
            minStep: "15s"

  core_kv_usage_pct_ts:
@@ -187,7 +187,7 @@ spec:
          kind: PrometheusTimeSeriesQuery
          spec:
            datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
            query: (avg by (service) (vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
            query: (avg by (service) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
            minStep: "15s"

  # --- Per-Pod breakdowns (works on Simulator & Real) ---
@@ -246,7 +246,7 @@ spec:
          spec:
            datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
            # if your exporter labels kv metric with pod (the sim does), this works; otherwise it will just return empty
            query: (avg by (pod) (vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
            query: (avg by (pod) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
            minStep: "15s"

  # --- Real vLLM only (zeros on simulator) ---
@@ -26,7 +26,7 @@ import requests
from openai import OpenAI
from utils import get_first_model

from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"

@@ -83,6 +83,109 @@ def run_clip(client: OpenAI, model: str):
    print("Text embedding output:", response.data[0].embedding)


def run_dse_qwen2_vl(client: OpenAI, model: str):
    """
    Start the server using:

    vllm serve MrLight/dse-qwen2-2b-mrl-v1 \
        --runner pooling \
        --trust-remote-code \
        --max-model-len 8192 \
        --chat-template examples/template_dse_qwen2_vl.jinja
    """
    response = create_chat_embeddings(
        client,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url,
                        },
                    },
                    {"type": "text", "text": "What is shown in this image?"},
                ],
            }
        ],
        model=model,
        encoding_format="float",
    )

    print("Image embedding output:", response.data[0].embedding)

    # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
    # of the minimum input size
    buffer = io.BytesIO()
    image_placeholder = Image.new("RGB", (56, 56))
    image_placeholder.save(buffer, "png")
    buffer.seek(0)
    image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
    response = create_chat_embeddings(
        client,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{image_placeholder}",
                        },
                    },
                    {"type": "text", "text": "Query: What is the weather like today?"},
                ],
            }
        ],
        model=model,
        encoding_format="float",
    )

    print("Text embedding output:", response.data[0].embedding)


def run_siglip(client: OpenAI, model: str):
    """
    Start the server using:

    vllm serve google/siglip-base-patch16-224 \
        --runner pooling
    """

    response = create_chat_embeddings(
        client,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ],
        model=model,
        encoding_format="float",
    )

    print("Image embedding output:", response.data[0].embedding)

    response = create_chat_embeddings(
        client,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "a photo of a cat"},
                ],
            }
        ],
        model=model,
        encoding_format="float",
    )

    print("Text embedding output:", response.data[0].embedding)


def run_vlm2vec(client: OpenAI, model: str):
    """
    Start the server using:
@@ -148,72 +251,11 @@ def run_vlm2vec(client: OpenAI, model: str):
    print("Text embedding output:", response.data[0].embedding)


def run_dse_qwen2_vl(client: OpenAI, model: str):
    """
    Start the server using:

    vllm serve MrLight/dse-qwen2-2b-mrl-v1 \
        --runner pooling \
        --trust-remote-code \
        --max-model-len 8192 \
        --chat-template examples/template_dse_qwen2_vl.jinja
    """
    response = create_chat_embeddings(
        client,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url,
                        },
                    },
                    {"type": "text", "text": "What is shown in this image?"},
                ],
            }
        ],
        model=model,
        encoding_format="float",
    )

    print("Image embedding output:", response.data[0].embedding)

    # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
    # of the minimum input size
    buffer = io.BytesIO()
    image_placeholder = Image.new("RGB", (56, 56))
    image_placeholder.save(buffer, "png")
    buffer.seek(0)
    image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
    response = create_chat_embeddings(
        client,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{image_placeholder}",
                        },
                    },
                    {"type": "text", "text": "Query: What is the weather like today?"},
                ],
            }
        ],
        model=model,
        encoding_format="float",
    )

    print("Text embedding output:", response.data[0].embedding)


model_example_map = {
    "clip": run_clip,
    "vlm2vec": run_vlm2vec,
    "dse_qwen2_vl": run_dse_qwen2_vl,
    "siglip": run_siglip,
    "vlm2vec": run_vlm2vec,
}
@@ -11,14 +11,14 @@ import requests
# image as input, process it using the multimodal data processor, and
# perform inference.
# Requirements:
# - install plugin at:
#   https://github.com/christian-pinto/prithvi_io_processor_plugin
# - install TerraTorch v1.1 (or later):
#   pip install terratorch>=v1.1
# - start vllm in serving mode with the below args
#   --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
#   --model-impl terratorch
#   --task embed --trust-remote-code
#   --skip-tokenizer-init --enforce-eager
#   --io-processor-plugin prithvi_to_tiff
#   --io-processor-plugin terratorch_segmentation
#   --enable-mm-embeds

@@ -35,7 +35,6 @@ def main():
        },
        "priority": 0,
        "model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
        "softmax": False,
    }

    ret = requests.post(server_endpoint, json=request_payload_url)
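Putting the serving flags from the comments above together, a launch command might look like the following sketch (the flags are copied from those comments; passing the model positionally and any defaults are assumptions):

```bash
# Assumed example: start the server with the arguments listed in the comments above.
vllm serve christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM \
    --model-impl terratorch \
    --task embed --trust-remote-code \
    --skip-tokenizer-init --enforce-eager \
    --io-processor-plugin terratorch_segmentation \
    --enable-mm-embeds
```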
@@ -852,7 +852,7 @@
        "uid": "${DS_PROMETHEUS}"
      },
      "editorMode": "code",
      "expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}",
      "expr": "vllm:kv_cache_usage_perc{model_name=\"$model_name\"}",
      "instant": false,
      "legendFormat": "GPU Cache Usage",
      "range": true,

@@ -16,7 +16,7 @@ from vllm.model_executor.model_loader.tensorizer import (
    tensorize_vllm_model,
    tensorizer_kwargs_arg,
)
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

logger = logging.getLogger()
@@ -13,3 +13,5 @@ torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytor
# xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.4.1
# Triton Kernels are needed for mxfp4 fused moe. (Should be updated alongside torch)
triton_kernels @ git+https://github.com/triton-lang/triton.git@v3.5.0#subdirectory=python/triton_kernels
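If needed outside of the requirements file, the same pinned dependency can be installed directly with pip; this simply restates the requirement line above as a command:

```bash
# Install the pinned Triton kernels used for the mxfp4 fused MoE path.
pip install "triton_kernels @ git+https://github.com/triton-lang/triton.git@v3.5.0#subdirectory=python/triton_kernels"
```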
Some files were not shown because too many files have changed in this diff.