# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Adapted from sglang quantization/tuning_block_wise_kernel.py

import argparse
import json
import multiprocessing as mp
import os
import time
from datetime import datetime
from typing import Any

import torch
from tqdm import tqdm

from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    _w8a8_block_fp8_matmul,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser

mp.set_start_method("spawn", force=True)

assert current_platform.is_cuda(), (
    "Only support tune w8a8 block fp8 kernel on CUDA device."
)

DTYPE_MAP = {
    "float32": torch.float32,
    "float16": torch.float16,
    "half": torch.half,
    "bfloat16": torch.bfloat16,
}


def w8a8_block_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    block_size: list[int],
    config: dict[str, Any],
    output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
    """This function performs matrix multiplication with
    block-wise quantization.

    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
    The output is returned in the specified `output_dtype`.

    Args:
        A: The input tensor, e.g., activation.
        B: The input tensor, e.g., weight.
        As: The per-token-group quantization scale for `A`.
        Bs: The per-block quantization scale for `B`.
        block_size: The block size for per-block quantization.
            It should be 2-dim, e.g., [128, 128].
        output_dtype: The dtype of the returned tensor.

    Returns:
        torch.Tensor: The result of matmul.
    """
    assert len(block_size) == 2
    block_n, block_k = block_size[0], block_size[1]

    assert A.shape[-1] == B.shape[-1]
    assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
    M = A.numel() // A.shape[-1]

    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
    N, K = B.shape
    assert triton.cdiv(N, block_n) == Bs.shape[0]
    assert triton.cdiv(K, block_k) == Bs.shape[1]

    C_shape = A.shape[:-1] + (N,)
    C = A.new_empty(C_shape, dtype=output_dtype)

    def grid(META):
        return (
            triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
        )

    if A.dtype == torch.float8_e4m3fn:
        kernel = _w8a8_block_fp8_matmul
    else:
        raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")

    kernel[grid](
        A,
        B,
        C,
        As,
        Bs,
        M,
        N,
        K,
        block_n,
        block_k,
        A.stride(-2),
        A.stride(-1),
        B.stride(1),
        B.stride(0),
        C.stride(-2),
        C.stride(-1),
        As.stride(-2),
        As.stride(-1),
        Bs.stride(1),
        Bs.stride(0),
        **config,
    )

    return C


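# Illustrative usage of w8a8_block_matmul (not executed here): for the default
# [128, 128] block size, the asserts above imply shapes/dtypes like
#   A  : (M, K)                          torch.float8_e4m3fn  (activation)
#   B  : (N, K)                          torch.float8_e4m3fn  (weight)
#   As : (M, ceil(K / 128))              torch.float32        (per-token-group scales)
#   Bs : (ceil(N / 128), ceil(K / 128))  torch.float32        (per-block scales)
#   C = w8a8_block_matmul(A, B, As, Bs, [128, 128], config, torch.float16)
# and C comes back with shape (M, N) in the requested output dtype.

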
def get_configs_compute_bound():
    configs = []
    for num_stages in [2, 3, 4, 5]:
        for block_m in [16, 32, 64, 128, 256]:
            for block_k in [64, 128]:
                for block_n in [32, 64, 128, 256]:
                    for num_warps in [4, 8]:
                        for group_size in [1, 16, 32, 64]:
                            configs.append(
                                {
                                    "BLOCK_SIZE_M": block_m,
                                    "BLOCK_SIZE_N": block_n,
                                    "BLOCK_SIZE_K": block_k,
                                    "GROUP_SIZE_M": group_size,
                                    "num_warps": num_warps,
                                    "num_stages": num_stages,
                                }
                            )
    return configs


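# For reference, get_configs_compute_bound enumerates
# 4 (num_stages) * 5 (BLOCK_SIZE_M) * 2 (BLOCK_SIZE_K) * 4 (BLOCK_SIZE_N)
# * 2 (num_warps) * 4 (GROUP_SIZE_M) = 1280 candidate configs. With the default
# --block-k 128, both BLOCK_SIZE_K values (64 and 128) divide 128, so the
# filter in tune_on_gpu keeps the full search space.

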
def get_weight_shapes(tp_size):
    # NOTE(HandH1998): These weight shapes only work for DeepSeek-V3.
    # Modify them if you tune for a different model.
    # cannot TP
    total = [
        (512 + 64, 7168),
        (2112, 7168),
        ((128 + 64) * 128, 7168),
        (128 * (128 + 128), 512),
        (7168, 16384),
        (7168, 18432),
    ]
    # N can TP
    n_tp = [
        (18432 * 2, 7168),
        ((128 + 64) * 128, 7168),
        (128 * (128 + 128), 512),
        (24576, 1536),
        (12288, 7168),
        (4096, 7168),
    ]
    # K can TP
    k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)]

    weight_shapes = []
    for t in total:
        weight_shapes.append(t)
    for n_t in n_tp:
        new_t = (n_t[0] // tp_size, n_t[1])
        weight_shapes.append(new_t)
    for k_t in k_tp:
        new_t = (k_t[0], k_t[1] // tp_size)
        weight_shapes.append(new_t)
    return weight_shapes


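# Illustrative note: with the DeepSeek-V3 dimensions above and tp_size=8,
# get_weight_shapes shrinks the N-parallel shapes along N and the K-parallel
# shapes along K, e.g. (18432 * 2, 7168) -> (4608, 7168) and
# (7168, 18432) -> (7168, 2304), while the "cannot TP" shapes are kept as-is.

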
def benchmark_config(
    A, B, As, Bs, block_size, config, out_dtype=torch.float16, num_iters=10
):
    def run():
        w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)

    torch.cuda.synchronize()
    # JIT compilation & warmup
    for _ in range(5):
        run()
    torch.cuda.synchronize()

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    latencies: list[float] = []
    for i in range(num_iters):
        torch.cuda.synchronize()
        start_event.record()
        run()
        end_event.record()
        end_event.synchronize()
        latencies.append(start_event.elapsed_time(end_event))
    avg = sum(latencies) / (num_iters * 10) * 1000  # us
    return avg


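# Note: benchmark_config times the kernel with CUDA events and returns an
# averaged value; in this script that value is only compared between candidate
# configs inside tune(), so only its relative ordering matters.

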
def tune(M, N, K, block_size, out_dtype, search_space, input_type):
    factor_for_scale = 1e-2

    if input_type == "fp8":
        fp8_info = torch.finfo(torch.float8_e4m3fn)
        fp8_max, fp8_min = fp8_info.max, fp8_info.min

        A_fp32 = (
            (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
        )
        A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

        B_fp32 = (
            (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
        )
        B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
    else:
        raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")

    block_n, block_k = block_size[0], block_size[1]
    n_tiles = (N + block_n - 1) // block_n
    k_tiles = (K + block_k - 1) // block_k

    As = torch.rand(M, k_tiles, dtype=torch.float32, device="cuda") * factor_for_scale
    Bs = (
        torch.rand(n_tiles, k_tiles, dtype=torch.float32, device="cuda")
        * factor_for_scale
    )

    best_config = None
    best_time = float("inf")
    for config in tqdm(search_space):
        try:
            kernel_time = benchmark_config(
                A,
                B,
                As,
                Bs,
                block_size,
                config,
                out_dtype,
                num_iters=10,
            )
        except triton.runtime.autotuner.OutOfResources:
            # Some configurations may be invalid and fail to compile.
            continue

        if kernel_time < best_time:
            best_time = kernel_time
            best_config = config
    now = datetime.now()
    print(f"[{now.ctime()}] Completed tuning for batch_size={M}")
    assert best_config is not None
    return best_config


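# tune() returns one entry from get_configs_compute_bound(), e.g. (values are
# illustrative, not an actual tuned result):
#   {
#       "BLOCK_SIZE_M": 64,
#       "BLOCK_SIZE_N": 128,
#       "BLOCK_SIZE_K": 128,
#       "GROUP_SIZE_M": 32,
#       "num_warps": 4,
#       "num_stages": 3,
#   }

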
def save_configs(
    N,
    K,
    block_n,
    block_k,
    configs,
    save_path,
    input_type="fp8",
) -> None:
    os.makedirs(save_path, exist_ok=True)
    device_name = current_platform.get_device_name().replace(" ", "_")
    json_file_name = (
        f"N={N},K={K},device_name={device_name},dtype={input_type}_w8a8,"
        f"block_shape=[{block_n},{block_k}].json"
    )

    config_file_path = os.path.join(save_path, json_file_name)
    print(f"Writing best config to {config_file_path}...")

    with open(config_file_path, "w") as f:
        json.dump(configs, f, indent=4)
        f.write("\n")


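# Illustrative example: with the defaults (--block-n 128 --block-k 128, fp8
# input) and a device whose name is reported as, say, "NVIDIA H100 80GB HBM3",
# a shape of N=4608, K=7168 would be written to
#   N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
# under --save-path. The actual device name comes from
# current_platform.get_device_name() at runtime.

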
def tune_on_gpu(args_dict):
    """Run tuning on a specific GPU."""
    gpu_id = args_dict["gpu_id"]
    batch_sizes = args_dict["batch_sizes"]
    weight_shapes = args_dict["weight_shapes"]
    args = args_dict["args"]

    torch.cuda.set_device(gpu_id)
    print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")

    block_n = args.block_n
    block_k = args.block_k
    out_dtype = DTYPE_MAP[args.out_dtype]
    save_path = args.save_path
    input_type = args.input_type

    search_space = get_configs_compute_bound()
    search_space = [
        config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0
    ]

    start = time.time()
    for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
        N, K = shape[0], shape[1]
        print(f"[GPU {gpu_id}] Tune for weight shape of `N: {N}, K: {K}`")
        benchmark_results = [
            tune(
                batch_size,
                N,
                K,
                [block_n, block_k],
                out_dtype,
                search_space,
                input_type,
            )
            for batch_size in tqdm(batch_sizes, desc=f"GPU {gpu_id} - Batch sizes")
        ]
        best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)}
        save_configs(N, K, block_n, block_k, best_configs, save_path, input_type)

    end = time.time()
    print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")


def distribute_batch_sizes(batch_sizes, num_gpus):
    """Distribute batch sizes across available GPUs."""
    batches_per_gpu = []
    for i in range(num_gpus):
        start_idx = i * len(batch_sizes) // num_gpus
        end_idx = (i + 1) * len(batch_sizes) // num_gpus
        batches_per_gpu.append(batch_sizes[start_idx:end_idx])
    return batches_per_gpu


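# Illustrative example: with the 18 default batch sizes and 8 GPUs, the integer
# arithmetic in distribute_batch_sizes yields per-GPU slice sizes of
# [2, 2, 2, 3, 2, 2, 2, 3]; since 18 is not divisible by 8, some GPUs get one
# extra batch size.

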
def main(args):
    print(args)
    num_gpus = torch.cuda.device_count()
    if num_gpus == 0:
        raise RuntimeError("No GPU available for tuning")
    print(f"Found {num_gpus} GPUs for parallel tuning")

    torch.cuda.init()

    if args.batch_size is None:
        batch_sizes = [
            1,
            2,
            4,
            8,
            16,
            24,
            32,
            48,
            64,
            96,
            128,
            256,
            512,
            1024,
            1536,
            2048,
            3072,
            4096,
        ]
    else:
        batch_sizes = [args.batch_size]
        num_gpus = 1  # If only one batch size, use only one GPU

    weight_shapes = get_weight_shapes(args.tp_size)

    batches_per_gpu = distribute_batch_sizes(batch_sizes, num_gpus)

    process_args = []
    for gpu_id in range(num_gpus):
        process_args.append(
            {
                "gpu_id": gpu_id,
                "batch_sizes": batches_per_gpu[gpu_id],
                "weight_shapes": weight_shapes,  # Each GPU processes all weight shapes
                "args": args,
            }
        )

    ctx = mp.get_context("spawn")
    with ctx.Pool(num_gpus) as pool:
        pool.map(tune_on_gpu, process_args)

    print("Multi-GPU tuning completed")


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description="""
Tune triton w8a8 block fp8 for DeepSeek-V3/DeepSeek-R1:
    python3 benchmark_w8a8_block_fp8.py --tp-size 8 --input-type fp8
Then copy to model_executor/layers/quantization/utils/configs
""",
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument("--tp-size", "-tp", type=int, default=8)
    parser.add_argument("--input-type", type=str, choices=["fp8"], default="fp8")
    parser.add_argument(
        "--out-dtype",
        type=str,
        choices=["float32", "float16", "bfloat16", "half"],
        default="float16",
    )
    parser.add_argument("--block-n", type=int, default=128)
    parser.add_argument("--block-k", type=int, default=128)
    parser.add_argument("--batch-size", type=int, required=False)
    parser.add_argument("--save-path", type=str, default="./")
    args = parser.parse_args()

    main(args)
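# Example invocation (assumes 8 visible CUDA GPUs; the flags are the ones
# defined above, with the fp8 defaults spelled out):
#
#   python3 benchmark_w8a8_block_fp8.py \
#       --tp-size 8 --input-type fp8 --out-dtype float16 \
#       --block-n 128 --block-k 128 --save-path ./configs
#
# Omitting --batch-size tunes the full default batch-size sweep and shards it
# across all visible GPUs; passing a single --batch-size restricts the run to
# one GPU.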