# Mirror of https://github.com/vllm-project/vllm.git (synced 2025-11-04 09:24:33 +08:00)
			
		
		
		
	Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io> Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com> Signed-off-by: Huzaifa Sidhpurwala <huzaifas@redhat.com> Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: Animesh Jain <anijain@umich.edu> Signed-off-by: Rui Qiao <ruisearch42@gmail.com> Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com> Signed-off-by: Nick Hill <nhill@redhat.com> Signed-off-by: yewentao256 <zhyanwentao@126.com> Signed-off-by: kf <kuanfu.liu@embeddedllm.com> Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com> Signed-off-by: NickLucche <nlucches@redhat.com> Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com> Signed-off-by: Sage Moore <sage@neuralmagic.com> Signed-off-by: tjtanaavllm <tunjian.tan@amd.com> Signed-off-by: Yong Hoon Shin <yhshin@meta.com> Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com> Signed-off-by: Roger Wang <hey@rogerw.me> Signed-off-by: Vadim Gimpelson <vadim.gimpelson@centml.ai> Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> Signed-off-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com> Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Signed-off-by: yan <yan.ma@intel.com> Signed-off-by: Yan Ma <yan.ma@intel.com> Signed-off-by: Xiao Liu <xiszishu@gmail.com> Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com> Signed-off-by: LopezCastroRoberto <roberto.lopez.castro@udc.es> Signed-off-by: Andy Xie <andy.xning@gmail.com> Signed-off-by: Haibin Lin <haibin.lin@bytedance.com> Signed-off-by: David Ben-David <davidb@pliops.com> Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Signed-off-by: 
jiang1.li <jiang1.li@intel.com> Signed-off-by: Seiji Eicher <seiji@anyscale.com> Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com> Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> Signed-off-by: Abirdcfly <fp544037857@gmail.com> Signed-off-by: Giancarlo Delfin <gdelfin@meta.com> Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com> Signed-off-by: huangweixiao <huangweixiao@msh.team> Signed-off-by: alyosha-swamy <raghav@arcee.ai> Signed-off-by: Eric Hanley <ericehanley@google.com> Signed-off-by: Abatom <abzhonghua@gmail.com> Signed-off-by: CLFutureX <775523362@qq.com> Signed-off-by: Linkun Chen <github@lkchen.net> Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com> Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> Signed-off-by: tlipoca9 <tlipoca9@gmail.com> Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> Signed-off-by: zitian zhao <zitian.zhao@tencentmusic.com> Signed-off-by: mgoin <michael@neuralmagic.com> Signed-off-by: wang.yuqi <noooop@126.com> Signed-off-by: Benji Beck <benjibeck@meta.com> Signed-off-by: Siyuan Liu <lsiyuan@google.com> Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai> Signed-off-by: isotr0py <2037008807@qq.com> Signed-off-by: Chen Zhang <zhangch99@outlook.com> Signed-off-by: simon-mo <xmo@berkeley.edu> Signed-off-by: LucasWilkinson <lwilkinson@neuralmagic.com> Signed-off-by: Zhang Jason <ning.zhang2@amd.com> Signed-off-by: Yongye Zhu <zyy1102000@gmail.com> Signed-off-by: asafg <asafg@ai21.com> Signed-off-by: Siyuan Fu <siyuanf@nvidia.com> Signed-off-by: Lain <fusiyuan2000@hotmail.com> Signed-off-by: Max de Bayser <mbayser@br.ibm.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: Kunshang Ji <kunshang.ji@intel.com> Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Signed-off-by: QscQ <qscqesze@gmail.com> Signed-off-by: qingjun <qingjun@minimaxi.com> Signed-off-by: Syed Muhammad Bin Asif 
<syedmba7@connect.hku.hk> Signed-off-by: Lionel Villard <villard@us.ibm.com> Signed-off-by: ycyaw66 <497410282@qq.com> Signed-off-by: David Chen <530634352@qq.com> Signed-off-by: Linkun <github@lkchen.net> Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> Signed-off-by: Ming Yang <minos.future@gmail.com> Signed-off-by: Adrian Garcia <adrian.garcia@inceptionai.ai> Signed-off-by: shaojunqi <shaojunqi.sjq@alibaba-inc.com> Signed-off-by: Ricardo Decal <rdecal@anyscale.com> Signed-off-by: Andrew Chan <andrewkchan.akc@gmail.com> Signed-off-by: Felix Marty <Felix.Marty@amd.com> Signed-off-by: Andrew Sansom <andrew@protopia.ai> Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com> Signed-off-by: Shu Wang <shuw@nvidia.com> Signed-off-by: Po-Han Huang <pohanh@nvidia.com> Signed-off-by: Shu Wang. <shuw@nvidia.com> Signed-off-by: XIn Li <xinli@nvidia.com> Signed-off-by: Junhao Li <junhao@ubicloud.com> Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com> Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com> Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com> Signed-off-by: <zyy1102000@gmail.com> Signed-off-by: Guy Stone <guys@spotify.com> Signed-off-by: <yyweiss@gmail.com> Signed-off-by: yyw <yyweiss@gmail.com> Signed-off-by: Russell Bryant <rbryant@redhat.com> Signed-off-by: Pradyun Ramadorai <pradyunr@amazon.com> Signed-off-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com> Signed-off-by: Jinzhen Lin <jinzhen.ljz@antgroup.com> Co-authored-by: rongfu.leng <rongfu.leng@daocloud.io> Co-authored-by: Huzaifa Sidhpurwala <huzaifas@redhat.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Russell Bryant <rbryant@redhat.com> Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com> Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Jee Jee Li 
<pandaleefree@gmail.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Animesh Jain <jainanimesh2305@yahoo.com> Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Co-authored-by: XiongfeiWei <isaacwxf23@gmail.com> Co-authored-by: Nick Hill <nhill@redhat.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: JartX <sagformas@gmail.com> Co-authored-by: fhl2000 <63384265+fhl2000@users.noreply.github.com> Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com> Co-authored-by: kf <kuanfu.liu@embeddedllm.com> Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com> Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com> Co-authored-by: Sage Moore <sage@neuralmagic.com> Co-authored-by: tjtanaavllm <tunjian.tan@amd.com> Co-authored-by: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Co-authored-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com> Co-authored-by: Roger Wang <hey@rogerw.me> Co-authored-by: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Co-authored-by: Yuxuan Zhang <2448370773@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Yan Ma <yan.ma@intel.com> Co-authored-by: Xiao <xiszishu@gmail.com> Co-authored-by: jiahanc <173873397+jiahanc@users.noreply.github.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com> Co-authored-by: Roberto L. 
Castro <38211239+LopezCastroRoberto@users.noreply.github.com> Co-authored-by: Ning Xie <andy.xning@gmail.com> Co-authored-by: H <linhaibin.eric@gmail.com> Co-authored-by: David Ben-David <sdavidbd@gmail.com> Co-authored-by: David Ben-David <davidb@pliops.com> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: Li, Jiang <jiang1.li@intel.com> Co-authored-by: TankNee <nee@tanknee.cn> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Co-authored-by: ZiTian.Zhao <zitian.zhao@tencentmusic.com> Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com> Co-authored-by: Abirdcfly <fp544037857@gmail.com> Co-authored-by: Giancarlo Delfin <32987265+TheEpicDolphin@users.noreply.github.com> Co-authored-by: Chenxi Yang <cxyang@cs.utexas.edu> Co-authored-by: Chenxi Yang <cxyang@meta.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Weixiao Huang <hwx.simle@gmail.com> Co-authored-by: Raghav Ravishankar <113712354+alyosha-swamy@users.noreply.github.com> Co-authored-by: ericehanley <ericehanley@google.com> Co-authored-by: Zhonghua Deng <abzhonghua@gmail.com> Co-authored-by: Po-Han Huang (NVIDIA) <53919306+nvpohanh@users.noreply.github.com> Co-authored-by: PiteXChen <44110731+CLFutureX@users.noreply.github.com> Co-authored-by: lkchen <github@lkchen.net> Co-authored-by: TJian <tunjian.tan@embeddedllm.com> Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Co-authored-by: tlipoca9 <160737620+tlipoca9@users.noreply.github.com> Co-authored-by: elvischenv <219235043+elvischenv@users.noreply.github.com> Co-authored-by: wang.yuqi <noooop@126.com> Co-authored-by: Benji Beck <benjibeck@meta.com> Co-authored-by: youkaichao <youkaichao@gmail.com> Co-authored-by: Siyuan Liu <lsiyuan@google.com> Co-authored-by: Benjamin Chislett <chislett.ben@gmail.com> Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com> Co-authored-by: 
simon-mo <xmo@berkeley.edu> Co-authored-by: Chen Zhang <zhangch99@outlook.com> Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com> Co-authored-by: Yongye Zhu <zyy1102000@gmail.com> Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Co-authored-by: Zhang Jason <ning.zhang2@amd.com> Co-authored-by: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Co-authored-by: asafg <asafg@ai21.com> Co-authored-by: Lain <siyuanf@nvidia.com> Co-authored-by: tc-mb <157115220+tc-mb@users.noreply.github.com> Co-authored-by: imning3 <hbning@pku.edu.cn> Co-authored-by: Maximilien de Bayser <mbayser@br.ibm.com> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> Co-authored-by: Tao He <linzhu.ht@alibaba-inc.com> Co-authored-by: qscqesze <qingjun@minimaxi.com> Co-authored-by: Syed Muhammad Bin Asif <92625830+syedmba@users.noreply.github.com> Co-authored-by: Lionel Villard <villard@us.ibm.com> Co-authored-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Co-authored-by: ycyaw66 <497410282@qq.com> Co-authored-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> Co-authored-by: Ming Yang <minos.future@gmail.com> Co-authored-by: Adrián García García <adrigarvk8@gmail.com> Co-authored-by: Michael Goin <mgoin@redhat.com> Co-authored-by: JaceyShao <65159281+JaceyShao@users.noreply.github.com> Co-authored-by: shaojunqi <shaojunqi.sjq@alibaba-inc.com> Co-authored-by: Ricardo Decal <crypdick@users.noreply.github.com> Co-authored-by: Andrew Chan <andrewkchan.akc@gmail.com> Co-authored-by: fxmarty-amd <felmarty@amd.com> Co-authored-by: Andrew Sansom <andrew@protopia.ai> Co-authored-by: Zhiyu <zhiyuc@nvidia.com> Co-authored-by: Shu Wang <shuw@nvidia.com> Co-authored-by: XIn Li <xinli@nvidia.com> Co-authored-by: Junhao Li <streaver91@gmail.com> Co-authored-by: Chauncey <chaunceyjiang@gmail.com> Co-authored-by: iAmir97 
<71513472+iAmir97@users.noreply.github.com> Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com> Co-authored-by: Hong Hanh <hanh.usth@gmail.com> Co-authored-by: Daniel Serebrenik <74646983+pliops-daniels@users.noreply.github.com> Co-authored-by: yewentao256 <zhyanwentao@126.com> Co-authored-by: Guy Stone <guys@spotify.com> Co-authored-by: yyweiss <70619747+yyweiss@users.noreply.github.com> Co-authored-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com> Co-authored-by: Pradyun Ramadorai <pradyunr@amazon.com> Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
		
			
				
	
	
		
			142 lines
		
	
	
		
			4.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			142 lines
		
	
	
		
			4.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# SPDX-License-Identifier: Apache-2.0
 | 
						|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 | 
						|
import glob
 | 
						|
import itertools
 | 
						|
import os
 | 
						|
import subprocess
 | 
						|
 | 
						|
import jinja2
 | 
						|
 | 
						|
# Text written at the top of every generated kernel_*.cu file. It opens the
# MARLIN_NAMESPACE_NAME namespace; the matching closing brace is appended
# when the file content is assembled.
FILE_HEAD = """
// auto generated by generate.py
// clang-format off

#include "kernel.h"
#include "marlin_template.h"

namespace MARLIN_NAMESPACE_NAME {
""".strip()

# Jinja2 source for one explicit instantiation of the Marlin kernel template.
# Each {{...}} placeholder becomes one template argument, in this order.
TEMPLATE = ("template __global__ void Marlin<"
            "{{scalar_t}}, "
            "{{w_type_id}}, "
            "{{s_type_id}}, "
            "{{threads}}, "
            "{{thread_m_blocks}}, "
            "{{thread_n_blocks}}, "
            "{{thread_k_blocks}}, "
            "{{'true' if m_block_size_8 else 'false'}}, "
            "{{stages}}, "
            "{{group_blocks}}, "
            "{{'true' if is_zp_float else 'false'}}>"
            "( MARLIN_KERNEL_PARAMS );")

# int8 with zero point case (vllm::kU8) is also supported,
# we don't add it to reduce wheel size.
SCALAR_TYPES = [
    "vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
    "vllm::kFE2M1f"
]
# Each tuple is read by index when rendering: [0] // 16 is passed as
# thread_k_blocks, [1] // 16 as thread_n_blocks and [2] as threads.
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128),
                  (128, 64, 128)]

# 0.5 is a marker value: it is rendered as thread_m_blocks = 1 with
# m_block_size_8 = true.
THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
# group_blocks:
#   = 0 : act order case
#   = -1 : channelwise quantization
#   > 0 : group_size=16*group_blocks
GROUP_BLOCKS = [0, 1, -1, 2, 4, 8]
# Activation dtypes; "fp16" maps to the C type "half", "bf16" to
# "nv_bfloat16".
DTYPES = ["fp16", "bf16"]


def remove_old_kernels():
    """Delete every ``kernel_*.cu`` file located next to this script.

    Only files directly inside the script's directory are matched; other
    files and sub-directories are left untouched.

    Raises:
        OSError: if a matched file exists but cannot be removed. A file that
            has already disappeared is ignored, like ``rm -f`` would.
    """
    script_dir = os.path.dirname(__file__)
    # Build the pattern with os.path.join instead of appending "/...": when
    # dirname() is empty the pattern stays relative rather than becoming
    # "/kernel_*.cu", which would match files in the filesystem root.
    # glob.escape keeps special characters in the directory name literal.
    pattern = os.path.join(glob.escape(script_dir), "kernel_*.cu")
    for filename in glob.glob(pattern):
        try:
            # Remove in-process; no need to spawn an external `rm` per file.
            os.remove(filename)
        except FileNotFoundError:
            # Already gone between globbing and removal; nothing to do.
            pass


def _marlin_config_supported(scalar_type, group_blocks, m_blocks,
                             thread_configs):
    """Return True if a kernel should be emitted for this combination."""
    # act order case only support gptq-int4 and gptq-int8
    if group_blocks == 0 and scalar_type not in ("vllm::kU4B8",
                                                 "vllm::kU8B128"):
        return False
    if thread_configs[2] == 256:
        # for small batch (m_blocks <= 1), we only need (128, 128, 256)
        # for large batch (m_blocks > 1), we only need (64, 256, 256)
        if m_blocks <= 1 and thread_configs[0] != 128:
            return False
        if m_blocks > 1 and thread_configs[0] != 64:
            return False
    # we only support channelwise quantization and group_size == 128
    # for fp8
    if scalar_type == "vllm::kFE4M3fn" and group_blocks not in (-1, 8):
        return False
    if scalar_type == "vllm::kFE2M1f":
        # nvfp4 only supports group_size == 16
        # mxfp4 only supports group_size == 32
        return group_blocks in (1, 2)
    # other quantization methods don't support group_size = 16
    return group_blocks != 1


def _marlin_scale_type(scalar_type, group_blocks, dtype):
    """Return the scale type name for a combination, or None to skip it."""
    if scalar_type == "vllm::kFE2M1f" and group_blocks == 1:
        return "vllm::kFE4M3fn"
    if scalar_type == "vllm::kFE2M1f" and group_blocks == 2:
        # we cannot safely dequantize e8m0 to fp16, so skip this
        return None if dtype == "fp16" else "vllm::kFE8M0fnu"
    if dtype == "fp16":
        return "vllm::kFloat16"
    if dtype == "bf16":
        return "vllm::kBFloat16"
    # Fail loudly instead of reusing a scale type left over from an earlier
    # iteration (or hitting an unbound local).
    raise ValueError(f"unsupported dtype: {dtype!r}")


def generate_new_kernels():
    """Write one ``kernel_<dtype>_<scalar type>.cu`` file per type pair.

    For every (scalar type, dtype) pair from SCALAR_TYPES x DTYPES, all
    supported combinations of GROUP_BLOCKS, THREAD_M_BLOCKS and
    THREAD_CONFIGS are rendered through TEMPLATE into explicit template
    instantiations. They are written, preceded by FILE_HEAD and followed by
    a closing brace, to a file in the directory of this script.

    Raises:
        ValueError: if a dtype other than "fp16" or "bf16" reaches the
            default scale type selection.
    """
    # The template text never changes, so compile it once up front instead
    # of once per generated instantiation.
    template = jinja2.Template(TEMPLATE)
    output_dir = os.path.dirname(__file__)
    prefix_len = len("vllm::")

    for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES):
        c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
        all_template_str_list = []

        for group_blocks, m_blocks, thread_configs in itertools.product(
                GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
            if not _marlin_config_supported(scalar_type, group_blocks,
                                            m_blocks, thread_configs):
                continue

            s_type = _marlin_scale_type(scalar_type, group_blocks, dtype)
            if s_type is None:
                continue

            is_zp_float_list = [False]
            if (dtype == "fp16" and scalar_type == "vllm::kU4"
                    and group_blocks == 4):
                # HQQ (is_zp_float = true) only supports
                # 4bit quantization and fp16
                is_zp_float_list.append(True)

            for is_zp_float in is_zp_float_list:
                all_template_str_list.append(
                    template.render(
                        scalar_t=c_dtype,
                        w_type_id=scalar_type + ".id()",
                        s_type_id=s_type + ".id()",
                        threads=thread_configs[2],
                        # 0.5 is rendered as 1 block with m_block_size_8.
                        thread_m_blocks=max(m_blocks, 1),
                        thread_n_blocks=thread_configs[1] // 16,
                        thread_k_blocks=thread_configs[0] // 16,
                        m_block_size_8=m_blocks == 0.5,
                        stages="pipe_stages",
                        group_blocks=group_blocks,
                        is_zp_float=is_zp_float,
                    ))

        file_content = FILE_HEAD + "\n\n"
        # The trailing "}" closes the namespace opened in FILE_HEAD.
        file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
        # Drop the "vllm::" prefix, e.g. "vllm::kU4B8" -> "ku4b8".
        filename = f"kernel_{dtype}_{scalar_type[prefix_len:].lower()}.cu"

        with open(os.path.join(output_dir, filename), "w") as f:
            f.write(file_content)


if __name__ == "__main__":
    # Remove the previously generated kernel files before writing the
    # new set.
    remove_old_kernels()
    generate_new_kernels()