Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 14:53:52 +08:00)
Enable 4bit bnb prequant MOE (#21548)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
@@ -427,14 +427,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             elif isinstance(module, FusedMoE) and hasattr(
                     module.quant_method, "quant_config"):
                 # TODO: support FusedMoE with prequant and 8bit.
-                if self.pre_quant:
-                    raise ValueError(
-                        "Prequant BitsAndBytes models with FusedMoE is not "
-                        "supported yet.")
-                if self.load_8bit:
-                    raise ValueError(
-                        "BitsAndBytes 8bit quantization with FusedMoE is not "
-                        "supported yet.")
+                if self.pre_quant and self.load_8bit:
+                    raise ValueError(
+                        "Prequant BitsAndBytes 8bit models with FusedMoE "
+                        "is not supported yet.")
                 # Get the corresponding weight name using module name and
                 # expert_params_mapping.
 
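With this change, the loader rejects only the combination of a prequantized checkpoint and 8-bit loading, so prequantized 4-bit bitsandbytes MoE checkpoints can proceed through the FusedMoE path. A minimal usage sketch (not part of this commit; the checkpoint name is a hypothetical placeholder), using vLLM's offline LLM API:

# Sketch only: serving a prequantized 4-bit bitsandbytes MoE checkpoint.
# "some-org/qwen3-moe-bnb-4bit" is a hypothetical placeholder; any MoE model
# whose config carries a bitsandbytes 4-bit quantization_config would be
# detected as prequantized (self.pre_quant) by the loader code above.
from vllm import LLM, SamplingParams

llm = LLM(model="some-org/qwen3-moe-bnb-4bit", quantization="bitsandbytes")
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)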
@@ -684,4 +684,4 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA,
         return loader.load_weights(weights)
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
-        return self.model.get_expert_mapping()
+        return self.model.get_expert_mapping()
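For context, get_expert_mapping exposes the expert parameter mapping that the bitsandbytes loader consults when resolving weight names (see the expert_params_mapping comment in the hunk above). A small sketch of how a caller might iterate over it; the (param_name, weight_name, expert_id, shard_id) field order is an assumption inferred from FusedMoE's expert parameter mapping, not something this diff states:

# Sketch only: inspecting the mapping returned by get_expert_mapping().
# Each entry is assumed to be (param_name, weight_name, expert_id, shard_id),
# matching the list[tuple[str, str, int, str]] annotation above.
def dump_expert_mapping(model) -> None:
    for param_name, weight_name, expert_id, shard_id in model.get_expert_mapping():
        print(f"expert {expert_id:3d} shard {shard_id}: {weight_name} -> {param_name}")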