[Bugfix] Fix import of CutlassExpertsFp8 in compressed_tensors_moe.py (#20381)

commit 2e25bb12a8 (parent 9965c47d0d)
Author: bnellnm
Date: 2025-07-02 22:07:43 -04:00 (committed by GitHub)
Signed-off-by: Bill Nell <bnell@redhat.com>

2 changed files with 11 additions and 7 deletions
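
This commit defers the imports of CutlassExpertsFp8, fused_experts, BatchedTritonOrDeepGemmExperts and TritonOrDeepGemmExperts from module import time to the call sites that use them. A minimal sketch of that deferred-import pattern, assuming only the import path shown in the diff below; the method name, body and constructor call here are illustrative, not the actual vLLM code:

    def select_gemm_impl(self, prepare_finalize, moe):
        # Resolve the optional symbol only when this code path runs, so that
        # importing the quantization module itself does not fail on builds
        # where the CUTLASS experts implementation cannot be imported.
        from vllm.model_executor.layers.fused_moe import CutlassExpertsFp8
        return CutlassExpertsFp8(...)  # illustrative call, not the real signature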

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

@@ -14,9 +14,9 @@ import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (
-    CutlassExpertsFp8, FusedMoE, FusedMoEActivationFormat, FusedMoEConfig,
-    FusedMoEMethodBase, FusedMoEPermuteExpertsUnpermute,
-    FusedMoEPrepareAndFinalize, FusedMoeWeightScaleSupported, fused_experts)
+    FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase,
+    FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize,
+    FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import (  # noqa
     WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP)
 from vllm.model_executor.layers.quantization.utils import replace_parameter
@@ -570,6 +570,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             del layer.w2_input_scale
             self.fused_experts_func = None
         else:
+            from vllm.model_executor.layers.fused_moe import fused_experts
             self.fused_experts_func = fused_experts

     def apply(
@@ -826,6 +827,7 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
         prepare_finalize: FusedMoEPrepareAndFinalize,
         moe: FusedMoEConfig,
     ) -> FusedMoEPermuteExpertsUnpermute:
+        from vllm.model_executor.layers.fused_moe import CutlassExpertsFp8
         use_batched_format = (prepare_finalize.activation_format ==
                               FusedMoEActivationFormat.BatchedExperts)

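Design note: after this change, compressed_tensors_moe.py no longer imports CutlassExpertsFp8 or fused_experts at module level; each symbol is imported inside the branch that actually uses it, presumably so the module stays importable on builds where those fused_moe implementations cannot be loaded.
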
vllm/model_executor/layers/quantization/fp8.py

@@ -14,10 +14,9 @@ from vllm import _custom_ops as ops
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (
-    BatchedTritonOrDeepGemmExperts, FusedMoE, FusedMoEActivationFormat,
-    FusedMoEConfig, FusedMoEMethodBase, FusedMoEPermuteExpertsUnpermute,
-    FusedMoEPrepareAndFinalize, FusedMoeWeightScaleSupported,
-    TritonOrDeepGemmExperts)
+    FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase,
+    FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize,
+    FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization import QuantizationMethods
@@ -785,6 +784,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         prepare_finalize: FusedMoEPrepareAndFinalize,
         moe: FusedMoEConfig,
     ) -> FusedMoEPermuteExpertsUnpermute:
+        from vllm.model_executor.layers.fused_moe import (
+            BatchedTritonOrDeepGemmExperts, TritonOrDeepGemmExperts)
+
         assert not self.use_marlin and not self.rocm_aiter_moe_enabled, (
             "Marlin and ROCm AITER are not supported with all2all yet.")