diff --git a/vllm/envs.py b/vllm/envs.py
index b1030997f2..93a7c8069c 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -49,6 +49,7 @@ if TYPE_CHECKING:
     VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
     VLLM_XLA_CHECK_RECOMPILATION: bool = False
     VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
+    VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True
     VLLM_USE_RAY_SPMD_WORKER: bool = False
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto"
@@ -535,6 +536,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
     lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))),
     "VLLM_FUSED_MOE_CHUNK_SIZE":
     lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")),
+    # Control whether to use fused MoE activation chunking. Current chunking
+    # logic is incompatible with torch.compile and causes IMA. See issue
+    # https://github.com/vllm-project/vllm/issues/19631.
+    "VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING":
+    lambda: bool(
+        int(os.getenv("VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING", "1"))),
 
     # If set, vllm will skip the deprecation warnings.
     "VLLM_NO_DEPRECATION_WARNING":
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index ed3b6b8a1a..d25d70d3ef 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -225,6 +225,10 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
         else:
             raise ValueError(f"Unsupported FusedMoe activation: {activation}")
 
+    def enable_chunking(self):
+        return envs.VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING and \
+            self.supports_chunking()
+
     @abstractmethod
     def apply(
         self,
@@ -400,7 +404,7 @@ class FusedMoEModularKernel(torch.nn.Module):
         else:
             _, M, N, K, top_k = _moe_problem_size(a1q, w1, w2, topk_ids)
 
-        if self.fused_experts.supports_chunking():
+        if self.fused_experts.enable_chunking():
             CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
             num_chunks = cdiv(M, CHUNK_SIZE)
         else:
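
Note: the standalone sketch below (not part of this diff) mirrors the gating that the new toggle introduces: chunking is used only when the kernel reports supports_chunking() and the environment variable is "1". The helper names and the single-chunk fallback are assumptions for illustration; the else branch of the chunk-count computation is not shown in the hunk above.

    import os
    from math import ceil

    # Sketch of the gating logic added in this diff (illustrative only):
    # chunking applies only when the kernel supports it AND the toggle is on.
    def chunking_enabled(supports_chunking: bool) -> bool:
        toggle = bool(
            int(os.getenv("VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING", "1")))
        return toggle and supports_chunking

    def num_activation_chunks(M: int, chunk_size: int,
                              supports_chunking: bool) -> int:
        # cdiv(M, CHUNK_SIZE) when chunking is enabled; otherwise assume a
        # single chunk (the fallback branch is not shown in the hunk above).
        if chunking_enabled(supports_chunking):
            return ceil(M / chunk_size)
        return 1

As a workaround for the torch.compile/IMA issue referenced in the comment, the chunking path can be disabled at runtime with VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING=0; the default of "1" preserves the previous behavior for kernels that support chunking.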