[BugFix] Add an env var to disable MoE chunking to work around a torch.compile incompatibility (#19642)

Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Ye (Charlotte) Qi
2025-06-22 15:17:49 -07:00
committed by GitHub
parent e91386cde1
commit 33d51f599e
2 changed files with 12 additions and 1 deletion
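For reference, a minimal sketch of how the new switch might be used; the serving code and model name below are illustrative and not part of this commit, and setting the variable from the shell (`export VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING=0`) works just as well.

```python
import os

# Turn off fused-MoE activation chunking before vLLM reads its env vars
# (workaround for the torch.compile incompatibility tracked in issue 19631).
os.environ["VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING"] = "0"

from vllm import LLM  # imported after the env var is set

# Example MoE checkpoint; any model that hits the fused-MoE path applies.
llm = LLM(model="mistralai/Mixtral-8x7B-Instruct-v0.1")
print(llm.generate("Hello, world!"))
```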

vllm/envs.py

@@ -49,6 +49,7 @@ if TYPE_CHECKING:
     VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
     VLLM_XLA_CHECK_RECOMPILATION: bool = False
     VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
+    VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True
     VLLM_USE_RAY_SPMD_WORKER: bool = False
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto"
@@ -535,6 +536,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
     lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))),
     "VLLM_FUSED_MOE_CHUNK_SIZE":
     lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")),
+    # Control whether to use fused MoE activation chunking. Current chunking
+    # logic is incompatible with torch.compile and causes IMA. See issue
+    # https://github.com/vllm-project/vllm/issues/19631.
+    "VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING":
+    lambda: bool(
+        int(os.getenv("VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING", "1"))),
     # If set, vllm will skip the deprecation warnings.
     "VLLM_NO_DEPRECATION_WARNING":

vllm/model_executor/layers/fused_moe/modular_kernel.py

@@ -225,6 +225,10 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
         else:
             raise ValueError(f"Unsupported FusedMoe activation: {activation}")
 
+    def enable_chunking(self):
+        return envs.VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING and \
+            self.supports_chunking()
+
     @abstractmethod
     def apply(
         self,
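The new `enable_chunking()` helper simply combines the env flag with the kernel's own capability flag. Here is a toy stand-in (not the real `FusedMoEPermuteExpertsUnpermute`, whose other abstract methods are omitted) that shows the gating:

```python
import os

class ToyExperts:
    """Illustrative stand-in for a FusedMoEPermuteExpertsUnpermute subclass."""

    def supports_chunking(self) -> bool:
        # A concrete kernel reports whether its implementation tolerates chunking.
        return True

    def enable_chunking(self) -> bool:
        # Chunking runs only if the kernel supports it AND the env flag is on.
        flag = bool(
            int(os.getenv("VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING", "1")))
        return flag and self.supports_chunking()

os.environ["VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING"] = "0"
assert ToyExperts().enable_chunking() is False  # env flag wins even if supported
```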
@@ -400,7 +404,7 @@ class FusedMoEModularKernel(torch.nn.Module):
         else:
             _, M, N, K, top_k = _moe_problem_size(a1q, w1, w2, topk_ids)
 
-        if self.fused_experts.supports_chunking():
+        if self.fused_experts.enable_chunking():
             CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
             num_chunks = cdiv(M, CHUNK_SIZE)
         else:
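A minimal sketch of the chunking arithmetic this guarded branch sets up: the token dimension M is split into `cdiv(M, CHUNK_SIZE)` slices, and with chunking disabled the whole batch is processed as a single chunk. `cdiv` is re-implemented locally and `process_chunk` is a hypothetical placeholder for the fused-experts `apply()` call, not the actual vLLM code.

```python
import torch

def cdiv(a: int, b: int) -> int:
    # Ceiling division, equivalent to vllm.utils.cdiv.
    return -(-a // b)

def process_chunk(chunk: torch.Tensor) -> torch.Tensor:
    # Placeholder compute so the sketch runs end to end.
    return chunk * 2

def run_experts(a1q: torch.Tensor, chunk_size: int, chunking: bool) -> torch.Tensor:
    """Split the token dimension (M) into chunks and process each slice."""
    M = a1q.shape[0]
    num_chunks = cdiv(M, chunk_size) if chunking else 1
    outputs = []
    for i in range(num_chunks):
        begin = i * chunk_size if chunking else 0
        end = min(begin + chunk_size, M) if chunking else M
        outputs.append(process_chunk(a1q[begin:end]))
    return torch.cat(outputs, dim=0)

out = run_experts(torch.randn(100_000, 8), chunk_size=32768, chunking=True)
assert out.shape == (100_000, 8)
```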