mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 14:53:52 +08:00
[BugFix] Add an env to disable moe chunking to work around compile incompatibility (#19642)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
This commit is contained in:
committed by
GitHub
parent
e91386cde1
commit
33d51f599e
@ -49,6 +49,7 @@ if TYPE_CHECKING:
|
||||
VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
|
||||
VLLM_XLA_CHECK_RECOMPILATION: bool = False
|
||||
VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
|
||||
VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True
|
||||
VLLM_USE_RAY_SPMD_WORKER: bool = False
|
||||
VLLM_USE_RAY_COMPILED_DAG: bool = False
|
||||
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto"
|
||||
@ -535,6 +536,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))),
|
||||
"VLLM_FUSED_MOE_CHUNK_SIZE":
|
||||
lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")),
|
||||
# Control whether to use fused MoE activation chunking. Current chunking
|
||||
# logic is incompatible with torch.compile and causes illegal memory access (IMA). See issue
|
||||
# https://github.com/vllm-project/vllm/issues/19631.
|
||||
"VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING":
|
||||
lambda: bool(
|
||||
int(os.getenv("VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING", "1"))),
|
||||
|
||||
# If set, vllm will skip the deprecation warnings.
|
||||
"VLLM_NO_DEPRECATION_WARNING":
|
||||
|
@ -225,6 +225,10 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
|
||||
else:
|
||||
raise ValueError(f"Unsupported FusedMoe activation: {activation}")
|
||||
|
||||
def enable_chunking(self):
    """Decide whether fused MoE activation chunking should be used.

    Chunking is used only when the
    ``VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING`` setting read from
    ``envs`` is truthy; in that case the answer is whatever
    ``self.supports_chunking()`` reports. When the setting is falsy,
    ``supports_chunking()`` is not consulted and the setting's own value
    is returned.
    """
    chunking_flag = envs.VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING
    # Mirror short-circuit ``and``: hand back the flag itself when it is
    # falsy, otherwise defer to the implementation's capability check.
    return self.supports_chunking() if chunking_flag else chunking_flag
|
||||
|
||||
@abstractmethod
|
||||
def apply(
|
||||
self,
|
||||
@ -400,7 +404,7 @@ class FusedMoEModularKernel(torch.nn.Module):
|
||||
else:
|
||||
_, M, N, K, top_k = _moe_problem_size(a1q, w1, w2, topk_ids)
|
||||
|
||||
- if self.fused_experts.supports_chunking():
|
||||
+ if self.fused_experts.enable_chunking():
|
||||
CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
|
||||
num_chunks = cdiv(M, CHUNK_SIZE)
|
||||
else:
|
||||
|
Reference in New Issue
Block a user