[V1] Fix local chunked attention always disabled (#21419)

Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
Author: Yong Hoon Shin
Date: 2025-07-23 15:59:30 -07:00
Committed by: GitHub
Parent: 5c9b807b34
Commit: 78c13e30e1

vllm/attention/layer.py

@@ -143,6 +143,8 @@ class Attention(nn.Module):
         #   the backends)
         if envs.VLLM_USE_V1:
             self.use_irope = extra_impl_args.pop("use_irope", False)
+        else:
+            self.use_irope = extra_impl_args.get("use_irope", False)
         quant_method = quant_config.get_quant_method(
             self, prefix=prefix) if quant_config else None
@@ -177,7 +179,6 @@
                              kv_sharing_target_layer_name, **extra_impl_args)
         self.backend = backend_name_to_enum(attn_backend.get_name())
         self.dtype = dtype
-        self.use_irope = extra_impl_args.get("use_irope", False)

         # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
         # torch.compile works by registering the attention as one giant
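
Why the flag ended up always disabled: on the V1 path the constructor popped "use_irope" out of extra_impl_args (so it would not be forwarded to the backend impl), but a later line re-read the same key with dict.get and silently reset self.use_irope to the False default. The commit removes that second read and adds an explicit else branch for the non-V1 path. The snippet below is a minimal standalone sketch of the pop/get interaction, not the vLLM code itself:

    # Sketch only: illustrates why a pop() followed by a later get()
    # on the same dict key loses the original value.
    extra_impl_args = {"use_irope": True}

    # V1 path: pop() returns True and removes the key so it is not
    # passed through to the backend implementation.
    use_irope = extra_impl_args.pop("use_irope", False)
    assert use_irope is True

    # The line removed by this commit re-read the (now absent) key and
    # fell back to the default, disabling local chunked attention.
    use_irope = extra_impl_args.get("use_irope", False)
    assert use_irope is False  # bug: the flag is lost

With the fix, the value from the first read is kept, and the non-V1 path uses get() without mutating extra_impl_args.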