Mirror of https://github.com/vllm-project/vllm.git
[V1] Fix local chunked attention always disabled (#21419)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
@@ -143,6 +143,8 @@ class Attention(nn.Module):
         # the backends)
         if envs.VLLM_USE_V1:
             self.use_irope = extra_impl_args.pop("use_irope", False)
+        else:
+            self.use_irope = extra_impl_args.get("use_irope", False)

         quant_method = quant_config.get_quant_method(
             self, prefix=prefix) if quant_config else None
@@ -177,7 +179,6 @@ class Attention(nn.Module):
             kv_sharing_target_layer_name, **extra_impl_args)
         self.backend = backend_name_to_enum(attn_backend.get_name())
         self.dtype = dtype
-        self.use_irope = extra_impl_args.get("use_irope", False)

         # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
         # torch.compile works by registering the attention as one giant
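The bug this commit addresses follows from the pop/get interaction visible in the diff: under V1, `use_irope` was popped out of `extra_impl_args` early, so the later unconditional `extra_impl_args.get("use_irope", False)` (the line removed above) always returned the default and the flag ended up False. Below is a minimal, self-contained sketch of that plain-dict behavior; the dict contents are made up for illustration and this is not the vLLM code itself.

# Standalone illustration of the pop()/get() interaction behind the bug.
extra_impl_args = {"use_irope": True}

# V1 path: pop() returns the value and removes the key, so it is not
# forwarded to the attention backend implementation.
use_irope = extra_impl_args.pop("use_irope", False)   # True

# A later unconditional read on the same dict no longer finds the key
# and silently falls back to the default, overwriting the flag.
use_irope = extra_impl_args.get("use_irope", False)   # False

print(extra_impl_args)  # {} -- the key is gone after pop()

The fix therefore sets `self.use_irope` once, in the V1/non-V1 branch, and drops the later overwrite.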