mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 23:03:52 +08:00
Remove redundant all gather + split (#23441)
Co-authored-by: Chenxi Yang <cxyang@meta.com> Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
This commit is contained in:
@@ -272,23 +272,10 @@ class Glm4vVisionAttention(nn.Module):
|
||||
def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]:
|
||||
# [s, b, 3 * head * head_dim]
|
||||
seq_len, bs, _ = qkv.shape
|
||||
if self.tp_size > 1:
|
||||
qkv = all_gather_interleave(qkv, self.qkv.hidden_size,
|
||||
self.tp_size)
|
||||
|
||||
# [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim]
|
||||
q, k, v = qkv.chunk(3, dim=2)
|
||||
|
||||
# 3 * [s, b, head * head_dim]
|
||||
if self.tp_size > 1:
|
||||
splitter = partial(
|
||||
dist_utils.split_tensor_along_last_dim,
|
||||
num_partitions=self.tp_size,
|
||||
)
|
||||
q = splitter(q)[self.tp_rank]
|
||||
k = splitter(k)[self.tp_rank]
|
||||
v = splitter(v)[self.tp_rank]
|
||||
|
||||
# 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim]
|
||||
new_shape = (
|
||||
seq_len,
|
||||
|
Reference in New Issue
Block a user