Mirror of https://github.com/vllm-project/vllm-ascend.git
[BugFix] fix qwenVL quant assertion error (#3466)
### What this PR does / why we need it?
This PR fixes two issues:
1. Fix the problem where multimodal scenes could not do weight prefetching and instead raised an assertion error.
2. Standardize the grid_thw data type of Qwen2VL to torch.int32.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
- ci & e2e
- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: elilzhu <2435754260@qq.com>
Co-authored-by: zhulei (AK) <z00692222@china.huawei.com>
```diff
@@ -314,6 +314,7 @@ class AscendQwen2VisionTransformer(Qwen2VisionTransformer):
         x: torch.Tensor,
         grid_thw: torch.Tensor,
     ) -> torch.Tensor:
+        grid_thw = torch.tensor(grid_thw, dtype=torch.int32)
         # compute cu_seqlens and avoid cumsum to fit operator unpadFA
         cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2],
                                              grid_thw[:,
```
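A minimal, self-contained sketch of what the added cast does (the grid values below are made up for illustration; the in-tree forward builds cu_seqlens without `torch.cumsum` to fit the unpadFA operator, but the per-image sequence lengths are computed the same way):

```python
import torch

# grid_thw holds (t, h, w) patch counts per image; callers may pass a Python
# list or an int64 tensor, so the patched forward normalizes it to int32.
grid_thw = [[1, 16, 16], [1, 8, 8]]  # hypothetical values
grid_thw = torch.tensor(grid_thw, dtype=torch.int32)

# Per-image sequence length is h * w, repeated t times, as in the diff above.
seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0])
print(seqlens)  # tensor([256, 64], dtype=torch.int32)

# Conceptually, cu_seqlens is the running total of these lengths.
cu_seqlens = torch.cumsum(seqlens, dim=0, dtype=torch.int32)
print(cu_seqlens)  # tensor([256, 320], dtype=torch.int32)
```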
```diff
@@ -99,8 +99,11 @@ class AscendW8A8LinearMethod:
     ) -> torch.Tensor:
         if x.dtype != torch.int8:
             layer_cls_name = layer.__class__.__name__
             try:
                 weight_prefetch_method = get_forward_context(
                 ).weight_prefetch_method
             except AssertionError:
                 weight_prefetch_method = None

             # prefetch qkvo_proj.weight preprocess
             if weight_prefetch_method:
```
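The guard above can be summarized with a small stand-alone sketch. This mimics, but does not import, vLLM's `get_forward_context`, which asserts that a forward context has been set; the helper name and stand-in context variable below are hypothetical:

```python
_forward_context = None  # stand-in for vLLM's module-level forward context


def get_forward_context():
    # Mimics the real getter: it asserts a forward context exists, which is
    # what raised AssertionError for multimodal (vision encoder) calls.
    assert _forward_context is not None, "Forward context is not set"
    return _forward_context


def maybe_get_weight_prefetch_method():
    # The fix: treat "no forward context" as "no weight prefetching" instead
    # of letting the AssertionError propagate out of the quant linear method.
    try:
        return get_forward_context().weight_prefetch_method
    except AssertionError:
        return None


weight_prefetch_method = maybe_get_weight_prefetch_method()
if weight_prefetch_method:
    print("prefetch qkvo_proj.weight")
else:
    print("no forward context, skip prefetch")  # printed in this sketch
```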