vllm-ascend/csrc/kernels/bgmv_expand.cpp
yupeng 9f1e054fe3 [Bugfix][LoRA][Operator] Fix LoRA custom operators accuracy issue (#2672)
### What this PR does / why we need it?
Fix the LoRA accuracy issue introduced by the custom AscendC operators
bgmv_shrink, sgmv_shrink, bgmv_expand, and sgmv_expand.

The bug details:
- In a kernel function, if you want to call the `GlobalTensor.GetSize` method, you must pass the second parameter, `bufferSize`, when you first call `GlobalTensor.SetGlobalBuffer`.
- Otherwise, `GlobalTensor.GetSize` will return an undefined value.
- See [this
doc](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1alpha002/apiref/ascendcopapi/atlasascendc_api_07_00024.html); a minimal sketch of the call pattern follows below.
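
A minimal sketch of the fix pattern, using illustrative names (`indices`, `indicesSize`) rather than quoting the patch verbatim:

```cpp
// Sketch only: a GlobalTensor needs its element count for GetSize() to be meaningful.
AscendC::GlobalTensor<int64_t> indicesGm;

// Before the fix: no bufferSize passed, so GetSize() returns an undefined value.
indicesGm.SetGlobalBuffer((__gm__ int64_t *)indices);

// After the fix: pass the element count explicitly.
indicesGm.SetGlobalBuffer((__gm__ int64_t *)indices, indicesSize);
auto size = indicesGm.GetSize();  // now returns indicesSize
```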

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py

- vLLM version: v0.10.1.1
- vLLM main: a344a5aa0a

---------

Signed-off-by: paulyu12 <paulyu0307@gmail.com>
Signed-off-by: paulyu12 <507435917@qq.com>
Co-authored-by: paulyu12 <paulyu0307@gmail.com>
2025-09-02 11:46:59 +08:00


/*
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kernel_operator.h"
#include "types.h"
template <typename scalar_t>
class BGMVExpand {
public:
using X_T = float;
using W_T = scalar_t;
using Y_T = scalar_t;
static constexpr uint64_t LORA_RANK_8 = 8;
static constexpr uint64_t LORA_RANK_16 = 16;
static constexpr uint64_t LORA_RANK_32 = 32;
static constexpr uint64_t LORA_RANK_64 = 64;
static constexpr uint64_t SUPPORTED_RANKS[] = {LORA_RANK_8, LORA_RANK_16, LORA_RANK_32, LORA_RANK_64};
static constexpr int32_t BUFFER_NUM = 2;
// The vector unit reads 8 blocks (32 bytes each and 256 bytes in total) of contiguous data each time.
static constexpr int32_t NUM_BYTES_PER_REPEAT = 256;
static constexpr int32_t NUM_BLOCKS_PER_REPEAT = 8;
// The maximum number of elements in a single iteration is 256 / sizeof(intermediate data type).
static constexpr int32_t NUM_ELEMENTS_PER_REPEAT = NUM_BYTES_PER_REPEAT / sizeof(float);
// Mask is used to control the elements that participate in computation in each iteration.
static constexpr int32_t MASK_COUNT = NUM_BYTES_PER_REPEAT / sizeof(float);
// Refer to numOutputElementsPerInputTile_ initialization for the constraints on the following constants.
static constexpr int32_t W_IN_TILE_NUM_ELEMENTS = 8192;
static constexpr int32_t Y_OUT_TILE_NUM_ELEMENTS = 4096;
static constexpr int32_t BLOCK_REDUCE_NUM_REPEATS = W_IN_TILE_NUM_ELEMENTS / NUM_ELEMENTS_PER_REPEAT;
// BlockReduceSum generates (BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT) floats,
// so we need to read them all and apply PairReduceSum.
static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_16 =
(BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT;
// The second PairReduceSum for rank=32 needs half the repeats of the rank=16 case.
// The same applies to rank=64; ranks greater than 64 are not supported.
static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_32 = (PAIR_REDUCE_NUM_REPEATS_16 + 1) / 2;
public:
__aicore__ inline BGMVExpand(AscendC::TPipe* pipe) : pipe_(pipe) {}
__aicore__ inline void Init(__gm__ void* x, __gm__ void* weight, __gm__ void* indices,
uint32_t indicesSize, __gm__ void* yIn, __gm__ void* yOut,
uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank,
uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim)
{
batchSize_ = batchSize;
numTokensPerCore_ = numTokensPerCore;
maxLoRARank_ = maxLoRARank;
outputHiddenDim_ = outputHiddenDim;
sliceOffset_ = sliceOffset;
outputFullDim_ = outputFullDim;
singleLoRAWeightLen_ = maxLoRARank_ * outputHiddenDim_;
xGm_.SetGlobalBuffer((__gm__ X_T *)x);
wGm_.SetGlobalBuffer((__gm__ W_T *)weight);
yInGm_.SetGlobalBuffer((__gm__ Y_T *)yIn);
yOutGm_.SetGlobalBuffer((__gm__ Y_T *)yOut);
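// The buffer size is passed explicitly; per #2672, GlobalTensor.GetSize() returns an undefined value otherwise.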
indicesGm_.SetGlobalBuffer((__gm__ int64_t *)indices, indicesSize);
pipe_->InitBuffer(inQueueX_, 1, NUM_ELEMENTS_PER_REPEAT * sizeof(X_T));
pipe_->InitBuffer(inQueueW_, BUFFER_NUM, W_IN_TILE_NUM_ELEMENTS * sizeof(W_T));
pipe_->InitBuffer(inQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T));
pipe_->InitBuffer(outQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T));
pipe_->InitBuffer(dupBufferX_, NUM_ELEMENTS_PER_REPEAT * sizeof(float));
pipe_->InitBuffer(tmpBufferW_, W_IN_TILE_NUM_ELEMENTS * sizeof(float));
pipe_->InitBuffer(inBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float));
pipe_->InitBuffer(tmpBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float));
// Each compute iteration generates not just one but several output elements.
// The following variable determines how many output elements are calculated in each iteration.
numOutputElementsPerInputTile_ = BLOCK_REDUCE_NUM_REPEATS * (NUM_ELEMENTS_PER_REPEAT / maxLoRARank_);
numStreamInPerOutputTile_ = Y_OUT_TILE_NUM_ELEMENTS / numOutputElementsPerInputTile_;
}
__aicore__ inline void Process()
{
int64_t blockIdx = AscendC::GetBlockIdx();
int64_t startIdx = blockIdx * numTokensPerCore_;
int64_t endIdx = startIdx + numTokensPerCore_;
if (endIdx > batchSize_) {
endIdx = batchSize_;
}
for (int64_t idx = startIdx; idx < endIdx; idx++) {
yOffset_ = outputFullDim_ * idx + sliceOffset_;
// Set up LoRA index
CopyInIndex(idx);
if (reqLoRAIndex_ < 0) {
continue;
}
reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_;
CopyInX(idx);
int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS;
for (int32_t i = 0; i < numStreamOut; i++) {
CopyInY(i);
for (int32_t j = 0; j < numStreamInPerOutputTile_; j++) {
CopyInW(i * numStreamInPerOutputTile_ + j);
Compute(j * numOutputElementsPerInputTile_);
}
ScaleOutput();
CopyOut(i);
}
ComputeLastIteration();
}
}
private:
__aicore__ inline void CopyInIndex(const int64_t idx)
{
// Look up the LoRA index
reqLoRAIndex_ = indicesGm_.GetValue(idx);
}
__aicore__ inline void ComputeLastIteration()
{
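// Handle the tail tile when outputHiddenDim_ is not a multiple of Y_OUT_TILE_NUM_ELEMENTS.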
int32_t remainingY = outputHiddenDim_ % Y_OUT_TILE_NUM_ELEMENTS;
if (remainingY == 0) {
return;
}
int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS;
int32_t remainingW = remainingY * maxLoRARank_;
int32_t numCompleteWTileInForLastIteration = remainingW / W_IN_TILE_NUM_ELEMENTS;
int32_t remainingWForLastRepeat = remainingW % W_IN_TILE_NUM_ELEMENTS;
CopyInY(numStreamOut, remainingY);
int32_t outputIdx = 0;
for (outputIdx = 0; outputIdx < numCompleteWTileInForLastIteration; outputIdx++) {
CopyInW(numStreamOut * numStreamInPerOutputTile_ + outputIdx);
Compute(outputIdx * numOutputElementsPerInputTile_);
}
if (remainingWForLastRepeat != 0) {
CopyInW(numStreamOut * numStreamInPerOutputTile_ + numCompleteWTileInForLastIteration,
remainingWForLastRepeat);
int32_t lastRepeatCount = remainingWForLastRepeat / NUM_ELEMENTS_PER_REPEAT;
int32_t pairReduceRepeat16 =
(lastRepeatCount * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT;
int32_t pairReduceRepeat32 = (pairReduceRepeat16 + 1) / 2;
int32_t lastComputeOutputElement = outputIdx * numOutputElementsPerInputTile_;
Compute(lastComputeOutputElement, lastRepeatCount, pairReduceRepeat16, pairReduceRepeat32);
}
ScaleOutput(remainingY);
CopyOut(numStreamOut, remainingY);
}
__aicore__ inline void CopyInX(const int64_t idx)
{
AscendC::LocalTensor<X_T> xLocal = inQueueX_.AllocTensor<X_T>();
if constexpr (std::is_same_v<X_T, float>) {
DataCopy(xLocal, xGm_[maxLoRARank_ * idx], maxLoRARank_);
} else {
uint16_t blockLen = static_cast<uint16_t>(maxLoRARank_ * sizeof(X_T));
DataCopyPad(xLocal, xGm_[maxLoRARank_ * idx], {1, blockLen, 0, 0}, {});
}
inQueueX_.EnQue(xLocal);
xLocal = inQueueX_.DeQue<X_T>();
AscendC::LocalTensor<float> xDup = dupBufferX_.Get<float>();
// Since we generate multiple output elements with one API invocation,
// we duplicate the X vector multiple times to fill one repeat (NUM_BYTES_PER_REPEAT bytes).
if constexpr (std::is_same_v<X_T, float>) {
for (int32_t i = 0; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) {
for (int32_t j = 0; j < maxLoRARank_; j++) {
float entry = xLocal.GetValue(j);
xDup.SetValue(i + j, entry);
}
}
} else {
Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_);
pipe_barrier(PIPE_V);
for (int32_t i = maxLoRARank_; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) {
for (int32_t j = 0; j < maxLoRARank_; j++) {
float entry = xDup.GetValue(j);
xDup.SetValue(i + j, entry);
}
}
}
inQueueX_.FreeTensor(xLocal);
}
__aicore__ inline void CopyInY(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
{
AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.AllocTensor<Y_T>();
DataCopy(yInLocal, yInGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], numElements);
inQueueY_.EnQue(yInLocal);
}
__aicore__ inline void CopyInW(int32_t progress, int32_t numElements = W_IN_TILE_NUM_ELEMENTS)
{
AscendC::LocalTensor<W_T> wLocal = inQueueW_.AllocTensor<W_T>();
DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + progress * W_IN_TILE_NUM_ELEMENTS], numElements);
inQueueW_.EnQue(wLocal);
}
__aicore__ inline void ScaleOutput(int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
{
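// Add the accumulated LoRA contribution (held in tmpBufferY_) to the base output tile from yIn,
// then cast the result back to the output dtype.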
AscendC::LocalTensor<float> yLocal = tmpBufferY_.Get<float>();
AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.DeQue<Y_T>();
AscendC::LocalTensor<float> yInLocalFP32 = inBufferY_.Get<float>();
Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements);
pipe_barrier(PIPE_V);
inQueueY_.FreeTensor(yInLocal);
Add(yLocal, yLocal, yInLocalFP32, numElements);
pipe_barrier(PIPE_V);
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements);
pipe_barrier(PIPE_V);
outQueueY_.EnQue<Y_T>(yOutLocal);
}
__aicore__ inline void Compute(int32_t progress,
int32_t blockReduceRepeatCount=BLOCK_REDUCE_NUM_REPEATS,
int32_t pairReduceRepeat16=PAIR_REDUCE_NUM_REPEATS_16,
int32_t pairReduceRepeat32=PAIR_REDUCE_NUM_REPEATS_32)
{
AscendC::LocalTensor<float> yLocal = tmpBufferY_.Get<float>();
AscendC::LocalTensor<float> xDup = dupBufferX_.Get<float>();
AscendC::LocalTensor<W_T> wLocal = inQueueW_.DeQue<W_T>();
AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_);
pipe_barrier(PIPE_V);
inQueueW_.FreeTensor(wLocal);
Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_);
pipe_barrier(PIPE_V);
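// Reduce each group of maxLoRARank_ element-wise products to one output element.
// BlockReduceSum sums 8 consecutive floats (one 32-byte block), so rank 8 finishes in a single pass;
// ranks 16 and 32 need one or two extra PairReduceSum passes, and rank 64 needs a second BlockReduceSum.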
if (maxLoRARank_ == LORA_RANK_8) {
BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
} else if (maxLoRARank_ == LORA_RANK_16) {
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
} else if (maxLoRARank_ == LORA_RANK_32) {
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
} else if (maxLoRARank_ == LORA_RANK_64) {
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
}
}
__aicore__ inline void CopyOut(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
{
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.DeQue<Y_T>();
DataCopy(yOutGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], yOutLocal, numElements);
outQueueY_.FreeTensor(yOutLocal);
}
private:
AscendC::TPipe* pipe_;
AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueueY_, inQueueW_;
AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueX_;
AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> outQueueY_;
AscendC::TBuf<AscendC::QuePosition::VECCALC> tmpBufferW_, dupBufferX_, inBufferY_, tmpBufferY_;
AscendC::GlobalTensor<X_T> xGm_;
AscendC::GlobalTensor<W_T> wGm_;
AscendC::GlobalTensor<Y_T> yInGm_;
AscendC::GlobalTensor<Y_T> yOutGm_;
AscendC::GlobalTensor<int64_t> indicesGm_;
uint32_t batchSize_;
uint32_t numTokensPerCore_;
uint32_t maxLoRARank_;
uint32_t outputHiddenDim_;
uint32_t sliceOffset_;
uint32_t outputFullDim_;
uint32_t singleLoRAWeightLen_;
int64_t reqLoRAIndex_;
uint64_t reqLoRAWeightOffset_;
uint32_t numOutputElementsPerInputTile_;
uint32_t numStreamInPerOutputTile_;
uint64_t yOffset_;
// The block stride is set to 1, so the 8 blocks within a repeat are processed contiguously.
// The repeat stride is 8, so the vector unit reads 8 consecutive blocks in the first repeat
// and the next 8 consecutive blocks in the second repeat.
AscendC::UnaryRepeatParams castParams_ = {1, 1, 8, 4};
// For each repeat of BlockReduceSum and PairReduceSum we move the destination forward by only one block,
// so we set dstRepStride = 1.
AscendC::UnaryRepeatParams reduceSumParams_ = {1, 1, 1, 8};
// When the repeat stride is 0, the vector unit repeatedly reads and computes on the first 8 consecutive blocks.
// Since xDup is reused across repeats, we set src0RepStride = 0.
AscendC::BinaryRepeatParams dotProductParams_ = {1, 1, 1, 8, 0, 8};
};
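// Instantiate one extern "C" kernel entry point per supported dtype.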
#define BGMV_EXPAND_TYPE_DECLARE(TYPE) \
extern "C" __global__ __aicore__ void bgmv_expand_##TYPE(__gm__ void* x, __gm__ void* weight, __gm__ void* indices,\
uint32_t indicesSize, __gm__ void* yIn, __gm__ void* yOut,\
uint32_t batchSize, uint32_t numTokensPerCore, \
uint32_t maxLoRARank, uint32_t outputHiddenDim, \
uint32_t sliceOffset, uint32_t outputFullDim) \
{ \
AscendC::TPipe pipe; \
BGMVExpand<TYPE> op(&pipe); \
op.Init(x, weight, indices, indicesSize, yIn, yOut, batchSize, numTokensPerCore, maxLoRARank, \
outputHiddenDim, sliceOffset, outputFullDim); \
op.Process(); \
}
// Declare kernel entry points for all supported dtypes.
BGMV_EXPAND_TYPE_DECLARE(half)
#if (__CCE_AICORE__ >= 220)
BGMV_EXPAND_TYPE_DECLARE(bfloat16_t)
#endif
namespace vllm_ascend {
extern void bgmv_expand_impl(AscendType type, void* stream, void* x, void* weight, void* indices, uint32_t indicesSize,
void* yIn, void* yOut, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank,
uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim)
{
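// Each core handles numTokensPerCore tokens; round up so the whole batch is covered.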
uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore;
if (type == AscendType::FP16) {
bgmv_expand_half<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, yIn, yOut, batchSize, numTokensPerCore,
maxLoRARank, outputHiddenDim, sliceOffset, outputFullDim);
} else if (type == AscendType::BF16) {
#if (__CCE_AICORE__ >= 220)
bgmv_expand_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, yIn, yOut, batchSize,
numTokensPerCore, maxLoRARank, outputHiddenDim,
sliceOffset, outputFullDim);
#endif
} else {
return;
}
}
} // namespace vllm_ascend